<a href="https://colab.research.google.com/github/andrewwells991/mds_thesis/blob/main/7_pixel_ob_cleaning_merging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Main script for cleaning and joining pixel data (from Earth Engine) and buildings data (from Open Buildings). Change project and site numbers for every run of script
- Pixel data is downloaded from Google Earth Engine and stored in 2_ee_pixel_data in Google Drive
- Open buildings data is downloaded from awells@ptfund.org Colab onto computer and uploaded into Drive folder: 3_raw_open_buildings_data

Once files are cleaned and joined, save to folder: 4_combined_pixel_ob.

This data can then be run through ML scripts.

Notes:
- For proj_2_18, use September (August is too cloudy)
- For proj_2_24, use September (August is too cloudy)

In [None]:
#Mount Google Drive at /content/gdrive; every CSV path used in this notebook lives under this mount point
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
#load cleaning packages
import pandas as pd
import re

#Load geo packages
import geopandas as gpd
from shapely.geometry import Point, Polygon
from geopandas import GeoDataFrame

In [None]:
#Install geopandas
!pip install geopandas

Import pixel and open_buildings data

In [None]:
#Pixel data exported from Google Earth Engine, read from the 2_ee_pixel_data Drive folder
proj_2_11_aug_21_pixel = pd.read_csv('/content/gdrive/MyDrive/mds_thesis/2_ee_pixel_data/proj_2_11_aug_21_pixel.csv')

#Open Buildings data for the same ROI, downloaded onto computer and uploaded into the Drive folder 3_raw_open_buildings_data
proj_2_11_open_buildings = pd.read_csv('/content/gdrive/MyDrive/mds_thesis/3_raw_open_buildings_data/proj_2_11_open_buildings.csv')

Cleaning data

In [None]:
# Give every pixel a 1-based running id in column 'row'; the duplicate removal after the merge keys on this id
proj_2_11_aug_21_pixel = proj_2_11_aug_21_pixel.assign(
    row=lambda frame: range(1, len(frame) + 1)
)

In [None]:
#Viewing number of pixels within ROI
#proj_2_11_aug_21_pixel

In [None]:
# Cleaning open buildings data
# Turns a WKT-like string such as "POLYGON((x1 y1, x2 y2, ...))" into the flat,
# whitespace-separated "x1 y1 x2 y2 ..." form that polygons_from_custom_xy_string expects.
# All replacements are literal (regex=False), so "(" needs no escaping, and a single
# pass removes every opening parenthesis (the old second pass never had anything left to remove).
proj_2_11_open_buildings['geometry'] = (
    proj_2_11_open_buildings['geometry']
    .astype(str)
    # Removing word POLYGON
    .str.replace('POLYGON', '', regex=False)
    # Removing all opening parentheses
    .str.replace('(', '', regex=False)
    # Removing last two characters (the closing parentheses "))")
    .str[:-2]
    # Removing commas
    .str.replace(',', '', regex=False)
)

Points in Polygons

In [None]:
#Build one shapely Point (x = longitude, y = latitude) per pixel row
geometry = [
    Point(lon, lat)
    for lon, lat in zip(proj_2_11_aug_21_pixel['longitude'], proj_2_11_aug_21_pixel['latitude'])
]

#The coordinates now live in the geometry, so the plain longitude/latitude columns are dropped
proj_2_11_aug_21_pixel_geo = proj_2_11_aug_21_pixel.drop(columns=['longitude', 'latitude'])

#Pixel GeoDataFrame, declared as EPSG:4326
proj_2_11_aug_21_geo = GeoDataFrame(
    proj_2_11_aug_21_pixel_geo,
    geometry=geometry,
    crs="EPSG:4326",
)

In [None]:
#Function to convert buildings df to POLYGONS
def polygons_from_custom_xy_string(df_column):
    """Build one shapely Polygon per entry of ``df_column``.

    Each entry must be a string of whitespace-separated numbers. The numbers
    are read as floats and grouped two at a time; each pair becomes one vertex
    of the polygon, in the order given.

    Returns a list of Polygon objects, one per entry, in input order.
    """
    def _pairwise(values, size):
        # Consecutive, non-overlapping slices of length `size`
        return (values[start:start + size] for start in range(0, len(values), size))

    result = []
    for text in df_column:
        numbers = [float(token) for token in text.split()]
        vertices = [[pair[0], pair[1]] for pair in _pairwise(numbers, 2)]
        result.append(Polygon(vertices))
    return result

In [None]:
#Converting buildings df (with POLYGONS) to GeoDataFrame
#The CRS is declared explicitly so it matches the pixel GeoDataFrame (EPSG:4326);
#without it the spatial join warns about a CRS mismatch (left EPSG:4326, right None).
proj_2_11_open_buildings_geo = gpd.GeoDataFrame(
    proj_2_11_open_buildings,
    geometry=polygons_from_custom_xy_string(proj_2_11_open_buildings["geometry"]),
    crs="EPSG:4326",
)

In [None]:
#Joining dataframes based on points (proj_2_11_aug_21_geo) into polygons (proj_2_11_open_buildings_geo)
#how='inner' keeps only pixels that intersect at least one building polygon;
#a pixel that intersects several polygons appears once per polygon.
#`predicate` replaces the deprecated `op` keyword of gpd.sjoin.
proj_2_11_aug_21_building_pixels = gpd.sjoin(
    proj_2_11_aug_21_geo,
    proj_2_11_open_buildings_geo,
    predicate='intersects',
    how='inner',
)

  if (await self.run_code(code, result,  async_=asy)):
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4326
Right CRS: None

  proj_2_11_aug_21_building_pixels = gpd.sjoin(proj_2_11_aug_21_geo, proj_2_11_open_buildings_geo, op='intersects', how='inner')


In [None]:
#Viewing number of pixels that are part of a building
#proj_2_11_aug_21_building_pixels

Joining building and non-building pixels

In [None]:
#Columns of the full pixel table (all pixels in the ROI), for comparison with the joined table below
proj_2_11_aug_21_pixel.columns

Index(['B2', 'B3', 'B4', 'B8', 'B11', 'B12', 'latitude', 'longitude', 'row'], dtype='object')

In [None]:
#Columns of the joined table (pixels that fall inside a building), including the columns brought in from the buildings table
proj_2_11_aug_21_building_pixels.columns

Index(['B2', 'B3', 'B4', 'B8', 'B11', 'B12', 'row', 'geometry', 'index_right',
       'Unnamed: 0', 'latitude', 'longitude', 'area_in_meters', 'confidence',
       'full_plus_code'],
      dtype='object')

In [None]:
#The joined table carries an 'Unnamed: 0' column from the buildings CSV; remove it
proj_2_11_aug_21_building_pixels = proj_2_11_aug_21_building_pixels.drop('Unnamed: 0', axis=1)

In [None]:
#Adding building column with value 1 to proj_2_11_aug_21_building_pixel DataFrame
#The pixel longitude/latitude columns were dropped before the spatial join, so the
#latitude/longitude columns in the joined table come from the Open Buildings table.
#They are overwritten here with each pixel's own coordinates (taken from its point
#geometry) so the columns mean the same thing for building and non-building rows.
proj_2_11_aug_21_building_pixel = proj_2_11_aug_21_building_pixels.assign(
    latitude=proj_2_11_aug_21_building_pixels.geometry.y.values,
    longitude=proj_2_11_aug_21_building_pixels.geometry.x.values,
    building=1,
)

#Adding building column with value 0 and other new columns to proj_2_11_aug_21_pixel DataFrame
#The extra columns are empty placeholders so both tables share the same set of columns when stacked
proj_2_11_aug_21_pixel = proj_2_11_aug_21_pixel.assign(geometry = None, index_right = None, area_in_meters = None,
                                                                 confidence = None, full_plus_code = None, building = 0)


In [None]:
#Stack building pixels on top of the full pixel table and renumber the index.
#Building rows are listed first on purpose: the later drop_duplicates on 'row' keeps the first occurrence.
proj_2_11_aug_21_pixel_buildings_full = pd.concat(
    [
        proj_2_11_aug_21_building_pixel,
        proj_2_11_aug_21_pixel,
    ],
    ignore_index=True,
)



In [None]:
# Removing duplicate rows from the combined data frame
# 'row' is the per-pixel id; keeping the first occurrence keeps the building-labelled copy of a pixel
# (building rows come first in the combined frame). .copy() makes the result an independent frame so
# later column assignments do not raise SettingWithCopyWarning.
proj_2_11_aug_21_pixel_buildings_cleaned = (
    proj_2_11_aug_21_pixel_buildings_full
    .drop_duplicates(subset=['row'], keep='first')
    .copy()
)

In [None]:
#Setting longitude and latitude to numeric values
#.assign builds a new frame instead of writing into the de-duplicated frame in place,
#which avoids the SettingWithCopyWarning; column order is unchanged.
proj_2_11_aug_21_pixel_buildings_cleaned = proj_2_11_aug_21_pixel_buildings_cleaned.assign(
    longitude=pd.to_numeric(proj_2_11_aug_21_pixel_buildings_cleaned['longitude']),
    latitude=pd.to_numeric(proj_2_11_aug_21_pixel_buildings_cleaned['latitude']),
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [None]:
#Short alias for the cleaned, combined table (same object, not a copy); used for the final check and the export
df = proj_2_11_aug_21_pixel_buildings_cleaned

In [None]:
#Viewing number of rows in dataframe (must match number of pixels within ROI)
#Every pixel should appear exactly once after de-duplication, so fail loudly if rows were lost or doubled
assert len(df) == len(proj_2_11_aug_21_pixel), "combined table row count differs from number of pixels in ROI"
df.shape

(19945, 15)

Save to Google Drive for ML processing

In [None]:
#Write the combined table to the 4_combined_pixel_ob folder, using the same 'MyDrive' mount path as the input files.
#utf-8-sig puts a byte-order mark at the start of the file; the DataFrame index is written as the first, unnamed column.
path = '/content/gdrive/MyDrive/mds_thesis/4_combined_pixel_ob/proj_2_11_aug_21_combined_pixel_ob.csv'

df.to_csv(path, encoding='utf-8-sig')