<a href="https://colab.research.google.com/github/andrewwells991/mds_thesis/blob/main/7_pixel_ob_cleaning_merging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Main script for cleaning and joining pixel data (from Earth Engine) and buildings data (from Open Buildings). Change project and site numbers for every run of script
- Pixel data is downloaded from Google Earth Engine and stored in 2_ee_pixel_data in Google Drive
- Open buildings data is downloaded from awells@ptfund.org Colab onto computer and uploaded into Drive folder: 3_raw_open_buildings_data

Once files are cleaned and joined, save to folder: 4_combined_pixel_ob.

This data can then be run through ML scripts.

In [955]:
'''
from google.colab import drive
drive.mount('/content/gdrive')
'''

"\nfrom google.colab import drive\ndrive.mount('/content/gdrive')\n"

In [956]:
'''
#Install geopandas
!pip install geopandas
'''

'\n#Install geopandas\n!pip install geopandas\n'

In [957]:
'''
#load cleaning packages
import pandas as pd
import re

#Load geo packages
import geopandas as gpd
from shapely.geometry import Point, Polygon
from geopandas import GeoDataFrame
'''

'\n#load cleaning packages\nimport pandas as pd\nimport re\n\n#Load geo packages\nimport geopandas as gpd\nfrom shapely.geometry import Point, Polygon\nfrom geopandas import GeoDataFrame\n'

Import pixel and open_buildings data

In [958]:
data_ob = 'https://raw.githubusercontent.com/andrewwells991/mds_thesis/main/data/1_open_buildings/proj_7_12_open_buildings.csv'
df_ob = pd.read_csv(data_ob)

data_pix = 'https://raw.githubusercontent.com/andrewwells991/mds_thesis/main/data/2_ee_pixel_data/malawi_proj_7_12_yr_21_pixel.csv'
df_pix = pd.read_csv(data_pix)

Cleaning data

In [959]:
# Adding index to pixel data (to remove duplicates later, when merged)
df_pix = df_pix.assign(row=range(1, len(df_pix)+1))

In [960]:
#Viewing number of pixels within ROI
#df_pix

In [961]:
#df_ob

In [962]:
# Cleaning open buildings data
# Removing word POLYGON
df_ob['geometry'] = df_ob['geometry'].str.replace('POLYGON', '')

# Removing first two parentheses
df_ob['geometry'] = df_ob['geometry'].astype(str).apply(lambda x: re.sub('\(', '', x))
df_ob['geometry'] = df_ob['geometry'].astype(str).apply(lambda x: re.sub('\(', '', x))

# Removing last two parentheses
df_ob['geometry'] = df_ob['geometry'].astype(str).str[:-2]

# Removing commas
df_ob['geometry'] = df_ob['geometry'].astype(str).str.replace(',', '')

Points in Polygons

In [963]:
#Converting pixel df (with POINTS) to GeoDataFrame
geometry = [Point(xy) for xy in zip(df_pix.longitude, df_pix.latitude)]

df_pix_geo = df_pix.drop(['longitude', 'latitude'], axis=1)
df_pix_geo = GeoDataFrame(df_pix_geo, crs="EPSG:4326", geometry=geometry)

In [964]:
#Function to convert buildings df to POLYGONS
def polygons_from_custom_xy_string(df_column):
    def chunks(lst, n):
        for i in range(0, len(lst), n):
            yield lst[i:i + n]
    def xy_list_from_string(s):
        return list(chunks([float(i) for i in s.split()], 2))
    def poly(s):
        ps = xy_list_from_string(s)
        return Polygon([[p[0], p[1]] for p in ps])
    polygons = [poly(r) for r in df_column]
    return polygons

In [965]:
#Certain Open Buildings dataframes need to have special charaters removed (try these if getting an error in next line)
  #df_ob['geometry'] = df_ob['geometry'].str.replace(')', '', regex=True)
  #df_ob['geometry'] = df_ob['geometry'].str.replace('MULTI', '', regex=True)

In [966]:
#Converting buildings df (with POLYGONS) to GeoDataFrame
df_ob_geo = gpd.GeoDataFrame(df_ob, geometry = polygons_from_custom_xy_string(df_ob["geometry"]))

In [967]:
#Joining dataframes based on points (df_ob_geo) into polygons (df_pix_geo)
df_ob_pix = gpd.sjoin(df_pix_geo, df_ob_geo, op='intersects', how='inner')

  if (await self.run_code(code, result,  async_=asy)):
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4326
Right CRS: None

  df_ob_pix = gpd.sjoin(df_pix_geo, df_ob_geo, op='intersects', how='inner')


In [968]:
#Viewing number of pixels that are part of a building
#df_ob_pix

Joining builing and non-building pixels

In [969]:
#Dropping Unnamed column from df_ob_pix
df_ob_pix = df_ob_pix.drop(columns=['Unnamed: 0'])

In [970]:
#Adding building column with value 1 to df_ob_pix DataFrame
df_ob_pix = df_ob_pix.assign(building = 1)

#Adding building column with value 0 and other new columns to df_pix DataFrame
df_pix = df_pix.assign(geometry = None, index_right = None, area_in_meters = None,
                                                                 confidence = None, full_plus_code = None, building = 0)


In [971]:
#Combined dataframes
df_ob_pix_full = pd.concat([df_ob_pix, df_pix], axis=0, ignore_index=True)



In [972]:
# Removing duplicate rows from the combined data frame
df_ob_pix_full_cleaned = df_ob_pix_full.drop_duplicates(subset=['row'])

In [973]:
#Add project variable for reference
df_ob_pix_full_cleaned['project'] = 'proj_7_12'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [974]:
#Setting longitude and latitude to numeric values
df_ob_pix_full_cleaned['longitude'] = pd.to_numeric(df_ob_pix_full_cleaned['longitude'])
df_ob_pix_full_cleaned['latitude'] = pd.to_numeric(df_ob_pix_full_cleaned['latitude'])

In [975]:
#Viewing number of rows in dataframe (should match number of pixels within ROI)
df_ob_pix_full_cleaned.shape

(39870, 16)

In [976]:
df_ob_pix_full_cleaned.columns

Index(['B2', 'B3', 'B4', 'B8', 'B11', 'B12', 'row', 'geometry', 'index_right',
       'latitude', 'longitude', 'area_in_meters', 'confidence',
       'full_plus_code', 'building', 'project'],
      dtype='object')

Save to Google Drive for ML processing

In [977]:
path = '/content/gdrive/My Drive/mds_thesis/4_combined_pixel_ob/proj_7_12_yr_21_combined_pixel_ob.csv'

with open(path, 'w', encoding = 'utf-8-sig') as f:  
  df_ob_pix_full_cleaned.to_csv(f)

In [978]:
df_ob_pix_full_cleaned

Unnamed: 0,B2,B3,B4,B8,B11,B12,row,geometry,index_right,latitude,longitude,area_in_meters,confidence,full_plus_code,building,project
0,1052.0,939.0,867.0,2234.0,2728.0,1926.0,13,POINT (35.25379 -16.91101),2462,-16.911041,35.253793,53.0739,0.8047,5GMQ37Q3+HGM7,1,proj_7_12
1,1314.0,1273.0,1519.0,2215.0,2703.0,2161.0,82,POINT (35.25332 -16.91111),1388,-16.911113,35.253341,23.7991,0.7488,5GMQ37Q3+H84H,1,proj_7_12
2,1150.0,1172.0,1337.0,2278.0,2759.0,1824.0,130,POINT (35.25782 -16.91106),1802,-16.911061,35.257838,95.5751,0.8212,5GMQ37Q5+H4GH,1,proj_7_12
3,1233.0,1148.0,1300.0,1997.0,2521.0,1768.0,144,POINT (35.25914 -16.91105),127,-16.911054,35.259099,83.1112,0.6754,5GMQ37Q5+HJHR,1,proj_7_12
4,1137.0,1122.0,1178.0,2285.0,2660.0,2044.0,199,POINT (35.26430 -16.91099),824,-16.910985,35.264305,45.5645,0.8195,5GMQ37Q7+JP3Q,1,proj_7_12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41490,1098.0,1072.0,858.0,2204.0,1866.0,1023.0,39866,,,-16.928891,35.270986,,,,0,proj_7_12
41491,1120.0,1061.0,916.0,2078.0,2055.0,1194.0,39867,,,-16.928890,35.271080,,,,0,proj_7_12
41492,1126.0,1103.0,896.0,2364.0,2055.0,1194.0,39868,,,-16.928889,35.271173,,,,0,proj_7_12
41493,1072.0,1086.0,792.0,2588.0,1900.0,1006.0,39869,,,-16.928888,35.271267,,,,0,proj_7_12
