# Notebook to Clean Data

2019-01-21 (updated 2019-04-30)
Notebook to clean the final OSM cities and towns:
1. merge files
2. drop duplicates 


In [1]:
import geopandas as gpd
import pandas as pd
import fiona
import json
from shapely.geometry import Point

In [2]:
import os
# Show the kernel's working directory; the relative data paths used in this
# notebook are resolved against it.
os.getcwd()

'/tana-crunch/cascade/projects/Pop-ERL/notebooks/jupyter/ERL19'

In [3]:
# File Paths

ERLv2_data = '../../../temp_data/ERL19v2/'
temp_data = '../../../temp_data/ERL19/'

# The OSM load/save cells in this notebook build their paths from `v2_data`,
# which was never defined here, so they fail with NameError on a fresh kernel.
# NOTE(review): assumes the OSM csv files live in the ERL19v2 folder - confirm.
v2_data = ERLv2_data

### Merge OSM Files

In [None]:
def load_points(file):
    """Load a csv of points and return it as a GeoDataFrame of shapely Points.

    Parameters
    ----------
    file : str or file-like
        Anything ``pd.read_csv`` accepts. The csv must contain a 'lon' and a
        'lat' column.

    Returns
    -------
    geopandas.GeoDataFrame
        Every column of the csv plus a ``geometry`` column holding
        ``Point(lon, lat)`` for each row, tagged as WGS84 (EPSG:4326).

    Raises
    ------
    KeyError
        If the csv has no 'lon' or no 'lat' column.
    """
    df = pd.read_csv(file)

    # Fail with a message that names the missing column(s) instead of a bare
    # KeyError from the column lookup below.
    missing = [col for col in ('lon', 'lat') if col not in df.columns]
    if missing:
        raise KeyError("csv is missing required column(s): " + ", ".join(missing))

    # creating a geometry column (x = lon, y = lat)
    geometry = [Point(xy) for xy in zip(df['lon'], df['lat'])]

    # Coordinate reference system : WGS84.
    # Given as an authority string; the {'init': 'epsg:4326'} dict form is deprecated.
    crs = 'EPSG:4326'

    # Creating a Geographic data frame
    point_gdf = gpd.GeoDataFrame(df, crs=crs, geometry=geometry)

    return point_gdf

In [None]:
# Load Files: Africa-wide extract, then the South Sudan and DRC extracts,
# each as a (towns, cities) pair of point GeoDataFrames
town, city = (
    load_points(v2_data + csv_name)
    for csv_name in ('20190114_osm_africa_towns.csv',
                     '20190114_osm_africa_cities.csv')
)

townSS, citySS = (
    load_points(v2_data + csv_name)
    for csv_name in ('20190221_osm_S_Sudan_towns.csv',
                     '20190221_osm_S_Sudan_cities.csv')
)

townDRC, cityDRC = (
    load_points(v2_data + csv_name)
    for csv_name in ('20190430_osm_DRC_towns.csv',
                     '20190430_osm_DRC_cities.csv')
)

In [None]:
# Tag every frame with the OSM place type of the extract it was loaded from

for towns_frame in (town, townDRC, townSS):
    towns_frame['osm_type'] = 'town'

for cities_frame in (city, cityDRC, citySS):
    cities_frame['osm_type'] = 'city'

In [None]:
# Preview the DRC city points (e.g. to check the column names before merging)
cityDRC.head()

In [None]:
# merge

# cityDRC and citySS are stacked with the town frames because their name
# column is called 'town', not 'city'
town_named_frames = [town, townDRC, townSS, cityDRC, citySS]
towns = pd.concat(town_named_frames, sort=False)

# Put both name columns under the shared label 'osm_name'
for frame, name_col in ((towns, 'town'), (city, 'city')):
    frame.rename(columns={name_col: 'osm_name'}, inplace=True)

out = pd.concat([towns, city], sort=False)


In [None]:
# Spot-check the merged frame: rows at positions 1000-1099
out[1000:1100]

In [None]:
# (rows, columns) of the merged OSM frame
out.shape

In [None]:
# Write the merged towns + cities frame to a single ESRI Shapefile
out.to_file(v2_data+"20190430_osm_All.shp", driver = "ESRI Shapefile")

### Split 1500c300 Polygons

In [None]:
# Load the 1500c300 polygon shapefile that the cells below split into parts

GHS2000 = gpd.read_file(temp_data+'GHS_POP_GPW42000_GLOBE_R2015A_54009_1k_v1_0_Clip_1500c300.shp')


In [None]:
# Preview the polygons as loaded (before any FID column is added)
GHS2000.head()

In [None]:
# ADD FIDS if needed

# Sequential ids 0..n-1, one per polygon, written into the 'DN' column
FID = list(range(len(GHS2000)))
GHS2000['DN'] = FID

# Relabel the first column as "FID". Assign a fresh list of labels instead of
# writing into `GHS2000.columns.values`: pandas Index objects are immutable,
# and mutating the backing array can leave column lookups stale or fail outright.
# NOTE(review): presumably the first column is 'DN' - verify against the file.
GHS2000.columns = ['FID'] + list(GHS2000.columns[1:])
GHS2000.head()

In [None]:
# Total polygon count; the slices below cut at rows 15000 and 30000
len(GHS2000)

In [None]:
# Part A: rows at positions 0 up to (not including) 15000
GHS2000_A = GHS2000.iloc[:15000]
len(GHS2000_A)

In [None]:
# Part B: rows at positions 15000 up to (not including) 30000
GHS2000_B = GHS2000.iloc[15000:30000]
len(GHS2000_B)

In [None]:
# Part C: everything from position 30000 to the end
GHS2000_C = GHS2000.iloc[30000:]
len(GHS2000_C)

In [None]:
# Sanity check: the three parts together should add up to len(GHS2000)
len(GHS2000_A)+len(GHS2000_B)+len(GHS2000_C)

In [None]:
# Last rows of part A (the slice ends just before position 15000)
GHS2000_A.tail()

In [None]:
# First rows of part C (the slice starts at position 30000)
GHS2000_C.head()

In [None]:
# Write each part to its own shapefile in temp_data, suffixed _A / _B / _C
split_outputs = (
    (GHS2000_A, 'GHS_POP_GPW42000_GLOBE_R2015A_54009_1k_v1_0_Clip_1500c300_A.shp'),
    (GHS2000_B, 'GHS_POP_GPW42000_GLOBE_R2015A_54009_1k_v1_0_Clip_1500c300_B.shp'),
    (GHS2000_C, 'GHS_POP_GPW42000_GLOBE_R2015A_54009_1k_v1_0_Clip_1500c300_C.shp'),
)
for part, shp_name in split_outputs:
    part.to_file(temp_data + shp_name)

### Merge PolyPoints 
- Merge polypoints outputs & save
- Find FIDs that overlap borders

In [15]:
# Read the three polypoints shapefiles (parts A, B, C) from the ERLv2_data folder
poly_A, poly_B, poly_C = (
    gpd.read_file(ERLv2_data + shp_name)
    for shp_name in (
        'GHS_POP_GPW42015_GLOBE_R2015A_54009_1k_v1_0_Clip_1500c300_A_polypoints.shp',
        'GHS_POP_GPW42015_GLOBE_R2015A_54009_1k_v1_0_Clip_1500c300_B_polypoints.shp',
        'GHS_POP_GPW42015_GLOBE_R2015A_54009_1k_v1_0_Clip_1500c300_C_polypoints.shp',
    )
)

# Prefix used for the names of the shapefiles written by the cells below
out_data = 'GHS2015'

In [16]:
# Stack parts A, B and C into one frame; ignore_index gives a fresh 0..n-1 index
poly = pd.concat([poly_A, poly_B, poly_C], ignore_index = True)

In [17]:
# Number of rows in the combined polypoints frame
len(poly)

6684

In [18]:
# Save the combined polypoints (parts A, B and C) as one shapefile in ERLv2_data

poly.to_file(ERLv2_data+out_data+'_polypoints_ALL.shp')

In [19]:
## Code to find polygons that overlap borders

# All rows whose FID occurs more than once. Take an explicit copy so that
# adding the 'dup' column below writes to an independent frame rather than a
# slice of `poly` (the source of the SettingWithCopyWarning).
test = poly[poly.duplicated(subset=['FID'], keep=False)].copy()

# FID-country key, built from the subset's own columns
test['dup'] = test['FID'].astype(str) + test['country']

# Drop every row whose FID-country key repeats, so the rows left are the only
# one for their polygon/country pair.
# NOTE(review): a polygon with two or more points in *each* of its countries
# is removed entirely here and will not be reported - confirm this is intended.
out = test.drop_duplicates('dup', keep=False)
print(len(out))

# Keep a single row for each FID that is left
out = out.drop_duplicates('FID', keep='first')
print(len(out))

49
26


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [20]:
# Display the rows kept by the duplicate filtering in the previous cell
out

Unnamed: 0,osm_id,FID,country,osm_name,osm_type,lat,lon,geometry,dup
325,490570861,6585,Algeria,Marsa Ben M'Hidi,town,35.081799,-2.204397,"POLYGON ((-198594.547444 4234029.461868, -1975...",6585Algeria
326,490570863,6910,Algeria,Boukanoun,town,34.97477,-2.099585,"POLYGON ((-186594.547444 4221029.461868, -1855...",6910Algeria
1381,262107914,10006,Tanzania,Tunduma TOWN,town,-9.310164,32.767525,"POLYGON ((3258405.452556 -1149970.538132, 3258...",10006Tanzania
2727,1623947997,16880,Morocco,Aousserd ?????? ?????,town,22.567,-14.312906,"POLYGON ((-1365594.547444 2763029.461868, -136...",16880Morocco
2839,1150882019,27492,Ethiopia,Dolo,town,4.17861,42.05889,"POLYGON ((4210405.452556 515029.461868, 421040...",27492Ethiopia
2887,44929726,27371,Kenya,Mandera,town,3.938421,41.857324,"POLYGON ((4189405.452556 484029.461868, 418940...",27371Kenya
2919,45006358,22328,Kenya,Busia,town,0.464348,34.111081,"POLYGON ((3425405.452556 50029.461868, 3425405...",22328Kenya
3056,293813047,19162,Rwanda,Gatuna,town,-1.4336,30.015113,"POLYGON ((3006405.452556 -174970.538132, 30074...",19162Rwanda
3057,435390255,16849,Rwanda,Cyangugu,town,-2.488777,28.895813,"POLYGON ((2891405.452556 -316970.538132, 28914...",16849Rwanda
3194,435662822,18136,Uganda,Kisoro,town,-1.282214,29.692666,"POLYGON ((2934405.452556 -223970.538132, 29364...",18136Uganda


In [21]:
# Drop Western Sahara / Morocco and South Sudan / Sudan:
# remove the rows labelled 'Sudan', then those labelled 'Morocco',
# printing the remaining row count after each step
for excluded_country in ('Sudan', 'Morocco'):
    out = out.loc[out['country'].ne(excluded_country)]
    print(len(out))


25
21


In [23]:
# Save the remaining border-overlap rows to a shapefile in ERLv2_data
out.to_file(ERLv2_data+out_data+'_polypoints_countryoverlap.shp')
