# Notebook to Clean Data

2019 - 01 - 21 updated 2019-04-30
Notebook to clean final osm cities and towns
1. merge files
2. drop duplicates 


In [1]:
import geopandas as gpd
import pandas as pd
import fiona
import json
from shapely.geometry import Point

In [2]:
import os
os.getcwd()

'/Users/cascade/Github/Pop-ERL/notebooks/jupyter/ERL19'

In [3]:
# File Paths

ERLv2_data = '../../../temp_data/ERL19v2/'
temp_data = '../../../temp_data/ERL19/'

### Merge OSM Files

In [None]:
def load_points (file):
    """ This function loads a csv 
    of points and turns it into shapely points"""
    df = pd.read_csv(file)

    # creating a geometry column 
    geometry = [Point(xy) for xy in zip(df['lon'], df['lat'])]

    # Coordinate reference system : WGS84
    crs = {'init': 'epsg:4326'}

    # Creating a Geographic data frame 
    point_gdf = gpd.GeoDataFrame(df, crs=crs, geometry=geometry)
    
    return point_gdf

In [None]:
# Load Files
town = load_points(v2_data+'20190114_osm_africa_towns.csv')
city = load_points(v2_data+'20190114_osm_africa_cities.csv')

townSS = load_points(v2_data+'20190221_osm_S_Sudan_towns.csv')
citySS = load_points(v2_data+'20190221_osm_S_Sudan_cities.csv')

townDRC = load_points(v2_data+'20190430_osm_DRC_towns.csv')
cityDRC = load_points(v2_data+'20190430_osm_DRC_cities.csv')

In [None]:
# add column with osm type

town['osm_type'] = 'town'
townDRC['osm_type'] = 'town'
townSS['osm_type'] = 'town'

city['osm_type'] = 'city'
cityDRC['osm_type'] = 'city'
citySS['osm_type'] = 'city'

In [None]:
cityDRC.head()

In [None]:
# merge

towns = pd.concat([town, townDRC, townSS, cityDRC, citySS], sort = False) # col name of cityDRC and citySS is town, not city

towns.rename(columns={'town':'osm_name'}, inplace=True)
city.rename(columns={'city':'osm_name'}, inplace=True)

out = pd.concat([towns, city], sort = False)


In [None]:
out[1000:1100]

In [None]:
out.shape

In [None]:
out.to_file(v2_data+"20190430_osm_All.shp", driver = "ESRI Shapefile")

### Split 1500c300 Polygons

In [None]:
# Load

GHS2000 = gpd.read_file(temp_data+'GHS_POP_GPW42000_GLOBE_R2015A_54009_1k_v1_0_Clip_1500c300.shp')


In [None]:
GHS2000.head()

In [None]:
# ADD FIDS if needed

FID = list(range(len(GHS2000)))
GHS2000['DN'] = FID
GHS2000.columns.values[0] = "FID"
GHS2000.head()

In [None]:
len(GHS2000)

In [None]:
GHS2000_A = GHS2000[0:15000]
len(GHS2000_A)

In [None]:
GHS2000_B = GHS2000[15000:30000]
len(GHS2000_B)

In [None]:
GHS2000_C = GHS2000[30000:]
len(GHS2000_C)

In [None]:
len(GHS2000_A)+len(GHS2000_B)+len(GHS2000_C)

In [None]:
GHS2000_A.tail()

In [None]:
GHS2000_C.head()

In [None]:
GHS2000_A.to_file(temp_data+'GHS_POP_GPW42000_GLOBE_R2015A_54009_1k_v1_0_Clip_1500c300_A.shp')
GHS2000_B.to_file(temp_data+'GHS_POP_GPW42000_GLOBE_R2015A_54009_1k_v1_0_Clip_1500c300_B.shp')
GHS2000_C.to_file(temp_data+'GHS_POP_GPW42000_GLOBE_R2015A_54009_1k_v1_0_Clip_1500c300_C.shp')

### Merge PolyPoints 
- Merge polypoints outputs & save
- Find FIDs that overlap boarders

In [None]:
# file paths

poly_A = gpd.read_file(ERLv2_data+'GHS_POP_GPW42000_GLOBE_R2015A_54009_1k_v1_0_Clip_1500c300_A_polypoints.shp')
poly_B = gpd.read_file(ERLv2_data+'GHS_POP_GPW42000_GLOBE_R2015A_54009_1k_v1_0_Clip_1500c300_B_polypoints.shp')
poly_C = gpd.read_file(ERLv2_data+'GHS_POP_GPW42000_GLOBE_R2015A_54009_1k_v1_0_Clip_1500c300_C_polypoints.shp')

out_data = 'GHS2000'

In [None]:
poly = pd.concat([poly_A, poly_B, poly_C], ignore_index = True) #poly_C

In [None]:
len(poly)

In [None]:
# Save out all Polygons
poly.to_file(ERLv2_data+out_data+'_polypoints_ALL.shp')

### Find Country Overlap

In [None]:
# open files
out_data = 'GHS2000'

file = out_data+'_polypoints_ALL.shp'
#file = 'WP2015_1500c300_polypoints.shp'

poly_all = gpd.read_file(ERLv2_data+file)

In [None]:
step1 = poly_all
step1['dup'] = poly_all['FID'].astype(str) + poly_all['country'] # Make new col with FID-Country
step2 = step1.drop_duplicates('dup', keep='first') # drop country-FID dups, keep first though
step3 = step2[step2.duplicated(subset=['FID'], keep=False)] # keep all with duplicated FIDS, drop unique due to no country overlap
step4 = step3.drop_duplicates('FID', keep='first') # drop country-FID dups remaining duplicates
step4

In [None]:
# Drop Western Sahara / Morocco and South Sudan / Sudan
step4 = step4[step4['country'] != 'Sudan']

print(len(step4))

step4 = step4[step4['country'] != 'Morocco']
print(len(step4))

# out = out[out.duplicated(subset=['FID'], keep=False)] # Keep all duplicated FIDS but remove any left overs


In [None]:
step4.to_file(ERLv2_data+out_data+'_polypoints_countryoverlap.shp')


### CRS for Africa Basemap

In [None]:
africa = gpd.read_file(ERLv2_data+'Africa.shp')
africa_fix = gpd.read_file(ERLv2_data+'Africa_coastLineFIxCountryOverlap.shp')

In [None]:
crs = {'init': 'epsg:4326'}

# GHS CRS
GHS_crs = {'ellps': 'WGS84',
 'lon_0': 0,
 'no_defs': True,
 'proj': 'moll',
 'units': 'm',
 'x_0': 0,
 'y_0': 0}

In [None]:
africa.crs

In [None]:
africa.crs = crs

In [None]:
africa_fix.crs

In [None]:
africa_out = africa.to_crs(GHS_crs)
africa_fix_out = africa_fix.to_crs(GHS_crs)

In [None]:
africa_out.to_file(ERLv2_data+'Africa_GHScrs.shp', driver = 'ESRI Shapefile')


In [None]:
africa_fix_out.to_file(ERLv2_data+'Africa_oastLineFIxCountryOverlap_GHScrs.shp', driver = 'ESRI Shapefile')

### Merge in Overlap Polys

In [22]:
GHS2015 = gpd.read_file(ERLv2_data+'GHS2015_polypoints_ALL.shp')
GHS2015_overlap = gpd.read_file(ERLv2_data+'GHS2015_polypoints_countryoverlap_clip.shp')

GHS2000 = gpd.read_file(ERLv2_data+'GHS2000_polypoints_ALL.shp')
GHS2000_overlap = gpd.read_file(ERLv2_data+'GHS2000_polypoints_countryoverlap_clip.shp')

# WP2015 = gpd.read_file(ERLv2_data+'WP2015_polypoints_ALL.shp')
# WP2015_overlap = gpd.read_file(ERLv2_data+'WP2015_polypoints_countryoverlap_clip.shp')

# LS2015 = gpd.read_file(ERLv2_data+'LS2015_polypoints_ALL.shp')
# LS2015_overlap = gpd.read_file(ERLv2_data+'LS2015_polypoints_countryoverlap_clip.shp')

# WPE2016 = gpd.read_file(ERLv2_data+'WPE2016_polypoints_ALL.shp')
# WPE2016_overlap = gpd.read_file(ERLv2_data+'WPE2016_polypoints_countryoverlap_clip.shp')

In [29]:
GHS2015[GHS2015['osm_name'] == 'Boumerdès']

Unnamed: 0,osm_id,FID,country,osm_name,osm_type,lat,lon,geometry
0,252600742,187,Algeria,Boumerdès,town,36.758882,3.470596,"POLYGON ((338405.452556 4443029.461868, 340405..."


In [30]:
GHS2000[GHS2000['osm_name'] == 'Boumerdès']

Unnamed: 0,osm_id,FID,country,osm_name,osm_type,lat,lon,geometry
1,252600742,624,Algeria,Boumerdès,town,36.758882,3.470596,"POLYGON ((258405.4525564685 4431029.461868489,..."


In [7]:
# assign file
dataset = 'WPE2016'
alldata = WPE2016
clipdata = WPE2016_overlap


In [8]:
alldata.head()

Unnamed: 0,osm_id,FID,country,osm_name,osm_type,lat,lon,geometry
0,252600742,201,Algeria,Boumerdès,town,36.758882,3.470596,"POLYGON ((3.876141 36.924416, 3.921057 36.9244..."
1,253167052,827,Algeria,Thenia,town,36.724986,3.556935,"POLYGON ((3.543762 36.744752, 3.570712 36.7447..."
2,253167208,201,Algeria,Zemmouri,town,36.786406,3.601221,"POLYGON ((3.876141 36.924416, 3.921057 36.9244..."
3,253291208,1504,Algeria,Lakhdaria,town,36.563944,3.596907,"POLYGON ((3.498846 36.601021, 3.516813 36.6010..."
4,253292622,268,Algeria,Draâ Ben Khedda,town,36.733332,3.958769,"POLYGON ((3.858174 36.888483, 3.876141 36.8884..."


In [9]:
clipdata.head()

Unnamed: 0,osm_id,FID,country,osm_name,osm_type,lat,lon,dup,ID,CODE,country_2,geometry
0,490570861,6813,Algeria,Marsa Ben M'Hidi,town,35.0818,-2.2044,6813Algeria,1,ALG,Algeria,"POLYGON ((-2.201630978031619 35.091844, -2.187..."
1,490570861,6813,Algeria,Marsa Ben M'Hidi,town,35.0818,-2.2044,6813Algeria,546,MOR,Morocco,"POLYGON ((-2.322265 35.118793, -2.295754166199..."
2,490570863,7121,Algeria,Boukanoun,town,34.97477,-2.09959,7121Algeria,1,ALG,Algeria,"POLYGON ((-2.115651 34.993029, -2.106668 34.99..."
3,490570863,7121,Algeria,Boukanoun,town,34.97477,-2.09959,7121Algeria,546,MOR,Morocco,"(POLYGON ((-2.088702 34.9654635999966, -2.0887..."
4,298699084,7278,Burundi,Bugarama,town,-3.29088,29.5463,7278Burundi,702,TAN,Tanzania,"(POLYGON ((30.529283 -2.451390557857081, 30.52..."


In [10]:
# reassign countries

clipdata['country'] = clipdata['country_2']
clipdata.head()


Unnamed: 0,osm_id,FID,country,osm_name,osm_type,lat,lon,dup,ID,CODE,country_2,geometry
0,490570861,6813,Algeria,Marsa Ben M'Hidi,town,35.0818,-2.2044,6813Algeria,1,ALG,Algeria,"POLYGON ((-2.201630978031619 35.091844, -2.187..."
1,490570861,6813,Morocco,Marsa Ben M'Hidi,town,35.0818,-2.2044,6813Algeria,546,MOR,Morocco,"POLYGON ((-2.322265 35.118793, -2.295754166199..."
2,490570863,7121,Algeria,Boukanoun,town,34.97477,-2.09959,7121Algeria,1,ALG,Algeria,"POLYGON ((-2.115651 34.993029, -2.106668 34.99..."
3,490570863,7121,Morocco,Boukanoun,town,34.97477,-2.09959,7121Algeria,546,MOR,Morocco,"(POLYGON ((-2.088702 34.9654635999966, -2.0887..."
4,298699084,7278,Tanzania,Bugarama,town,-3.29088,29.5463,7278Burundi,702,TAN,Tanzania,"(POLYGON ((30.529283 -2.451390557857081, 30.52..."


In [11]:
# remove columns from clip polys that are un needed
clipdata = clipdata.iloc[:, [0,1,2,3,4,5,6,-1]]

In [12]:
clipdata.head()

Unnamed: 0,osm_id,FID,country,osm_name,osm_type,lat,lon,geometry
0,490570861,6813,Algeria,Marsa Ben M'Hidi,town,35.0818,-2.2044,"POLYGON ((-2.201630978031619 35.091844, -2.187..."
1,490570861,6813,Morocco,Marsa Ben M'Hidi,town,35.0818,-2.2044,"POLYGON ((-2.322265 35.118793, -2.295754166199..."
2,490570863,7121,Algeria,Boukanoun,town,34.97477,-2.09959,"POLYGON ((-2.115651 34.993029, -2.106668 34.99..."
3,490570863,7121,Morocco,Boukanoun,town,34.97477,-2.09959,"(POLYGON ((-2.088702 34.9654635999966, -2.0887..."
4,298699084,7278,Tanzania,Bugarama,town,-3.29088,29.5463,"(POLYGON ((30.529283 -2.451390557857081, 30.52..."


In [13]:
# Add col to show clip poly or not

alldata['clippoly'] = 'no'
clipdata['clippoly'] = 'yes'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [14]:
# drop all duplicate FIDS, keep the first one
print(len(alldata))
alldata_drop = alldata.drop_duplicates('FID', keep = 'first')
print(len(alldata_drop))

6029
4591


In [15]:
# pull list of FIDs from overlap polys
clip_fid = clipdata['FID'].values

In [16]:
# remove FIDS from all list that are in clip poly
print(len(alldata_drop))
alldata_drop = alldata_drop[~alldata_drop['FID'].isin(clip_fid)]
print(len(alldata_drop))

4591
4559


In [17]:
# merge the data frames
print(len(clipdata))
poly_final = pd.concat([alldata_drop,clipdata])
print(len(poly_final))

70
4629


In [18]:
poly_final.to_file(ERLv2_data+dataset+'_polyFINAL.shp', driver = 'ESRI Shapefile')

#### Old Code

In [None]:
# FIDS with GHS2015_ALL to check

testA = poly[poly['FID'] == 27492] # double boarder
testB = poly[poly['FID'] == 31036] #lagos
testC = poly[poly['FID'] == 187] # random city only in Algeria 
testD = poly[poly['FID'] == 28] # large city in S africa
testE = poly[poly['FID'] == 18] # single city

test = pd.concat([testA, testB, testC, testD, testE])
test

In [None]:
# ## Code to find polygons that overlap boarders

# test = poly[poly.duplicated(subset=['FID'], keep=False)] # Keep all dup FIDS
# test['dup'] = test['FID'].astype(str) + test['country'] # Make new col with FID-Country
# test.head()

# out = test.drop_duplicates('dup', keep=False) # Drop all doubles FID-Country
# print(len(out))

# # out = out.drop_duplicates('FID', keep='first') # Drop all left double FIDs
# # print(len(out))

In [None]:
# Fake Data

fid = [1,2,3,2,2,3]
country = ['A','B','C','A','B','A']
pop = [10,11,12,11,11,12]

df = pd.DataFrame()
df['FID'] = fid
df['country'] = country
df['pop'] = pop

df


In [None]:
step1A = poly[poly['FID'] == 31036] #lagos
step1B = poly[poly['FID'] == 187] # random city only in Algeria 

step1 = pd.concat([step1A, step1B])

In [None]:
# Make new col with FID-Country

step1['dup'] = step1['FID'].astype(str) + step1['country'] # Make new col with FID-Country
step1

In [None]:
# drop duplicated country-FID, but keep first

step2 = step1.drop_duplicates('dup', keep='first')
step2

In [None]:
# keep all duplicated FIDS, remove singles

step3 = step2[step2.duplicated(subset=['FID'], keep=False)] 
step3

In [None]:
# Drop remaining duplicated FIDS 

step4 = step3.drop_duplicates('FID', keep='first')
step4