# ERL Review Updates

This notebook is to make new figures and tables for ERL Reviews.

By Cascade Tuholske

4-27-19

### Dependencies

In [1]:
from rasterstats import zonal_stats
import rasterio
import geopandas as gpd
import operator
import numpy as np
import pandas as pd
import matplotlib 
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from functools import reduce
import squarify
from scipy import stats as ss

### Functions

In [None]:
def gini(series):
    """Return the Gini coefficient of the values in a pandas Series.

    Implementation follows https://zhiyzuo.github.io/Plot-Lorenz/:
    with the values sorted ascending and ranked i = 1..n,

        G = (2 / n) * sum(i * y_i) / sum(y_i) - (n + 1) / n

    Parameters
    ----------
    series : pandas.Series
        Values to measure (read through ``series.values``); not modified.

    Returns
    -------
    float
        The Gini coefficient, or NaN when ``series`` is empty.
    """
    arr = series.values
    n = arr.size

    # An empty selection has no defined coefficient; return NaN rather than
    # dividing by zero below.
    if n == 0:
        return np.nan

    # np.sort returns a sorted copy, so the caller's data is left untouched.
    # (The previous ``arr.copy(np.array)`` passed a function as the ``order``
    # argument, which current NumPy rejects.)
    sorted_arr = np.sort(arr)

    coef_ = 2. / n
    const_ = (n + 1.) / n
    # Rank-weighted sum: the smallest value gets rank 1, the largest rank n.
    weighted_sum = np.dot(np.arange(1, n + 1), sorted_arr)
    return coef_ * weighted_sum / sorted_arr.sum() - const_

### Data

In [2]:
# File paths

# Folders given relative to the notebook's working directory.
data_raw = '../../data/raw/'
data_temp = '../../temp_data/'
data_interim = '../../interim/'

# Absolute folders on the author's machine, built from the shared ERL19 root.
erl19_root = '/Users/cascade/Github/Pop-ERL/temp_data/ERL19/'
erl_data = erl19_root + 'ERL_data/'
data_analysis = erl_data + 'Data20190222/'
erl_vs_data = erl19_root + 'ERLv2/'
downloads = '/Users/cascade/Downloads/'

### Clean data
This code cleans data to fix national boundary issues among polygons 2019-04-29

In [4]:
# Author's note: the urban-merge files still have duplicates.

# --- GHS-Pop (noted by the author as good) ---
ghs2000_path = data_analysis + 'GHS_POP_GPW42000_urbanmerge_PopTot.shp'
ghs2015_path = data_analysis + 'GHS_POP_GPW42015_urbanmerge_PopTot.shp'
GHS2000m = gpd.read_file(ghs2000_path, driver = 'ESRI Shapefile')
GHS2015m = gpd.read_file(ghs2015_path, driver = 'ESRI Shapefile')

# --- WorldPop 2015: main file plus the separate South Sudan file ---
wp2015_path = data_analysis + 'AFR_PPP_2015_adj_v2_final20190122.shp'
wp2015_sudan_path = data_analysis + 'AFR_PPP_2015_adj_v2_S_Sudan_1500c300_polyoverlapPopTot.shp'
WP2015_dup = gpd.read_file(wp2015_path)
WP2015_Sudan = gpd.read_file(wp2015_sudan_path)

# --- WPE 2016: main file plus the separate South Sudan file ---
wpe2016_path = data_analysis + 'WPE_1KM_2016_final20190122.shp'
wpe2016_sudan_path = data_analysis + 'WPE_1KM_2016_Pop_Clip_S_Sudan_1500c300_polyoverlapPopTot.shp'
WPE2016_dup = gpd.read_file(wpe2016_path)
WPE2016_Sudan = gpd.read_file(wpe2016_sudan_path)

# --- LandScan 2015: main file plus the separate South Sudan file ---
ls2015_path = data_analysis + 'LS15_final20190122.shp'
ls2015_sudan_path = data_analysis + 'LS15_w001001_Clip_S_Sudan_1500c300_polyoverlapPopTot.shp'
LS2015_dup = gpd.read_file(ls2015_path)
LS2015_Sudan = gpd.read_file(ls2015_sudan_path)

In [12]:
# Merge in South Sudan: append the South Sudan rows to each dataset.
# pd.concat is called without ignore_index, so each input keeps its own
# index labels in the merged frame.

frames1 = [WP2015_dup, WP2015_Sudan]
frames2 = [WPE2016_dup, WPE2016_Sudan]
frames3 = [LS2015_dup, LS2015_Sudan]

# Row count before and after the WP 2015 merge.
print(len(WP2015_dup))
WP2015_merge = pd.concat(frames1)
# Fixed: this line used to print len(WP2015_dup) a second time, so the
# "after" count never reflected the merge.
print(len(WP2015_merge))

WPE2016_merge = pd.concat(frames2)
LS2015_merge = pd.concat(frames3)

3854
3871


In [17]:
# Row counts before and after the South Sudan merge, one dataset at a time
# (WP 2015, WPE 2016, LS 2015).
merge_pairs = (
    (WP2015_dup, WP2015_merge),
    (WPE2016_dup, WPE2016_merge),
    (LS2015_dup, LS2015_merge),
)
for before_merge, after_merge in merge_pairs:
    print(len(before_merge))
    print(len(after_merge))

3854
3871
5598
5705
5552
5659


In [22]:
# Save Files ... these files have FID duplicates and those with <5000 people starting point
# for ERL v2 ALL figures and data ... 20190429_all.csv
#
# Every output is written to erl_vs_data. The WP2015 CSV previously went to
# erl_data, unlike its shapefile and every other dataset.
all_outputs = [
    ('GHS2015', GHS2015m),
    ('GHS2000', GHS2000m),
    ('WP2015', WP2015_merge),
    ('WPE2016', WPE2016_merge),
    ('LS2015', LS2015_merge),
]

# CSVs first, then shapefiles, in the same order as before.
for out_name, out_frame in all_outputs:
    out_frame.to_csv(erl_vs_data + out_name + '_20190429_all.csv')

for out_name, out_frame in all_outputs:
    out_frame.to_file(erl_vs_data + out_name + '_20190429_all.shp', driver = 'ESRI Shapefile')

In [None]:
# Load data: ...20190222.shp files have FIDs removed, S Sudan added, rainfall zones, and regions, <5k removed
#
# These reads were commented out, but the cells below use GHS2000, GHS2015,
# WP2015, LS2015 and WPE2016, and nothing else visible in this notebook defines
# them, so a fresh kernel failed with NameError.
# NOTE(review): confirm the 20190222 files are still the intended inputs.

GHS2000 = gpd.read_file(data_analysis+'GHS_POP_GPW42000_20190222.shp', driver = 'ESRI Shapefile')
GHS2015 = gpd.read_file(data_analysis+'GHS_POP_GPW42015_20190222.shp', driver = 'ESRI Shapefile')
WP2000 = gpd.read_file(data_analysis+'AFR_PPP_2000_adj_v2_20190222.shp', driver = 'ESRI Shapefile')
WP2015 = gpd.read_file(data_analysis+'AFR_PPP_2015_adj_v2_20190222.shp', driver = 'ESRI Shapefile')
# Fixed: the commented-out version had these two filenames swapped
# (LS2015 read the WPE file and WPE2016 read the LS15 file).
LS2015 = gpd.read_file(data_analysis+'LS15_20190222.shp', driver = 'ESRI Shapefile')
WPE2016 = gpd.read_file(data_analysis+'WPE_1KM_2016_20190222.shp', driver = 'ESRI Shapefile')

# Load data to fix polygon overlap problem



In [None]:
# Row count of each loaded dataset, one per line.
for loaded_frame in (GHS2000, GHS2015, WP2015, LS2015, WPE2016):
    print(len(loaded_frame))

### Chunks

#### Africa

In [None]:
# WP 2015 Chunks
# Sum PopTot over the rows in each PopTot size class. Class bounds are
# exclusive below and inclusive above; totals print in ascending class order.
pop_tot = WP2015['PopTot']
size_edges = [5*10**4, 10**5, 3*10**5, 5*10**5, 10**6, 5*10**6]

size_masks = [pop_tot <= size_edges[0]]
size_masks += [(pop_tot > lo) & (pop_tot <= hi) for lo, hi in zip(size_edges, size_edges[1:])]
size_masks.append(pop_tot > size_edges[-1])

WP2015_chunks = []
for size_mask in size_masks:
    WP2015_chunks.append(pop_tot[size_mask].sum())
    print(WP2015_chunks[-1])

# The per-class totals keep their original names.
(WP2015_50k, WP2015_100k, WP2015_300k, WP2015_500k,
 WP2015_1m, WP2015_5m, WP2015_5mplus) = WP2015_chunks

In [None]:
# LS 2015 Chunks
# Sum PopTot over the rows in each PopTot size class. Class bounds are
# exclusive below and inclusive above; totals print in ascending class order.
pop_tot = LS2015['PopTot']
size_edges = [5*10**4, 10**5, 3*10**5, 5*10**5, 10**6, 5*10**6]

size_masks = [pop_tot <= size_edges[0]]
size_masks += [(pop_tot > lo) & (pop_tot <= hi) for lo, hi in zip(size_edges, size_edges[1:])]
size_masks.append(pop_tot > size_edges[-1])

LS2015_chunks = []
for size_mask in size_masks:
    LS2015_chunks.append(pop_tot[size_mask].sum())
    print(LS2015_chunks[-1])

# The per-class totals keep their original names.
(LS2015_50k, LS2015_100k, LS2015_300k, LS2015_500k,
 LS2015_1m, LS2015_5m, LS2015_5mplus) = LS2015_chunks

In [None]:
# WPE 2016 Chunks
# Sum PopTot over the rows in each PopTot size class. Class bounds are
# exclusive below and inclusive above; totals print in ascending class order.
pop_tot = WPE2016['PopTot']
size_edges = [5*10**4, 10**5, 3*10**5, 5*10**5, 10**6, 5*10**6]

size_masks = [pop_tot <= size_edges[0]]
size_masks += [(pop_tot > lo) & (pop_tot <= hi) for lo, hi in zip(size_edges, size_edges[1:])]
size_masks.append(pop_tot > size_edges[-1])

WPE2016_chunks = []
for size_mask in size_masks:
    WPE2016_chunks.append(pop_tot[size_mask].sum())
    print(WPE2016_chunks[-1])

# The per-class totals keep their original names.
(WPE2016_50k, WPE2016_100k, WPE2016_300k, WPE2016_500k,
 WPE2016_1m, WPE2016_5m, WPE2016_5mplus) = WPE2016_chunks

In [None]:
# GHS 2015 Chunks
# Sum PopTot over the rows in each PopTot size class. Class bounds are
# exclusive below and inclusive above; totals print in ascending class order.
pop_tot = GHS2015['PopTot']
size_edges = [5*10**4, 10**5, 3*10**5, 5*10**5, 10**6, 5*10**6]

size_masks = [pop_tot <= size_edges[0]]
size_masks += [(pop_tot > lo) & (pop_tot <= hi) for lo, hi in zip(size_edges, size_edges[1:])]
size_masks.append(pop_tot > size_edges[-1])

GHS2015_chunks = []
for size_mask in size_masks:
    GHS2015_chunks.append(pop_tot[size_mask].sum())
    print(GHS2015_chunks[-1])

# The per-class totals keep their original names.
(GHS2015_50k, GHS2015_100k, GHS2015_300k, GHS2015_500k,
 GHS2015_1m, GHS2015_5m, GHS2015_5mplus) = GHS2015_chunks

In [None]:
# GHS 2000 Chunks
# Sum PopTot over the rows in each PopTot size class. Class bounds are
# exclusive below and inclusive above; totals print in ascending class order.
pop_tot = GHS2000['PopTot']
size_edges = [5*10**4, 10**5, 3*10**5, 5*10**5, 10**6, 5*10**6]

size_masks = [pop_tot <= size_edges[0]]
size_masks += [(pop_tot > lo) & (pop_tot <= hi) for lo, hi in zip(size_edges, size_edges[1:])]
size_masks.append(pop_tot > size_edges[-1])

GHS2000_chunks = []
for size_mask in size_masks:
    GHS2000_chunks.append(pop_tot[size_mask].sum())
    print(GHS2000_chunks[-1])

# The per-class totals keep their original names.
(GHS2000_50k, GHS2000_100k, GHS2000_300k, GHS2000_500k,
 GHS2000_1m, GHS2000_5m, GHS2000_5mplus) = GHS2000_chunks

#### Rain Zones

In [None]:
# Chunk by Rain Zone
# Split each GHS frame into one subset per value of its 'rain_zone' column.
zone_2000 = GHS2000['rain_zone']
zone_2015 = GHS2015['rain_zone']

# GHS 2000
GHS2000_arid = GHS2000.loc[zone_2000 == 'Arid']
GHS2000_semi = GHS2000.loc[zone_2000 == 'Semi-arid']
GHS2000_sub = GHS2000.loc[zone_2000 == 'Sub-humid']
GHS2000_humid = GHS2000.loc[zone_2000 == 'Humid']

# GHS 2015
GHS2015_arid = GHS2015.loc[zone_2015 == 'Arid']
GHS2015_semi = GHS2015.loc[zone_2015 == 'Semi-arid']
GHS2015_sub = GHS2015.loc[zone_2015 == 'Sub-humid']
GHS2015_humid = GHS2015.loc[zone_2015 == 'Humid']


#### GHS 2000

In [None]:
# GHS 2000 Arid Chunks
# Sum PopTot over the rows in each PopTot size class. Class bounds are
# exclusive below and inclusive above; totals print in ascending class order.
pop_tot = GHS2000_arid['PopTot']
size_edges = [5*10**4, 10**5, 3*10**5, 5*10**5, 10**6, 5*10**6]

size_masks = [pop_tot <= size_edges[0]]
size_masks += [(pop_tot > lo) & (pop_tot <= hi) for lo, hi in zip(size_edges, size_edges[1:])]
size_masks.append(pop_tot > size_edges[-1])

GHS2000_arid_chunks = []
for size_mask in size_masks:
    GHS2000_arid_chunks.append(pop_tot[size_mask].sum())
    print(GHS2000_arid_chunks[-1])

# The per-class totals keep their original names.
(GHS2000_arid_50k, GHS2000_arid_100k, GHS2000_arid_300k, GHS2000_arid_500k,
 GHS2000_arid_1m, GHS2000_arid_5m, GHS2000_arid_5mplus) = GHS2000_arid_chunks


In [None]:
# GHS Semi Arid Chunks (2000)
# Sum PopTot over the rows in each PopTot size class. Class bounds are
# exclusive below and inclusive above; totals print in ascending class order.
pop_tot = GHS2000_semi['PopTot']
size_edges = [5*10**4, 10**5, 3*10**5, 5*10**5, 10**6, 5*10**6]

size_masks = [pop_tot <= size_edges[0]]
size_masks += [(pop_tot > lo) & (pop_tot <= hi) for lo, hi in zip(size_edges, size_edges[1:])]
size_masks.append(pop_tot > size_edges[-1])

GHS2000_semi_chunks = []
for size_mask in size_masks:
    GHS2000_semi_chunks.append(pop_tot[size_mask].sum())
    print(GHS2000_semi_chunks[-1])

# The per-class totals keep their original names.
(GHS2000_semi_50k, GHS2000_semi_100k, GHS2000_semi_300k, GHS2000_semi_500k,
 GHS2000_semi_1m, GHS2000_semi_5m, GHS2000_semi_5mplus) = GHS2000_semi_chunks

In [None]:
# GHS Sub Humid Chunks (2000)
# Sum PopTot over the rows in each PopTot size class. Class bounds are
# exclusive below and inclusive above; totals print in ascending class order.
pop_tot = GHS2000_sub['PopTot']
size_edges = [5*10**4, 10**5, 3*10**5, 5*10**5, 10**6, 5*10**6]

size_masks = [pop_tot <= size_edges[0]]
size_masks += [(pop_tot > lo) & (pop_tot <= hi) for lo, hi in zip(size_edges, size_edges[1:])]
size_masks.append(pop_tot > size_edges[-1])

GHS2000_sub_chunks = []
for size_mask in size_masks:
    GHS2000_sub_chunks.append(pop_tot[size_mask].sum())
    print(GHS2000_sub_chunks[-1])

# The per-class totals keep their original names.
(GHS2000_sub_50k, GHS2000_sub_100k, GHS2000_sub_300k, GHS2000_sub_500k,
 GHS2000_sub_1m, GHS2000_sub_5m, GHS2000_sub_5mplus) = GHS2000_sub_chunks

In [None]:
# GHS Humid Chunks (2000)
# Sum PopTot over the rows in each PopTot size class. Class bounds are
# exclusive below and inclusive above; totals print in ascending class order.
pop_tot = GHS2000_humid['PopTot']
size_edges = [5*10**4, 10**5, 3*10**5, 5*10**5, 10**6, 5*10**6]

size_masks = [pop_tot <= size_edges[0]]
size_masks += [(pop_tot > lo) & (pop_tot <= hi) for lo, hi in zip(size_edges, size_edges[1:])]
size_masks.append(pop_tot > size_edges[-1])

GHS2000_humid_chunks = []
for size_mask in size_masks:
    GHS2000_humid_chunks.append(pop_tot[size_mask].sum())
    print(GHS2000_humid_chunks[-1])

# The per-class totals keep their original names.
(GHS2000_humid_50k, GHS2000_humid_100k, GHS2000_humid_300k, GHS2000_humid_500k,
 GHS2000_humid_1m, GHS2000_humid_5m, GHS2000_humid_5mplus) = GHS2000_humid_chunks

#### GHS 2015

In [None]:
# GHS 2015 Arid Chunks
# Sum PopTot over the rows in each PopTot size class. Class bounds are
# exclusive below and inclusive above; totals print in ascending class order.
pop_tot = GHS2015_arid['PopTot']
size_edges = [5*10**4, 10**5, 3*10**5, 5*10**5, 10**6, 5*10**6]

size_masks = [pop_tot <= size_edges[0]]
size_masks += [(pop_tot > lo) & (pop_tot <= hi) for lo, hi in zip(size_edges, size_edges[1:])]
size_masks.append(pop_tot > size_edges[-1])

GHS2015_arid_chunks = []
for size_mask in size_masks:
    GHS2015_arid_chunks.append(pop_tot[size_mask].sum())
    print(GHS2015_arid_chunks[-1])

# The per-class totals keep their original names.
(GHS2015_arid_50k, GHS2015_arid_100k, GHS2015_arid_300k, GHS2015_arid_500k,
 GHS2015_arid_1m, GHS2015_arid_5m, GHS2015_arid_5mplus) = GHS2015_arid_chunks


In [None]:
# GHS Semi Arid Chunks (2015)
# Sum PopTot over the rows in each PopTot size class. Class bounds are
# exclusive below and inclusive above; totals print in ascending class order.
pop_tot = GHS2015_semi['PopTot']
size_edges = [5*10**4, 10**5, 3*10**5, 5*10**5, 10**6, 5*10**6]

size_masks = [pop_tot <= size_edges[0]]
size_masks += [(pop_tot > lo) & (pop_tot <= hi) for lo, hi in zip(size_edges, size_edges[1:])]
size_masks.append(pop_tot > size_edges[-1])

GHS2015_semi_chunks = []
for size_mask in size_masks:
    GHS2015_semi_chunks.append(pop_tot[size_mask].sum())
    print(GHS2015_semi_chunks[-1])

# The per-class totals keep their original names.
(GHS2015_semi_50k, GHS2015_semi_100k, GHS2015_semi_300k, GHS2015_semi_500k,
 GHS2015_semi_1m, GHS2015_semi_5m, GHS2015_semi_5mplus) = GHS2015_semi_chunks

In [None]:
# GHS Sub Humid Chunks (2015)
# Sum PopTot over the rows in each PopTot size class. Class bounds are
# exclusive below and inclusive above; totals print in ascending class order.
pop_tot = GHS2015_sub['PopTot']
size_edges = [5*10**4, 10**5, 3*10**5, 5*10**5, 10**6, 5*10**6]

size_masks = [pop_tot <= size_edges[0]]
size_masks += [(pop_tot > lo) & (pop_tot <= hi) for lo, hi in zip(size_edges, size_edges[1:])]
size_masks.append(pop_tot > size_edges[-1])

GHS2015_sub_chunks = []
for size_mask in size_masks:
    GHS2015_sub_chunks.append(pop_tot[size_mask].sum())
    print(GHS2015_sub_chunks[-1])

# The per-class totals keep their original names.
(GHS2015_sub_50k, GHS2015_sub_100k, GHS2015_sub_300k, GHS2015_sub_500k,
 GHS2015_sub_1m, GHS2015_sub_5m, GHS2015_sub_5mplus) = GHS2015_sub_chunks

In [None]:
# GHS Humid Chunks (2015)
# Sum PopTot over the rows in each PopTot size class. Class bounds are
# exclusive below and inclusive above; totals print in ascending class order.
pop_tot = GHS2015_humid['PopTot']
size_edges = [5*10**4, 10**5, 3*10**5, 5*10**5, 10**6, 5*10**6]

size_masks = [pop_tot <= size_edges[0]]
size_masks += [(pop_tot > lo) & (pop_tot <= hi) for lo, hi in zip(size_edges, size_edges[1:])]
size_masks.append(pop_tot > size_edges[-1])

GHS2015_humid_chunks = []
for size_mask in size_masks:
    GHS2015_humid_chunks.append(pop_tot[size_mask].sum())
    print(GHS2015_humid_chunks[-1])

# The per-class totals keep their original names.
(GHS2015_humid_50k, GHS2015_humid_100k, GHS2015_humid_300k, GHS2015_humid_500k,
 GHS2015_humid_1m, GHS2015_humid_5m, GHS2015_humid_5mplus) = GHS2015_humid_chunks

### Bar Plots

In [None]:
# Bar plot by Dataset for 2015
#
# Changes from the earlier version of this cell:
#   * the stray `plt.figure()` call is gone (it only produced an extra, empty
#     figure next to the real one);
#   * the four bars of each group are centred on their tick (offsets
#     -0.3 .. +0.3) instead of sitting 0.1 to its left.

# Tick labels: one per size class, in the same order as the *_chunks lists
ticks_bar = ['<50K', '50-100K', '100-300K','300-500K', '500K-1M', '1-5M' , '>5M']

# Make plot
sns.set(font_scale=3)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(25, 15))
ax.set_facecolor('White')

# One entry per dataset: legend label, class totals, x offset, colour
tick_pos = np.arange(len(ticks_bar))
bar_series = [
    ('WorldPop 2015', WP2015_chunks, -0.3, 'Blue'),
    ('LandScan 2015', LS2015_chunks, -0.1, 'Green'),
    ('GHS-Pop 2015', GHS2015_chunks, 0.1, 'Orange'),
    ('WPE 2016', WPE2016_chunks, 0.3, 'Purple'),
]

# Bars (totals converted to millions)
for bar_label, bar_chunks, bar_offset, bar_color in bar_series:
    ax.bar(tick_pos + bar_offset, [x / 10**6 for x in bar_chunks], width=0.2,
           align='center', alpha=0.7, color=bar_color, label=bar_label)

# Legend
ax.legend(loc=2, facecolor='white', edgecolor='white')

# Ticks
ax.set_xticks(tick_pos)
ax.set_xticklabels(ticks_bar, size=28)
#ax.set_yscale('log')

# Labels
ax.set_xlabel('Settlement Size', size=32)
ax.set_ylabel('Population (millions)', size=32)
ax.set_title('Total Urban Population by Settlement Size for Africa')

# Save
# fig.savefig('/Users/cascade/Desktop/'+'PopAll_bar.png', dpi=700, transparent=False)


In [None]:
# Bar plot for GHS 2000 & 2015
#
# Changes from the earlier version of this cell:
#   * the stray `plt.figure()` call is gone (it only produced an extra, empty
#     figure next to the real one);
#   * the two bars of each group now straddle their tick (offsets -0.1 / +0.1,
#     as in the rain-zone bar plot) instead of both sitting left of it.

# Tick labels: one per size class, in the same order as the *_chunks lists
ticks_bar = ['<50K', '50-100K', '100-300K','300-500K', '500K-1M', '1-5M' , '>5M']

# Make plot
sns.set(font_scale=3)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(25, 15))
ax.set_facecolor('White')

# One entry per year: legend label, class totals, x offset, colour
tick_pos = np.arange(len(ticks_bar))
bar_series = [
    ('GHS-Pop 2000', GHS2000_chunks, -0.1, 'Blue'),
    ('GHS-Pop 2015', GHS2015_chunks, 0.1, 'Green'),
]

# Bars (totals converted to millions)
for bar_label, bar_chunks, bar_offset, bar_color in bar_series:
    ax.bar(tick_pos + bar_offset, [x / 10**6 for x in bar_chunks], width=0.2,
           align='center', alpha=0.7, color=bar_color, label=bar_label)

# Legend
ax.legend(loc=2, facecolor='white', edgecolor='white')

# Ticks
ax.set_xticks(tick_pos)
ax.set_xticklabels(ticks_bar, size=28)
#ax.set_yscale('log')

# Labels
ax.set_xlabel('Settlement Size', size=32)
ax.set_ylabel('Population (millions)', size=32)
ax.set_title('Total Urban Population by Settlement Size for Africa 2000 & 2015')

# Save
# fig.savefig('/Users/cascade/Desktop/'+'PopAll_bar.png', dpi=700, transparent=False)


In [None]:
# Bar plot for GHS 2000 & 2015, Humid rain zone
#
# The header used to say "Arid", but the bars are drawn from the *_humid_chunks
# lists. To plot another zone, swap both chunk lists below.
# NOTE(review): the title does not name the rain zone - confirm whether it should.
#
# The stray `plt.figure()` call is gone (it only produced an extra, empty
# figure), as are the unused `a` and `c` position lists.

# Tick labels: one per size class, in the same order as the *_chunks lists
ticks_bar = ['<50K', '50-100K', '100-300K','300-500K', '500K-1M', '1-5M' , '>5M']

# Make plot
sns.set(font_scale=3)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(25, 15))
ax.set_facecolor('White')

# One entry per year: legend label, class totals, x offset, colour
tick_pos = np.arange(len(ticks_bar))
bar_series = [
    ('GHS-Pop 2000', GHS2000_humid_chunks, -0.1, 'Blue'),
    ('GHS-Pop 2015', GHS2015_humid_chunks, 0.1, 'Green'),
]

# Bars (totals converted to millions)
for bar_label, bar_chunks, bar_offset, bar_color in bar_series:
    ax.bar(tick_pos + bar_offset, [x / 10**6 for x in bar_chunks], width=0.2,
           align='center', alpha=0.7, color=bar_color, label=bar_label)

# Legend
ax.legend(loc=2, facecolor='white', edgecolor='white')

# Ticks
ax.set_xticks(tick_pos)
ax.set_xticklabels(ticks_bar, size=28)
#ax.set_yscale('log')

# Labels
ax.set_xlabel('Settlement Size', size=32)
ax.set_ylabel('Population (millions)', size=32)
ax.set_title('Total Urban Population by Settlement Size for Africa 2000 & 2015')

# Save
# fig.savefig('/Users/cascade/Desktop/'+'PopAll_bar.png', dpi=700, transparent=False)

### Lorenz Curves

In [None]:
# GHS 2015 rows with PopTot above 5 million.
# The former first line filtered on `col` / `area`, which are only defined in
# the next cell (NameError on a fresh kernel), and its result was overwritten
# straight away, so it has been removed.
test = GHS2015[GHS2015.PopTot > 5*10**6]
test

In [None]:
# Select Data for Lorenz

# Option A - all Africa (disabled):
# area = 'Africa'
# X = np.sort(np.array(GHS2000.PopTot))
# Y = np.sort(np.array(GHS2015.PopTot))

# Option B - one group of a categorical column
col = 'rain_zone'  # column to group on
area = 'Arid'      # value of `col` to keep

pop_2000 = GHS2000.loc[GHS2000[col] == area, 'PopTot']
pop_2015 = GHS2015.loc[GHS2015[col] == area, 'PopTot']

# Ascending PopTot values for 2000 (X) and 2015 (Y)
X = np.sort(np.array(pop_2000))
Y = np.sort(np.array(pop_2015))

for pop_values in (X, Y):
    print(len(pop_values))

# Optional upper limit (disabled):
# X = X[(X < 10**7)]
# Y = Y[(Y < 10**7)]

# Lengths again, so the effect of the limit above is visible when enabled
for pop_values in (X, Y):
    print(len(pop_values))

# Cumulative share of the total, with a leading 0 so each curve starts at the origin
X_lorenz = np.insert(X.cumsum() / X.sum(), 0, 0)  # curve one
Y_lorenz = np.insert(Y.cumsum() / Y.sum(), 0, 0)  # curve two

# Cell output: first and last point of curve two
Y_lorenz[0], Y_lorenz[-1]

In [None]:
# Lorenz Curve Plot

%matplotlib inline

## Size & number
sns.set(font_scale=3)
fig, ax1 = plt.subplots(nrows=1, ncols=1,  figsize=(10, 10))


# Curve Plots
# Alpha for Botswana and S. Leone is 0.7, else 0.5
ax1.scatter(np.arange(X_lorenz.size)/(X_lorenz.size-1), X_lorenz, 
           marker='.', color='Green', s=100, alpha = 0.7)

ax1.scatter(np.arange(Y_lorenz.size)/(Y_lorenz.size-1), Y_lorenz, 
           marker='.', color='#ff01bc', s=100, alpha = 0.7)

# Title
plt.title(area)


# Legend
leg = ['GHS-Pop 2000', 'GHS-Pop 2015']
plt.legend(leg,loc=2, markerscale=3)

## line plot of of 90% 40% 
# ax1.plot([0.9,0.9], [0,1] , color='k', alpha = 0.25)
# ax1.plot([0,1], [0.3,0.3] , color='k', alpha = 0.25)

#remove ticks
# ax.set_xticks([])
# ax.set_yticks([])

# Grid
plt.grid(b=True, color = 'k', alpha = 0.1, marker = '.')

# plt.setp(ax1.xaxis.get_gridlines(), clip_path = [0,1])
# plt.setp(ax1.yaxis.get_gridlines(), clip_path = [[0,0], [1,1]])
plt.figure.frameon = True
ax1.set_facecolor('White')

## line plot of of 90% 40% 
ax1.plot([0.9,0.9], [0,1] , color='k', alpha = 1)
#ax1.plot([0,1], [0.3,0.3] , color='k', alpha = 0.25)

#fig.savefig('/Users/cascade/Desktop/'+area+'_Lorenz.png', dpi=700)


### Descriptives

In [None]:
# All countries
# Distinct values of GHS2015.country, taken from the categories of a
# categorical view of that column.
country_cat = GHS2015.country.astype("category")
countries = country_cat.cat.categories.tolist()

# Peek at the first five
countries[:5]

In [None]:
# Drop Mega
# Keep only rows whose PopTot is below 10 million.
mega_cutoff = 10**7
data1 = GHS2000.loc[GHS2000.PopTot < mega_cutoff]
data2 = GHS2015.loc[GHS2015.PopTot < mega_cutoff]

In [None]:
## Descriptives and Zipfs Plots for all countries
#
# Changes from the earlier version of this cell:
#   * p-values are read from `fit.pvalue`; the old `fit[4]` is the standard
#     error of the slope, not the p-value;
#   * `sns.regplot` gets x/y as keywords, which newer seaborn requires;
#   * each figure is closed after saving, so the loop no longer keeps one open
#     figure per country (figures are therefore only available as the saved
#     PNGs, not as inline cell output);
#   * the unused `arr` list is gone and the seaborn style is set once.

# Keep only rows whose PopTot is below 10 million.
data1 = GHS2000[GHS2000.PopTot < 10**7]
data2 = GHS2015[GHS2015.PopTot < 10**7]

# One column per country. Rows, in order:
# count 2000, count 2015, median 2000, median 2015, gini 2000, gini 2015,
# Zipf slope 2000, p-value 2000, Zipf slope 2015, p-value 2015.
df = pd.DataFrame()

# Plot styling is the same for every country.
sns.set(font_scale=3)

for country in countries:

    test1 = data1[data1['country'] == country]
    test2 = data2[data2['country'] == country]

    # Counts
    num2000 = test1.PopTot.count()
    num2015 = test2.PopTot.count()

    # Median
    m2000 = test1.PopTot.median()
    m2015 = test2.PopTot.median()

    # Gini
    g2000 = gini(test1.PopTot)
    g2015 = gini(test2.PopTot)

    # Zipf: sizes sorted ascending, paired with ranks counting down to 1,
    # so the largest settlement has rank 1. Both axes are natural logs.
    X1_zipf = np.sort(test1.PopTot)
    Y1_zipf = np.arange(len(X1_zipf), 0, -1)
    X1_zipf_log = np.log(X1_zipf)
    Y1_zipf_log = np.log(Y1_zipf)

    X2_zipf = np.sort(test2.PopTot)
    Y2_zipf = np.arange(len(X2_zipf), 0, -1)
    X2_zipf_log = np.log(X2_zipf)
    Y2_zipf_log = np.log(Y2_zipf)

    # Linear fit of log rank on log size
    fit1 = ss.linregress(X1_zipf_log, Y1_zipf_log)
    fit2 = ss.linregress(X2_zipf_log, Y2_zipf_log)

    s2000 = fit1.slope   # Slope 2000
    p2000 = fit1.pvalue  # p val 2000
    s2015 = fit2.slope   # Slope 2015
    p2015 = fit2.pvalue  # p val 2015

    # Add this country's column to the table
    df[country] = (num2000, num2015, m2000, m2015, g2000, g2015, s2000, p2000, s2015, p2015)

    # Zipf's law plot

    ## Size & number
    fig, ax1 = plt.subplots(nrows=1, ncols=1,  figsize=(10, 10))

    # Points
    ax1.scatter(X1_zipf_log, Y1_zipf_log, marker='.', color='purple', s=100, alpha=0.7)
    ax1.scatter(X2_zipf_log, Y2_zipf_log, marker='.', color='orange', s=100, alpha=0.7)

    # Legend (created before the fit lines, so it labels the two scatters)
    leg = ['GHS-Pop 2000', 'GHS-Pop 2015']
    ax1.legend(leg, loc=1, markerscale=3, facecolor='white', edgecolor='white')

    # Fit lines
    sns.regplot(x=X1_zipf_log, y=Y1_zipf_log, color='purple', ax=ax1)
    sns.regplot(x=X2_zipf_log, y=Y2_zipf_log, color='orange', ax=ax1)

    # Title
    ax1.set_title(country)

    # Labels
    ax1.set_xlabel('')
    ax1.set_ylabel('')

    # Ticks placed at the natural log of each power of ten
    ax1.set_xticks([np.log(10**4), np.log(10**5), np.log(10**6), np.log(10**7)])
    ax1.set_xticklabels(['$10^4$', '$10^5$', '$10^6$', '$10^7$'])

    ax1.set_yticks([np.log(10), np.log(100), np.log(1000), np.log(10000),  np.log(100000)])
    ax1.set_yticklabels(['10', '$10^2$', '$10^3$', '$10^4$', '$10^5$'])

    # Background colour
    ax1.set_facecolor('White')

    # Same axis limits for every country
    ax1.set_xlim([7,16])
    #ax1.set_xlim([min(X2_zipf_log), max(X2_zipf_log)])
    ax1.set_ylim([1,8])

    ## Save Zipfs, then release the figure
    fig.savefig('/Users/cascade/Desktop/Zipfs20190429/'+country+'_Zipf.png', dpi=300)
    plt.close(fig)
    

In [None]:
#df.to_csv(data_analysis+'ALLCOUNTRIES_TABLE2_ERL20190429.csv')

In [None]:
## Africa
# Whole-sample descriptives for data1 (2000) and data2 (2015): row count,
# median PopTot, Gini coefficient, then Zipf slope and p-value.
#
# Fixed: p-values are read from `fit.pvalue`; the old `fit[4]` is the standard
# error of the slope, not the p-value.

print(len(data1))
print(len(data2))

print(data1.PopTot.median())
print(data2.PopTot.median())

print(gini(data1.PopTot))
print(gini(data2.PopTot))


# Zipf: sizes sorted ascending, paired with ranks counting down to 1,
# so the largest settlement has rank 1. Both axes are natural logs.
X1_zipf = np.sort(data1.PopTot)
Y1_zipf = np.arange(len(X1_zipf), 0, -1)
X1_zipf_log = np.log(X1_zipf)
Y1_zipf_log = np.log(Y1_zipf)

X2_zipf = np.sort(data2.PopTot)
Y2_zipf = np.arange(len(X2_zipf), 0, -1)
X2_zipf_log = np.log(X2_zipf)
Y2_zipf_log = np.log(Y2_zipf)

# Linear fit of log rank on log size
fit1 = ss.linregress(X1_zipf_log, Y1_zipf_log)
fit2 = ss.linregress(X2_zipf_log, Y2_zipf_log)

s2000 = fit1.slope   # Slope 2000
p2000 = fit1.pvalue  # p val 2000

s2015 = fit2.slope   # Slope 2015
p2015 = fit2.pvalue  # p val 2015

print('')
print(s2000)
print(p2000)
print('')
print(s2015)
print(p2015)

### Counts

In [None]:
# Read the GHS 2000 urban-merge shapefile into the scratch variable `test`.
ghs2000_merge_path = data_analysis + 'GHS_POP_GPW42000_urbanmerge_PopTot.shp'
test = gpd.read_file(ghs2000_merge_path, driver = 'ESRI Shapefile')

In [None]:
# Author's note: the urban-merge files still have duplicates.
# (These are the same reads as in the "Clean data" section of this notebook.)

def _shp(file_name):
    """Return the full path of a shapefile inside the data_analysis folder."""
    return data_analysis + file_name

# GHS-Pop (noted by the author as good)
GHS2000m = gpd.read_file(_shp('GHS_POP_GPW42000_urbanmerge_PopTot.shp'), driver = 'ESRI Shapefile')
GHS2015m = gpd.read_file(_shp('GHS_POP_GPW42015_urbanmerge_PopTot.shp'), driver = 'ESRI Shapefile')

## Load Data

# WP 2015: main file, then the South Sudan file
WP2015_dup = gpd.read_file(_shp('AFR_PPP_2015_adj_v2_final20190122.shp'))
WP2015_Sudan = gpd.read_file(_shp('AFR_PPP_2015_adj_v2_S_Sudan_1500c300_polyoverlapPopTot.shp'))

# WPE: main file, then the South Sudan file
WPE2016_dup = gpd.read_file(_shp('WPE_1KM_2016_final20190122.shp'))
WPE2016_Sudan = gpd.read_file(_shp('WPE_1KM_2016_Pop_Clip_S_Sudan_1500c300_polyoverlapPopTot.shp'))

# LS: main file, then the South Sudan file
LS2015_dup = gpd.read_file(_shp('LS15_final20190122.shp'))
LS2015_Sudan = gpd.read_file(_shp('LS15_w001001_Clip_S_Sudan_1500c300_polyoverlapPopTot.shp'))


In [None]:
# merge

# Append the South Sudan rows to LS2015, printing the row counts of both
# inputs first and of the merged frame afterwards.
# NOTE(review): the loader comment earlier in this notebook says the 20190222
# files already have S Sudan added - confirm LS2015 here is not one of those,
# otherwise these rows would be counted twice.
frames = [LS2015, LS2015_Sudan]
for ls_part in frames:
    print(len(ls_part))

LS2015m = pd.concat(frames)
print(len(LS2015m))

In [None]:
# Rows of GHS2000 whose osm_type is 'city'
is_city = GHS2000['osm_type'] == 'city'
GHS2000.loc[is_city]

In [None]:
# 8368 minus the number of GHS2000m rows whose osm_type is 'town'
# NOTE(review): 8368 is a hard-coded count - confirm what it refers to.
is_town = GHS2000m['osm_type'] == 'town'
8368 - len(GHS2000m[is_town])

In [None]:
# GHS2015 rows with PopTot above 5 million
over_5m = GHS2015['PopTot'] > 5*10**6
test = GHS2015[over_5m]
test

In [None]:
# Histogram of natural-log PopTot for GHS2015, then the standard deviation
# of the same logged values as the cell output.
log_pop_2015 = np.log(GHS2015.PopTot)
plt.hist(log_pop_2015)
np.std(log_pop_2015)