# City Size Distribution Analysis

This note book produces Lorenz Curves and Zipf Law Estimations, along with gini coefficents.

Code adapted for Lorzen curves from: https://zhiyzuo.github.io/Plot-Lorenz/

-Cascade 2019-02-04

In [None]:
import geopandas as gpd
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from matplotlib.legend_handler import HandlerLine2D
from scipy import stats as ss

## Data In

In [None]:
# File paths

data_raw = '../../data/raw/'
data_temp = '../../temp_data/'
data_interim = '../../interim/'
data_analysis = '../../temp_data/ERL_data/Data20190222/'
erl_data = '../../temp_data/ERL_data/'
downloads = '/Users/cascade/Downloads/'

In [None]:
# Data

# GHS2000 = gpd.read_file(erl_data+'Data20190222/GHS_POP_GPW42000_final20190122.shp')
# GHS2015 = gpd.read_file(erl_data+'Data20190222/GHS_POP_GPW42015_final20190122.shp')
# WP2000 = gpd.read_file(erl_data+'Data20190222/AFR_PPP_2000_adj_v2_final20190122.shp')
# WP2015 = gpd.read_file(erl_data+'Data20190222/AFR_PPP_2015_adj_v2_final20190122.shp')
# LS2015 = gpd.read_file(erl_data+'Data20190222/WPE_1KM_2016_final20190122.shp')
# WPE2016 = gpd.read_file(erl_data+'Data20190222/LS15_final20190122.shp')

# Data with Rain Zones & Regions & FIDs Dropped (2019-02-13)

# GHS2000 = gpd.read_file(erl_data+'GHS_POP_GPW42000_20190213.shp')
# GHS2015 = gpd.read_file(erl_data+'GHS_POP_GPW42015_20190213.shp')
# WP2000 = gpd.read_file(erl_data+'AFR_PPP_2000_adj_v2_20190213.shp')
# WP2015 = gpd.read_file(erl_data+'AFR_PPP_2015_adj_v2_20190213.shp')
# LS2015 = gpd.read_file(erl_data+'WPE_1KM_2016_20190213.shp')
# WPE2016 = gpd.read_file(erl_data+'LS15_20190213.shp')

# Load data: ...20190222.shp files have FIDs removed, S Sudan added, rainfall zones, and regions

GHS2000 = gpd.read_file(data_analysis+'GHS_POP_GPW42000_20190222.shp', driver = 'ESRI Shapefile')
GHS2015 = gpd.read_file(data_analysis+'GHS_POP_GPW42015_20190222.shp', driver = 'ESRI Shapefile')
WP2000 = gpd.read_file(data_analysis+'AFR_PPP_2000_adj_v2_20190222.shp', driver = 'ESRI Shapefile')
WP2015 = gpd.read_file(data_analysis+'AFR_PPP_2015_adj_v2_20190222.shp', driver = 'ESRI Shapefile')
LS2015 = gpd.read_file(data_analysis+'WPE_1KM_2016_20190222.shp', driver = 'ESRI Shapefile')
WPE2016 = gpd.read_file(data_analysis+'LS15_20190222.shp', driver = 'ESRI Shapefile')

In [None]:
# List of datasets

datasets_in = [GHS2000, GHS2015, WP2000, WP2015, LS2015, WPE2016]

In [None]:
for dataset in datasets_in:
    print(len(dataset))

In [None]:
GHS2000.head()

## Functions

In [None]:
def gini(series):
    
    "Function calculates gini coefficent based on https://zhiyzuo.github.io/Plot-Lorenz/"
    ## series to array
    arr = series.values
    
    ## first sort
    sorted_arr = arr.copy(np.array)
    sorted_arr.sort()
    n = arr.size
    coef_ = 2. / n
    const_ = (n + 1.) / n
    weighted_sum = sum([(i+1)*yi for i, yi in enumerate(sorted_arr)])
    return coef_*weighted_sum/(sorted_arr.sum()) - const_

In [None]:
def lorenz_curve(X):
    
    "Function plots Lorenz curves based on https://zhiyzuo.github.io/Plot-Lorenz/"
    X_lorenz = X.cumsum() / X.sum()
    X_lorenz = np.insert(X_lorenz, 0, 0) 
    X_lorenz[0], X_lorenz[-1]
    fig, ax = plt.subplots(figsize=[6,6])
    ## scatter plot of Lorenz curve
    ax.scatter(np.arange(X_lorenz.size)/(X_lorenz.size-1), X_lorenz, 
               marker='x', color='darkgreen', s=100)
    ## line plot of equality
    ax.plot([0,1], [0,1], color='k')

In [None]:
def gini_group(gpd_df, group, value, outcol):
    "Function calculates gini coef. for by groups for a pandas data frame. Returns data frame"
    
    gini = test.groupby(group).value.agg(gini)
    df = pd.DataFrame()
    df[group] = out.keys()
    df[outcol] = out.values

    return df

## Plots

In [None]:
# Select Data for Lorenz

# All Africa

# area = 'Africa'
# X = np.sort(np.array(GHS2000.PopTot))
# Y = np.sort(np.array(GHS2015.PopTot))

# By group
col = 'rain_zone' # column
area = 'Arid' # geography to distribut 

X = np.sort(np.array(GHS2000[GHS2000[col] == area].PopTot))
Y = np.sort(np.array(GHS2015[GHS2015[col] == area].PopTot))


print(len(X))
print(len(Y))

# Set Limits
X = X[(X < 5*10**6)]
Y = Y[(Y < 5*10**6)]

print(len(X))
print(len(Y))

# Data for curve one
X_lorenz = X.cumsum() / X.sum()
X_lorenz = np.insert(X_lorenz, 0, 0)
X_lorenz[0], X_lorenz[-1]

# Data for curve two 
Y_lorenz = Y.cumsum() / Y.sum()
Y_lorenz = np.insert(Y_lorenz, 0, 0)
Y_lorenz[0], Y_lorenz[-1]

In [None]:
# Lorenz Curve Plot

%matplotlib inline

## Size & number
sns.set(font_scale=3)
fig, ax1 = plt.subplots(nrows=1, ncols=1,  figsize=(10, 10))


# Curve Plots
# Alpha for Botswana and S. Leone is 0.7, else 0.5
ax1.scatter(np.arange(X_lorenz.size)/(X_lorenz.size-1), X_lorenz, 
           marker='.', color='Green', s=100, alpha = 0.7)

ax1.scatter(np.arange(Y_lorenz.size)/(Y_lorenz.size-1), Y_lorenz, 
           marker='.', color='#ff01bc', s=100, alpha = 0.7)


# Title
plt.title(area)


# Legend
leg = ['GHS-Pop 2000', 'GHS-Pop 2015']
plt.legend(leg,loc=2, markerscale=3)

## line plot of of 90% 40% 
# ax1.plot([0.9,0.9], [0,1] , color='k', alpha = 0.25)
# ax1.plot([0,1], [0.3,0.3] , color='k', alpha = 0.25)

#remove ticks
# ax.set_xticks([])
# ax.set_yticks([])

# Set Color
plt.grid(b=True, color = 'k', alpha = 0.5, marker = '.')

# plt.setp(ax1.xaxis.get_gridlines(), clip_path = [0,1])
# plt.setp(ax1.yaxis.get_gridlines(), clip_path = [[0,0], [1,1]])
plt.figure.frameon = True
ax1.set_facecolor('White')


fig.savefig('/Users/cascade/Desktop/'+area+'_Lorenz.png', dpi=700)



In [None]:
# Select Data Zipf's law

# All Africa
area = 'Africa'
data1 = np.sort(np.array(GHS2000.PopTot))
data2 = np.sort(np.array(GHS2015.PopTot))

# By country 
# col = 'country' # column
# area = 'Ghana' # geography to distribut 

# data1 = np.sort(np.array(GHS2000[GHS2000[col] == area].PopTot))
# data2 = np.sort(np.array(GHS2015[GHS2015[col] == area].PopTot))

print(len(data1))
print(len(data2))

# Set Limits
data1 = data1[(data1 < 5*10**6)]
data2 = data2[(data2 < 5*10**6)]

print(len(data1))
print(len(data2))

X1_zipf = data1 #sort the values
Y1_zipf = list(range(1, len(X1_zipf)+1)) # make a range
Y1_zipf = Y1_zipf[::-1] # Re order range

X1_zipf_log = np.log(X1_zipf)
Y1_zipf_log = np.log(Y1_zipf)

X2_zipf = data2
Y2_zipf = list(range(1, len(X2_zipf)+1))
Y2_zipf = Y2_zipf[::-1]

X2_zipf_log = np.log(X2_zipf)
Y2_zipf_log = np.log(Y2_zipf)

In [None]:
# Zipf's law

## Size & number
sns.set(font_scale=3)
fig, ax1 = plt.subplots(nrows=1, ncols=1,  figsize=(10, 10))

# Plot
plt.scatter(X1_zipf_log, Y1_zipf_log , marker='.', color='purple', s=100, alpha = 0.7)
plt.scatter(X2_zipf_log, Y2_zipf_log , marker='.', color='orange', s=100, alpha = 0.7)

# Legend
leg = ['GHS-Pop 2000', 'GHS-Pop 2015']
plt.legend(leg,loc=1, markerscale=3)

# Fit Lines
sns.regplot(X1_zipf_log, Y1_zipf_log, color = 'purple')
sns.regplot(X2_zipf_log, Y2_zipf_log, color = 'orange')

# Title
plt.title(area)

# Labels
plt.xlabel('')
plt.ylabel('')

# Set Ticks
#plt.yticks([1, 2, 3, 4, 5, 6, 7])

# Set Background Color
ax1.set_facecolor('White')

axes = plt.gca()
axes.set_xlim([7,16])
#axes.set_xlim([min(X2_zipf_log), max(X2_zipf_log)])
axes.set_ylim([1,11])

fig.savefig('/Users/cascade/Desktop/'+area+'_Zipf.png', dpi=700)

In [None]:
countries = ['Ghana, South Africa']

## Analysis

In [None]:
# Subset of Countries 
countries = ['Morocco', 'Mali', 'Senegal', 'Sierra Leone', 'Ghana',
            'Nigeria', 'Central African Republic', 'South Sudan',
            'Uganda', 'Kenya', 'Ethiopia', 'Angola', 'Botswana',
            'Zambia', 'South Africa']

In [139]:
# Decriptives

for country in countries: 
    
    print(country)

    test1 = GHS2000[GHS2000['country'] == country]
    test1 = test1[test1.PopTot <5*10**6]
    test2 = GHS2015[GHS2015['country'] == country]
    test2 = test2[test2.PopTot <5*10**6]
    
    print(test1.PopTot.count())
    print(test2.PopTot.count())
    print(test1.PopTot.median())
    print(test2.PopTot.median())
    
    # gini coef. 
    print(gini((test1.PopTot)))
    print(gini((test2.PopTot)))

Morocco
199
203
23257.623413085938
26597.354858398438
0.7422482536246433
0.767668256174427
Mali
157
186
11893.61264038086
13902.646545410156
0.6108782955039849
0.7164657145184983
Senegal
76
84
23770.626083374023
30857.70101928711
0.7216108360104942
0.749961004227315
Sierra Leone
35
39
13246.488098144531
15997.72738647461
0.6891147589184914
0.760918334658931
Ghana
227
218
19710.7421875
21813.060302734375
0.6906064873257425
0.7700491057254857
Nigeria
313
309
47794.46502685547
62549.62283325195
0.7033706955846382
0.7088897642781433
Central African Republic
63
74
14224.431640625
14653.702713012695
0.6113292756665569
0.6351955724688167
South Sudan
53
74
31285.855102539062
38382.74351501465
0.5167983789842183
0.556405298995936
Uganda
133
136
21309.400390625
30961.794998168945
0.6373470493367848
0.6703176264281008
Kenya
119
125
20530.794494628906
27990.63543701172
0.8032027146122462
0.7780787175709349
Ethiopia
338
395
34785.132080078125
44010.26153564453
0.5314698321498226
0.5875220731276363


  


In [138]:
for country in countries: 
    data1 = np.sort(np.array(GHS2000[GHS2000['country'] == country].PopTot))
    data2 = np.sort(np.array(GHS2015[GHS2015['country'] == country].PopTot))
    
    # Order the data for Zipf's law
    X1_zipf = data1 #sort the values
    Y1_zipf = list(range(1, len(X1_zipf)+1)) # make a range
    Y1_zipf = Y1_zipf[::-1] # Re order range

    X1_zipf_log = np.log(X1_zipf)
    Y1_zipf_log = np.log(Y1_zipf)

    X2_zipf = data2
    Y2_zipf = list(range(1, len(X2_zipf)+1))
    Y2_zipf = Y2_zipf[::-1]

    X2_zipf_log = np.log(X2_zipf)
    Y2_zipf_log = np.log(Y2_zipf)
    
    fit1 = ss.linregress(X1_zipf_log, Y1_zipf_log)
    fit2 = ss.linregress(X2_zipf_log, Y2_zipf_log)

    print(country)
    print(fit1[0]) # Slope 2000
    print(fit1[4]) # p val 2000
    print(fit2[0]) # Slope 2015
    print(fit2[4]) # p val 2015

Morocco
-0.7869330529154128
0.009736300239490831
-0.7454362309175601
0.009866637547273537
Mali
-1.1576880009061385
0.015927122880549618
-0.9942876338676249
0.012196160970054973
Senegal
-0.6598231486799426
0.019795314722669568
-0.6522203225202147
0.0162683266408574
Sierra Leone
-0.7558727435045245
0.017370543541014423
-0.7098658774383695
0.013036950397134931
Ghana
-0.8905044486821306
0.014734008502443881
-0.8181063857298773
0.013634865207040088
Nigeria
-0.7157451424470835
0.01281735304379978
-0.6190849944196499
0.011448342951896817
Central African Republic
-1.0205556054838338
0.043797433635911445
-1.0107516853661755
0.036985258497378416
South Sudan
-0.775759376005855
0.04585770570595824
-0.7602120762046335
0.03531850230135764
Uganda
-0.9987033253032724
0.02209641494688064
-0.9095042402436657
0.024178537494585924
Kenya
-0.7220487907607783
0.006803751911797741
-0.6538601668808498
0.008370600383722466
Ethiopia
-0.9615909514238148
0.022330118281020352
-0.865144294667185
0.017887224528347404

In [165]:
bigcity = GHS2015[GHS2015.PopTot >= 5*10**6]

city = GHS2000[GHS2000['country'] == 'Tanzania']
city.sort_values('PopTot', ascending = False)

Unnamed: 0,osm_id,FID,country,city,osm_type,lat,lon,PopTot,aez_class,dataset,region,rain_zone,geometry
3111,258168998,36510,Tanzania,Kibaha,town,-6.786432,38.992017,2.256449e+06,Tropic - warm / subhumid,GHS-Pop 2000,Eastern_Africa,Sub-humid,POLYGON ((39.13849149360179 -6.621131756725365...
2702,387793210,33743,Tanzania,Marangu,town,-3.283763,37.522982,5.028294e+05,Tropic - cool / subhumid,GHS-Pop 2000,Eastern_Africa,Sub-humid,POLYGON ((37.28576044075393 -3.146474895606659...
2701,282575938,33692,Tanzania,Usa River,town,-3.372322,36.857871,4.889412e+05,Tropic - cool / subhumid,GHS-Pop 2000,Eastern_Africa,Sub-humid,POLYGON ((36.63946698767863 -3.275955701122549...
3912,2147483647,32791,Tanzania,Mwanza,city,-2.517624,32.898452,3.772602e+05,Tropic - warm / subhumid,GHS-Pop 2000,Eastern_Africa,Sub-humid,POLYGON ((32.89937489414467 -2.450614780344544...
3099,27252328,35967,Tanzania,Bububu,town,-6.098863,39.220446,3.506060e+05,Tropic - warm / subhumid,GHS-Pop 2000,Eastern_Africa,Sub-humid,"POLYGON ((39.22279677149679 -6.06987266015725,..."
2706,965845175,34988,Tanzania,Ujiji,town,-4.908373,29.692033,2.033541e+05,Tropic - warm / subhumid,GHS-Pop 2000,Eastern_Africa,Sub-humid,POLYGON ((29.67207021479413 -4.789805763174645...
3902,252592807,36434,Tanzania,Morogoro,city,-6.816163,37.669389,1.967321e+05,Tropic - warm / subhumid,GHS-Pop 2000,Eastern_Africa,Sub-humid,POLYGON ((37.65220879823043 -6.758982601449925...
3905,255594923,37530,Tanzania,Mbeya,city,-8.906508,33.468672,1.935228e+05,Tropic - cool / subhumid,GHS-Pop 2000,Eastern_Africa,Sub-humid,"POLYGON ((33.4575119862172 -8.869351901661778,..."
3906,255615843,35143,Tanzania,Tanga,city,-5.072198,39.099346,1.927514e+05,Tropic - warm / subhumid,GHS-Pop 2000,Eastern_Africa,Sub-humid,POLYGON ((39.04082123596215 -5.048977728340151...
3901,87841198,35946,Tanzania,Dodoma,city,-6.179118,35.746817,1.753363e+05,Tropic - warm / semiarid,GHS-Pop 2000,Eastern_Africa,Semi-arid,POLYGON ((35.73979729138311 -6.102293494791687...


# Old Code

In [None]:
# test = GHS2000[GHS2000['PopTot'] < 5*10**6]
# out = test.groupby('country').PopTot.agg(gini)
# gini2015_df['gini-00'] = out.values

In [None]:
# gini2015_df = pd.DataFrame()
# gini2015_df['country'] = out.keys()
# gini2015_df['gini-15'] = out.values

In [None]:
# out.keys()

In [None]:
# gini2015_df['gini-dif'] = gini2015_df['gini-15'] - gini2015_df['gini-00']

In [None]:
# gini2015_df

In [None]:
# fao_ag = pd.read_csv(data_temp+'Macro-Statistics_Key_Indicators_E_Africa.csv')

In [None]:
# Value added as share gpd 2010 prices

# value = fao_ag.loc[(fao_ag['Item Code'] == 22016) & (fao_ag['Element Code'] == 6169)]

In [None]:
# ag2000_df = value.filter(['Area', 'Y2000'])
# ag2000_df.columns = ag_df.columns.str.replace('Area', 'country')

In [None]:
# ag_result = pd.merge(ag2000_df, ag2015_df, on='country', how='inner')
# ag_result

In [None]:
# result = pd.merge(gini_df, ag_df, on='country', how='inner')

In [None]:
# plt.scatter(result.gini, result.Y2015, marker='.', color='Red', s=100, alpha = 0.5)
# plt.xlabel('Urban Settlement Gini Coeff by Country 2015')
# plt.ylabel('Value added by Agriculture as Share of GDP')