# City Size Distribution Analysis

This notebook produces Lorenz curves and Zipf's law estimations, along with code to calculate Gini coefficients and Zipf's alphas for all countries in the dataset.

Code for Lorenz curves adapted from: https://zhiyzuo.github.io/Plot-Lorenz/

-Cascade Tuholske 2019-02-04

In [None]:
import geopandas as gpd
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from matplotlib.legend_handler import HandlerLine2D
from scipy import stats as ss

## Data In

In [None]:
# File paths
# All locations are relative to the notebook's working directory.

_root = '../../'

data_raw = _root + 'data/raw/'
data_temp = _root + 'temp_data/'
data_interim = _root + 'interim/'
erl_data = data_temp + 'ERL_data/'
data_analysis = erl_data + 'Data20190222/'  # folder the shapefiles are read from
downloads = ''

In [None]:
# Data

# Load data: ...20190222.shp files have FIDs removed, S Sudan added, rainfall zones, and regions
# Each variable is a GeoDataFrame read from the shapefile of the same product/year.

GHS2000 = gpd.read_file(data_analysis+'GHS_POP_GPW42000_20190222.shp', driver = 'ESRI Shapefile')
GHS2015 = gpd.read_file(data_analysis+'GHS_POP_GPW42015_20190222.shp', driver = 'ESRI Shapefile')
WP2000 = gpd.read_file(data_analysis+'AFR_PPP_2000_adj_v2_20190222.shp', driver = 'ESRI Shapefile')
WP2015 = gpd.read_file(data_analysis+'AFR_PPP_2015_adj_v2_20190222.shp', driver = 'ESRI Shapefile')
# The LS15 and WPE file names were previously swapped between these two variables.
LS2015 = gpd.read_file(data_analysis+'LS15_20190222.shp', driver = 'ESRI Shapefile')
WPE2016 = gpd.read_file(data_analysis+'WPE_1KM_2016_20190222.shp', driver = 'ESRI Shapefile')

In [None]:
# List of datasets

# Every loaded layer, in load order.
datasets_in = [
    GHS2000,
    GHS2015,
    WP2000,
    WP2015,
    LS2015,
    WPE2016,
]

# The same layers without the two year-2000 ones.
datasets_in15 = [
    GHS2015,
    WP2015,
    LS2015,
    WPE2016,
]

## Functions

In [None]:
def gini(series):
    """Calculate the Gini coefficient of a set of values.

    Based on https://zhiyzuo.github.io/Plot-Lorenz/. With the values sorted
    ascending as y_1..y_n the result is

        G = 2 * sum(i * y_i) / (n * sum(y_i)) - (n + 1) / n

    Args:
        series: pandas Series (or any 1-D array-like) of values.
            The input is not modified.

    Returns:
        float: the Gini coefficient, or ``nan`` when the input is empty or
        its values sum to zero (the coefficient is undefined in both cases).
    """
    # np.sort returns a sorted copy, so the caller's data is left untouched.
    sorted_arr = np.sort(np.asarray(series, dtype=float))
    n = sorted_arr.size
    total = sorted_arr.sum()

    # Guard the two divisions below.
    if n == 0 or total == 0:
        return np.nan

    coef_ = 2. / n
    const_ = (n + 1.) / n
    # Rank-weighted sum: rank 1 for the smallest value, n for the largest.
    weighted_sum = np.dot(np.arange(1, n + 1), sorted_arr)
    return coef_ * weighted_sum / total - const_

In [None]:
def lorenz_curve(X):
    """Plot a Lorenz curve for X, after https://zhiyzuo.github.io/Plot-Lorenz/

    X is used in the order given (it is not sorted here), so callers are
    presumably expected to pass values already sorted ascending.
    Draws on a new 6x6 figure and returns nothing.
    """
    # Cumulative share of the total, with the origin (0) prepended
    cum_share = np.insert(X.cumsum() / X.sum(), 0, 0)

    # Evenly spaced positions from 0 to 1, one per point
    n_points = cum_share.size
    positions = np.arange(n_points) / (n_points - 1)

    fig, ax = plt.subplots(figsize=[6,6])
    # Lorenz curve as a scatter of x markers
    ax.scatter(positions, cum_share, marker='x', color='darkgreen', s=100)
    # Diagonal line of perfect equality
    ax.plot([0,1], [0,1], color='k')

In [None]:
def gini_group(gpd_df, group, value, outcol):
    """Calculate the Gini coefficient of one column within each group.

    Args:
        gpd_df: (geo)pandas data frame holding the data
        group: name of the column to group by
        value: name of the column whose Gini coefficient is calculated
        outcol: name of the output column that receives the coefficients

    Returns:
        pandas DataFrame with one row per group and two columns:
        ``group`` (the group keys) and ``outcol`` (the coefficients).
    """
    # Use a distinct local name: assigning to `gini` here would shadow the
    # module-level gini() function that agg() needs.
    out = gpd_df.groupby(group)[value].agg(gini)

    df = pd.DataFrame()
    df[group] = out.keys()
    df[outcol] = out.values

    return df

In [None]:
def gini_alt(series):
    """Calculate the Gini coefficient, after https://www.jstor.org/stable/pdf/177185.pdf

    With the values sorted ascending as y_1..y_n the result is

        G = sum((2*i - n - 1) * y_i) / (n**2 * mean(y))

    Args:
        series: pandas Series (or any 1-D array-like) of values.
            The input is not modified.

    Returns:
        float: the Gini coefficient, or ``nan`` when the input is empty or
        its mean is zero (the coefficient is undefined in both cases).
    """
    # np.sort returns a sorted copy, so the caller's data is left untouched.
    sorted_arr = np.sort(np.asarray(series, dtype=float))

    # parameters
    n = sorted_arr.size
    if n == 0:
        return np.nan
    est_ = (n ** 2) * sorted_arr.mean()
    if est_ == 0:
        return np.nan

    # equation: rank i runs from 1 (smallest value) to n (largest value)
    ranks = np.arange(1, n + 1)
    weighted_sum = np.sum((2. * ranks - n - 1) * sorted_arr)

    # for unbiased estimator use
    # const_ = (n / (n - 1.))
    # return weighted_sum / est_ * const_

    return weighted_sum / est_

## Plots

Change `area` and/or the grouping column `col` before running each plot.

In [None]:
# Select Data for Lorenz

# Alternative selection (disabled): all of Africa
# area = 'Africa'
# X = np.sort(np.array(GHS2000.PopTot))
# Y = np.sort(np.array(GHS2015.PopTot))

# Selection by group
col = 'rain_zone'  # column to filter on
area = 'Humid'  # value of `col` to keep

# Ascending PopTot values of the selected rows, 2000 (X) and 2015 (Y)
in_area_2000 = GHS2000[col] == area
in_area_2015 = GHS2015[col] == area
X = np.sort(GHS2000.loc[in_area_2000, 'PopTot'].values)
Y = np.sort(GHS2015.loc[in_area_2015, 'PopTot'].values)

print(len(X))
print(len(Y))

# Set Limits: keep only values below 5 million
X = X[X < 5*10**6]
Y = Y[Y < 5*10**6]

print(len(X))
print(len(Y))

# Lorenz ordinates: cumulative share of the total, with the origin prepended
X_lorenz = np.insert(X.cumsum() / X.sum(), 0, 0)  # curve one
Y_lorenz = np.insert(Y.cumsum() / Y.sum(), 0, 0)  # curve two

# Cell output: first and last ordinate of curve two
Y_lorenz[0], Y_lorenz[-1]

In [None]:
# Tabulate curve two: value, cumulative share and rank fraction per point
# (a leading 0 is prepended to Y so it lines up with Y_lorenz)
df = pd.DataFrame({
    'pop': np.insert(Y, 0, 0),
    'pct': Y_lorenz,
    'rank': np.arange(Y_lorenz.size)/(Y_lorenz.size-1),
})

# Cell output: rows from position 560 onwards
df.iloc[560:]

In [None]:
# Lorenz Curve Plot

%matplotlib inline

## Size & number
sns.set(font_scale=3)
fig, ax1 = plt.subplots(nrows=1, ncols=1,  figsize=(10, 10))


# Curve Plots
# Alpha for Botswana and S. Leone is 0.7, else 0.5
ax1.scatter(np.arange(X_lorenz.size)/(X_lorenz.size-1), X_lorenz, 
           marker='.', color='Green', s=100, alpha = 0.7)

ax1.scatter(np.arange(Y_lorenz.size)/(Y_lorenz.size-1), Y_lorenz, 
           marker='.', color='#ff01bc', s=100, alpha = 0.7)

# Title
plt.title(area)


# Legend
leg = ['GHS-Pop 2000', 'GHS-Pop 2015']
plt.legend(leg,loc=2, markerscale=3)

## line plot of of 90% 40% 
# ax1.plot([0.9,0.9], [0,1] , color='k', alpha = 0.25)
# ax1.plot([0,1], [0.3,0.3] , color='k', alpha = 0.25)

#remove ticks
# ax.set_xticks([])
# ax.set_yticks([])

# Set Color
plt.grid(b=True, color = 'k', alpha = 0.5, marker = '.')

# plt.setp(ax1.xaxis.get_gridlines(), clip_path = [0,1])
# plt.setp(ax1.yaxis.get_gridlines(), clip_path = [[0,0], [1,1]])
plt.figure.frameon = True
ax1.set_facecolor('White')


fig.savefig(area+'_Lorenz.png', dpi=700)



In [None]:
# Select Data Zipf's law

# Alternative selection (disabled): all of Africa
# area = 'Africa'
# data1 = np.sort(np.array(GHS2000.PopTot))
# data2 = np.sort(np.array(GHS2015.PopTot))

# Selection by country
col = 'country'  # column to filter on
area = 'Ethiopia'  # value of `col` to keep

# Ascending PopTot values of the selected rows, 2000 (data1) and 2015 (data2)
data1 = np.sort(GHS2000.loc[GHS2000[col] == area, 'PopTot'].values)
data2 = np.sort(GHS2015.loc[GHS2015[col] == area, 'PopTot'].values)

print(len(data1))
print(len(data2))

# Set Limits: keep only values below 5 million
data1 = data1[data1 < 5*10**6]
data2 = data2[data2 < 5*10**6]

print(len(data1))
print(len(data2))

# Sizes are ascending, so ranks count down from n to 1 (largest gets rank 1)
X1_zipf = data1
Y1_zipf = list(range(len(X1_zipf), 0, -1))

X2_zipf = data2
Y2_zipf = list(range(len(X2_zipf), 0, -1))

# Natural logs of size and rank
X1_zipf_log = np.log(X1_zipf)
Y1_zipf_log = np.log(Y1_zipf)

X2_zipf_log = np.log(X2_zipf)
Y2_zipf_log = np.log(Y2_zipf)

In [None]:
# Zipf's law
# Plots log(rank) against log(size) for both series with fitted regression
# lines and saves the figure as '<area>_Zipf.png' in the working directory.

## Size & number
sns.set(font_scale=3)
fig, ax1 = plt.subplots(nrows=1, ncols=1,  figsize=(10, 10))

# Plot
plt.scatter(X1_zipf_log, Y1_zipf_log , marker='.', color='purple', s=100, alpha = 0.7)
plt.scatter(X2_zipf_log, Y2_zipf_log , marker='.', color='orange', s=100, alpha = 0.7)

# Legend
# leg = ['GHS-Pop 2000', 'GHS-Pop 2015']
# plt.legend(leg,loc=1, markerscale=3, facecolor = 'white', edgecolor = 'white')

# Fit Lines
# x and y are passed by keyword: newer seaborn no longer accepts them positionally.
sns.regplot(x=X1_zipf_log, y=Y1_zipf_log, color = 'purple')
sns.regplot(x=X2_zipf_log, y=Y2_zipf_log, color = 'orange')

# Title
plt.title(area)

# Labels
plt.xlabel('')
plt.ylabel('')

# Set Ticks (axes hold natural logs; ticks are labelled with powers of ten)
plt.xticks([np.log(10**4), np.log(10**5), np.log(10**6), np.log(10**7)], 
           ['$10^4$', '$10^5$', '$10^6$', '$10^7$'])

plt.yticks([np.log(10), np.log(100), np.log(1000), np.log(10000),  np.log(100000)], 
           ['10', '$10^2$', '$10^3$', '$10^4$', '$10^5$'])

# Set Background Color
ax1.set_facecolor('White')

axes = plt.gca()
axes.set_xlim([7,16])
#axes.set_xlim([min(X2_zipf_log), max(X2_zipf_log)])
axes.set_ylim([1,8])

# Saved relative to the working directory, like the other figures in this
# notebook, instead of a user-specific absolute Desktop path.
fig.savefig(area+'_Zipf.png', dpi=700)

## Analysis

In [None]:
# Subset of Countries (alternative selections, disabled)
# countries = ['Morocco', 'Mali', 'Senegal', 'Sierra Leone', 'Ghana',
#             'Nigeria', 'Central African Republic', 'South Sudan',
#             'Uganda', 'Kenya', 'Ethiopia', 'Angola', 'Botswana',
#             'Zambia', 'South Africa']

# countries = ['Mali', 'Ghana', 'Nigeria', 'Kenya', 'Ethiopia', 'Botswana',
#             'Zambia', 'South Africa']

# All countries: the distinct values of GHS2015.country, taken from the
# categories of a categorical copy of the column
countries = GHS2015.country.astype('category').cat.categories.tolist()

# Cell output: the resulting list
countries

In [None]:
## Descriptives and Zipfs Plots for all countries

# Keep only rows with PopTot below 5 million
data1 = GHS2000[GHS2000.PopTot < 5*10**6]
data2 = GHS2015[GHS2015.PopTot < 5*10**6]

# all countires
# df gets one column per country. Rows, in order:
#   0 count 2000, 1 count 2015, 2 median 2000, 3 median 2015,
#   4 gini 2000,  5 gini 2015,  6 slope 2000,  7 p value 2000,
#   8 slope 2015, 9 p value 2015

df = pd.DataFrame()


for country in countries: 
    
    test1 = data1[data1['country'] == country]
    test2 = data2[data2['country'] == country]
    
    # counts
    num2000= test1.PopTot.count()
    num2015= test2.PopTot.count()
    
    #Median
    m2000= test1.PopTot.median()
    m2015= test2.PopTot.median()
    
    #gini
 
    g2000 = gini(test1.PopTot)
    g2015 = gini(test2.PopTot)

    #zipf
    X1_zipf = np.sort(test1.PopTot) #sort the values
    Y1_zipf = list(range(1, len(X1_zipf)+1)) # make a range
    Y1_zipf = Y1_zipf[::-1] # Re order range

    X1_zipf_log = np.log(X1_zipf)
    Y1_zipf_log = np.log(Y1_zipf)

    X2_zipf = np.sort(test2.PopTot)
    Y2_zipf = list(range(1, len(X2_zipf)+1))
    Y2_zipf = Y2_zipf[::-1]

    X2_zipf_log = np.log(X2_zipf)
    Y2_zipf_log = np.log(Y2_zipf)
    
    # Linear fit of log(rank) on log(size)
    fit1 = ss.linregress(X1_zipf_log, Y1_zipf_log)
    fit2 = ss.linregress(X2_zipf_log, Y2_zipf_log)

    #Fit
    # Fields are read by name: in the linregress result, index 4 is the
    # standard error of the slope and the p value is index 3.

    s2000 = fit1.slope # Slope 2000
    p2000 = fit1.pvalue # p val 2000
    
    s2015 = fit2.slope # Slope 2015
    p2015 = fit2.pvalue # p val 2015
    
    # Make dataframe
    df[country] = (num2000, num2015, m2000, m2015, g2000, g2015, s2000, p2000, s2015, p2015)

    # Zipf's law

    ## Size & number
    sns.set(font_scale=3)
    fig, ax1 = plt.subplots(nrows=1, ncols=1,  figsize=(10, 10))

    # Plot
    plt.scatter(X1_zipf_log, Y1_zipf_log , marker='.', color='purple', s=100, alpha = 0.7)
    plt.scatter(X2_zipf_log, Y2_zipf_log , marker='.', color='orange', s=100, alpha = 0.7)

    # Legend
    leg = ['GHS-Pop 2000', 'GHS-Pop 2015']
    plt.legend(leg,loc=1, markerscale=3, facecolor = 'white', edgecolor = 'white')

    # Fit Lines
    # x and y are passed by keyword: newer seaborn no longer accepts them positionally.
    sns.regplot(x=X1_zipf_log, y=Y1_zipf_log, color = 'purple')
    sns.regplot(x=X2_zipf_log, y=Y2_zipf_log, color = 'orange')

    # Title
    plt.title(country)

    # Labels
    plt.xlabel('')
    plt.ylabel('')

    # Set Ticks (axes hold natural logs; ticks are labelled with powers of ten)
    plt.xticks([np.log(10**4), np.log(10**5), np.log(10**6), np.log(10**7)], 
               ['$10^4$', '$10^5$', '$10^6$', '$10^7$'])

    plt.yticks([np.log(10), np.log(100), np.log(1000), np.log(10000),  np.log(100000)], 
               ['10', '$10^2$', '$10^3$', '$10^4$', '$10^5$'])

    # Set Background Color
    ax1.set_facecolor('White')

    axes = plt.gca()
    axes.set_xlim([7,16])
    #axes.set_xlim([min(X2_zipf_log), max(X2_zipf_log)])
    axes.set_ylim([1,8])

    ## Save Zipfs
    fig.savefig(country+'_Zipf.png', dpi=300)
    

In [None]:
# Cell output: display the current contents of df
df

In [None]:
# df.to_csv(data_temp+'ALLCOUNTRIES_TABLE2_ERL20190301.csv')

In [None]:
# Lorenz for All Countries
# For each country, plots the 2000 and 2015 Lorenz curves and saves the
# figure as '<country>_Lorenz.png' in the working directory.

# Keep only rows with PopTot below 5 million
data1 = GHS2000[GHS2000.PopTot < 5*10**6]
data2 = GHS2015[GHS2015.PopTot < 5*10**6]

for country in countries: 
    
    test1 = data1[data1['country'] == country]
    test2 = data2[data2['country'] == country]

    X = np.sort(np.array(test1.PopTot))
    Y = np.sort(np.array(test2.PopTot))
    
    # Data for curve one: cumulative share of the total, origin prepended
    X_lorenz = X.cumsum() / X.sum()
    X_lorenz = np.insert(X_lorenz, 0, 0)

    # Data for curve two 
    Y_lorenz = Y.cumsum() / Y.sum()
    Y_lorenz = np.insert(Y_lorenz, 0, 0)
    
    # Lorenz Curve Plot

    ## Size & number
    sns.set(font_scale=3)
    fig, ax1 = plt.subplots(nrows=1, ncols=1,  figsize=(10, 10))


    # Curve Plots
    # Alpha for Botswana and S. Leone is 0.7, else 0.5
    ax1.scatter(np.arange(X_lorenz.size)/(X_lorenz.size-1), X_lorenz, 
               marker='.', color='Green', s=100, alpha = 0.7)

    ax1.scatter(np.arange(Y_lorenz.size)/(Y_lorenz.size-1), Y_lorenz, 
               marker='.', color='#ff01bc', s=100, alpha = 0.7)

    # Title
    plt.title(country)


    # Legend (labels are matched to the two scatters in drawing order)
    leg = ['GHS-Pop 2000', 'GHS-Pop 2015']
    plt.legend(leg,loc=2, markerscale=3)

    # Set Color
    # The on/off flag is passed positionally: its keyword was renamed from `b`
    # to `visible` in newer Matplotlib, so the positional form works on both.
    plt.grid(True, color = 'k', alpha = 0.5, marker = '.')

    # plt.setp(ax1.xaxis.get_gridlines(), clip_path = [0,1])
    # plt.setp(ax1.yaxis.get_gridlines(), clip_path = [[0,0], [1,1]])
    # Set the frame on this figure; assigning to `plt.figure.frameon` only put
    # an attribute on the pyplot function and had no effect.
    fig.set_frameon(True)
    ax1.set_facecolor('White')


    # The file name was previously a broken string literal (syntax error).
    fig.savefig(country+'_Lorenz.png', dpi=300)

In [None]:
# Descriptives
# For each country print, one value per line: the country name, the number
# of PopTot values (2000, 2015), the median PopTot (2000, 2015) and the
# gini coefficient (2000, 2015). Only rows with PopTot below 5 million count.

for country in countries:

    print(country)

    test1 = GHS2000[(GHS2000['country'] == country) & (GHS2000.PopTot < 5*10**6)]
    test2 = GHS2015[(GHS2015['country'] == country) & (GHS2015.PopTot < 5*10**6)]

    # counts first, then medians, each as 2000 followed by 2015
    for stat in ('count', 'median'):
        for frame in (test1, test2):
            print(getattr(frame.PopTot, stat)())

    # gini coef.
    for frame in (test1, test2):
        print(gini(frame.PopTot))

In [None]:
# Zipf's law fit per country: linear regression of log(rank) on log(size).
# NOTE(review): unlike the other per-country cells, no 5 million cutoff is
# applied to PopTot here - confirm this is intended.
for country in countries: 
    data1 = np.sort(np.array(GHS2000[GHS2000['country'] == country].PopTot))
    data2 = np.sort(np.array(GHS2015[GHS2015['country'] == country].PopTot))
    
    # Order the data for Zipf's law
    X1_zipf = data1 #sort the values
    Y1_zipf = list(range(1, len(X1_zipf)+1)) # make a range
    Y1_zipf = Y1_zipf[::-1] # Re order range

    X1_zipf_log = np.log(X1_zipf)
    Y1_zipf_log = np.log(Y1_zipf)

    X2_zipf = data2
    Y2_zipf = list(range(1, len(X2_zipf)+1))
    Y2_zipf = Y2_zipf[::-1]

    X2_zipf_log = np.log(X2_zipf)
    Y2_zipf_log = np.log(Y2_zipf)
    
    fit1 = ss.linregress(X1_zipf_log, Y1_zipf_log)
    fit2 = ss.linregress(X2_zipf_log, Y2_zipf_log)

    # Fields are read by name: in the linregress result, index 2 is the
    # correlation coefficient r (so R2 is its square) and index 4 is the
    # standard error of the slope (the p value is index 3).
    print(country)
    print(fit1.slope) # Slope 2000
    print(fit1.rvalue**2) # R2 2000
    print(fit1.pvalue) # p val 2000
    print(fit2.slope) # Slope 2015
    print(fit2.rvalue**2) # R2 2015
    print(fit2.pvalue) # p val 2015

In [None]:
# Rows of the 2015 layer with PopTot of 5 million or more
over_limit = GHS2015.PopTot >= 5*10**6
bigcity = GHS2015[over_limit]

# Rows of the 2000 layer for Tanzania
in_tanzania = GHS2000['country'] == 'Tanzania'
city = GHS2000[in_tanzania]

# Cell output: the Tanzania rows, largest PopTot first
city.sort_values('PopTot', ascending = False)

In [None]:
# All Africa
# Print, one value per line: the number of PopTot values (2000, 2015), the
# median PopTot (2000, 2015) and the gini coefficient (2000, 2015).
# No population cutoff is applied here.

test1 = GHS2000
test2 = GHS2015

# counts first, then medians, each as 2000 followed by 2015
for stat in ('count', 'median'):
    for frame in (test1, test2):
        print(getattr(frame.PopTot, stat)())

# gini coef.
for frame in (test1, test2):
    print(gini(frame.PopTot))

In [None]:
# All Africa 
# Zipf's law fit over every row: linear regression of log(rank) on log(size).

data1 = np.sort(np.array(GHS2000.PopTot))
data2 = np.sort(np.array(GHS2015.PopTot))

# Order the data for Zipf's law
X1_zipf = data1 #sort the values
Y1_zipf = list(range(1, len(X1_zipf)+1)) # make a range
Y1_zipf = Y1_zipf[::-1] # Re order range

X1_zipf_log = np.log(X1_zipf)
Y1_zipf_log = np.log(Y1_zipf)

X2_zipf = data2
Y2_zipf = list(range(1, len(X2_zipf)+1))
Y2_zipf = Y2_zipf[::-1]

X2_zipf_log = np.log(X2_zipf)
Y2_zipf_log = np.log(Y2_zipf)

fit1 = ss.linregress(X1_zipf_log, Y1_zipf_log)
fit2 = ss.linregress(X2_zipf_log, Y2_zipf_log)

# Fields are read by name: in the linregress result, index 2 is the
# correlation coefficient r (so R2 is its square) and index 4 is the
# standard error of the slope (the p value is index 3).
print('Africa')
print(fit1.slope) # Slope 2000
print(fit1.rvalue**2) # R2 2000
print(fit1.pvalue) # p val 2000
print(fit2.slope) # Slope 2015
print(fit2.rvalue**2) # R2 2015
print(fit2.pvalue) # p val 2015

In [None]:
# Cell output: display the most recently assigned fit1 regression result
fit1

In [None]:
# 2015 rows in the 'Arid' rain zone with PopTot below 5 million
is_arid = GHS2015['rain_zone'] == 'Arid'
under_limit = GHS2015.PopTot < 5*10**6
rain = GHS2015[is_arid & under_limit]

# Cell output: gini coefficient of PopTot for those rows
gini(rain.PopTot)