# NoteBook to Make Descriptives

This notebook is for looking at final population zonal stats data

In [1]:
from rasterstats import zonal_stats
import rasterio
import geopandas as gpd
import operator
import numpy as np
import pandas as pd
import matplotlib 
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from functools import reduce

# Functions

In [17]:
def city_search(gpd_df, city_list, country):
    """ function will print out city name and population for a subset of a gridded dataset gpd
    requires data frame, list of cities, and country of interest
    """
    gpd_df_sub = gpd_df[gpd_df['country'] == country]
    
    for index, row in gpd_df_sub.iterrows():
        for city in city_list:
            if row['city'] == city:
                print(city)
                print(round(row['PopTot']))

In [37]:
senegal = WP2015[WP2015['country'] == 'Senegal']
senegal.tail(50)

Unnamed: 0,osm_id,FID,country,city,osm_type,lat,lon,PopTot,aez_class,geometry
3073,249744563,12386,Senegal,Matam,town,15.656563,-13.255916,23691.03,Tropic - warm / semiarid,"POLYGON ((-13.242084 15.64125, -13.242084 15.6..."
3074,281177216,12076,Senegal,Pout,town,14.771693,-17.059435,17461.22,Tropic - warm / semiarid,"POLYGON ((-17.050417 14.757917, -17.050417 14...."
3075,281182572,12286,Senegal,Kébémer,town,15.368873,-16.442978,17117.18,Tropic - warm / semiarid,"POLYGON ((-16.467084 15.382917, -16.458751 15...."
3076,281184145,12213,Senegal,Meckhe,town,15.112955,-16.631121,24617.33,Tropic - warm / semiarid,"POLYGON ((-16.625417 15.09125, -16.625417 15.0..."
3077,281184756,12152,Senegal,Tivaouane,town,14.951507,-16.812868,56074.3,Tropic - warm / semiarid,"POLYGON ((-16.825417 14.91625, -16.825417 14.9..."
3078,288544251,12205,Senegal,Richard-Toll,town,16.466294,-15.688449,63569.83,Tropic - warm / arid,"POLYGON ((-15.725417 16.499584, -15.708751 16...."
3079,292135098,12294,Senegal,Linguère,town,15.396092,-15.115351,12376.68,Tropic - warm / semiarid,"POLYGON ((-15.142084 15.407917, -15.133751 15...."
3080,292939503,12063,Senegal,Bambey,town,14.69548,-16.449262,128105.5,Tropic - warm / semiarid,"POLYGON ((-16.442084 14.66625, -16.442084 14.6..."
3081,292948744,11831,Senegal,Guinguinéo,town,14.272179,-15.945573,13957.83,Tropic - warm / semiarid,"POLYGON ((-15.933751 14.257917, -15.933751 14...."
3082,293437170,12083,Senegal,Mbacké,town,14.797122,-15.906661,847819.3,Tropic - warm / semiarid,"POLYGON ((-15.850417 14.682917, -15.850417 14...."


In [None]:
def str_id(gpd_df):
    """
    Function makes a new col with a unique lat-lon string to identify each osm point & drops duplicates
    Function also makes
    """

    
    print(gpd_df.shape)
    
    lat_string = gpd_df.lat.astype(str)
    lon_string = gpd_df.lon.astype(str)
    gpd_df['str_id'] = lat_string.astype(str)+lon_string.astype(str)
    
    print(gpd_df.shape)


In [14]:
def dup_drop(gpd_in, col, keep_dup):
    """ 
    function drops duplicates based on a column from a pd data frame
    requires pd df out string, pd df, col name, and which dup to keep
    returns new gpd_df
    """
    
    gpd_out = gpd.GeoDataFrame()
    
    print(gpd_in.shape)
    
    gpd_out = gpd_in.drop_duplicates(col, keep = keep_dup)
    
    print(gpd_out.shape)

    return gpd_out

    

In [None]:
def df_merge(df_left, df_right):
    
    pd_out = pd.DataFrame()
    pd_out = pd.merge(df_left[['str_id', 'PopTot']], df_right[['str_id', 'PopTot']], on='str_id', how = 'inner')
    
    left_pop = pd_out.PopTot_x.astype(str)
    right_pop = pd_out.PopTot_y.astype(str)
    pd_out['pop_id'] = left_pop.astype(str)+right_pop.astype(str)

    return pd_out

In [None]:
def p_X_gt_x(data, X=None):
    n_data = len(data)
    if X is None:
        X = data.unique()
    return X, pd.Series([sum(data>=x)/n_data for x in X ])

def p_X_lt_x(data, X=None):
    n_data = len(data)
    if X is None:
        X = data.unique()
    return X, pd.Series([sum(data<=x)/n_data for x in X ])

# Data

In [3]:
# File paths

data_raw = '/Users/cascade/Github/NTL/data/raw/'
data_temp = '/Users/cascade/Github/NTL/temp_data/'
data_interim = '/Users/cascade/Github/NTL/data/interim/'
ms_data = '/Users/cascade/Github/NTL/temp_data/MS_Data/'
erl_data = '/Users/cascade/Github/NTL/temp_data/ERL_data/'
downloads = '/Users/cascade/Downloads/'

In [4]:
# Data

GHS2000 = gpd.read_file(erl_data+'GHS_POP_GPW42000_final20190122.shp')
GHS2015 = gpd.read_file(erl_data+'GHS_POP_GPW42015_final20190122.shp')
WP2000 = gpd.read_file(erl_data+'AFR_PPP_2000_adj_v2_final20190122.shp')
WP2015 = gpd.read_file(erl_data+'AFR_PPP_2015_adj_v2_final20190122.shp')
LS2015 = gpd.read_file(erl_data+'WPE_1KM_2016_final20190122.shp')
WPE2016 = gpd.read_file(erl_data+'LS15_final20190122.shp')

In [None]:
# List of datasets

datasets_in = [GHS2000, GHS2015, WP2000, WP2015, LS2015, WPE2016]

# Group Data

In [15]:
# Drop Doubles based on OSM Lat/Lon

GHS2000_drop = dup_drop(GHS2000, 'osm_id', 'first')
WP2000_drop = dup_drop(WP2000, 'osm_id', 'first')

GHS2015_drop = dup_drop(GHS2015, 'osm_id', 'first')
WP2015_drop = dup_drop(WP2015 , 'osm_id', 'first')
LS2015_drop = dup_drop(LS2015 , 'osm_id', 'first')
WPE2016_drop = dup_drop(WPE2016 , 'osm_id', 'first')

(5854, 10)
(4293, 10)
(2061, 10)
(1714, 10)
(6229, 10)
(4530, 10)
(2411, 10)
(1990, 10)
(5598, 10)
(4087, 10)
(5552, 10)
(4090, 10)


In [None]:
GHS2015_pop = GHS2015_drop[['str_id','PopTot']]
WP2015_pop = WP2015_drop[['str_id','PopTot']]
LS2015_pop = LS2015_drop[['str_id','PopTot']]
WPE2016_pop = WPE2016_drop[['str_id','PopTot']]

data_frames = [GHS2015_pop, WP2015_pop, LS2015_pop, WPE2016_pop]

df2015_merged = reduce(lambda  left,right: pd.merge(left,right,on=['str_id'], how='inner'), data_frames)

print(len(df2015_merged))
df2015_merged.head(4)



In [None]:
df2015_merged.columns = ['str_id', 'GHS15_Pop', 'WP15_Pop', 'LS15_Pop', 'WPE2016_pop' ]
df2015_merged.head()

In [None]:
df_2015_GHS_WP = df2015_merged[['WPE2016_pop','LS15_Pop']]
print(df_2015_GHS_WP.shape)


df_2015_GHS_WP = df_2015_GHS_WP.drop_duplicates(keep='first')
print(df_2015_GHS_WP.shape)


In [None]:
stats.pearsonr(df_2015_GHS_WP['WPE2016_pop'], df_2015_GHS_WP['LS15_Pop'])

In [None]:
df_2015_GHS_WP.dtypes

In [None]:
# Pairwise scatter plots

fig = plt.figure()
ax = plt.gca()
ax.scatter(df_2015_GHS_WP['WPE2016_pop'], df_2015_GHS_WP['LS15_Pop'] , c='blue', alpha=0.05, edgecolors='none')
ax.set_yscale('log')
ax.set_xscale('log')
plt.xlabel('ESRI - WPE 2016')
plt.ylabel('LandScan 2015')

#fig.savefig('/Users/cascade/Desktop/'+'WPE16-LS15.png', dpi=700)

In [None]:
#axl = pd.scatter_matrix(df2015_merged, alpha = 0.3, figsize = (14,8), diagonal = 'kde')

import seaborn as sns
corr = df2015_merged.corr()
sns.heatmap(corr, 
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values)

In [None]:
import matplotlib.pyplot as plt

#matplotlib.pyplot.scatter(df_merge['PopTot_x'], df_merge['PopTot_y'])

fig = plt.figure()
ax = plt.gca()
ax.scatter(test_df_drop['PopTot_x'], test_df_drop['PopTot_y'] , c='blue', alpha=0.05, edgecolors='none')
ax.set_yscale('log')
ax.set_xscale('log')
plt.xlabel('WPE 2016')
plt.ylabel('World Pop 2015')

In [None]:
fig = plt.figure()
ax = plt.gca()
ax.scatter(test_df_drop['x'], test_df_drop['y'], c='blue', alpha=0.1, edgecolors='none')
ax.set_yscale('log')
ax.set_xscale('log')
plt.xlabel('WPE 2016')
plt.ylabel('World Pop 2015')

In [None]:
from scipy import stats
xA = df_merge['PopTot_x']
yA = df_merge['PopTot_y']

result = stats.pearsonr(xA, yA) # return is (Pearson’s correlation coefficient, 2-tailed p-value)
result

In [None]:
from scipy import stats
xR = test_df_drop['x']
yR = test_df_drop['y']

result = stats.pearsonr(xR, yR) # return is (Pearson’s correlation coefficient, 2-tailed p-value)
result

In [None]:
test_df_min = test_df_drop[test_df_drop['x']<=1000000]
print(len(test_df_min))
test_df_min = test_df_drop[test_df_drop['y']<=1000000]
print(len(test_df_min))

In [None]:
test_df_min[test_df_min['y']>=1000000]

In [None]:
xW = test_df_min['x']
yW = test_df_min['y']

result = stats.pearsonr(xW, yW) # return is (Pearson’s correlation coefficient, 2-tailed p-value)
result

In [None]:
fig = plt.figure()
ax = plt.gca()
ax.scatter(test_df_min['x'], test_df_min['y'], c='blue', alpha=0.1, edgecolors='none')
ax.set_yscale('log')
ax.set_xscale('log')
plt.xlabel('WPE 2016')
plt.ylabel('World Pop 2015')

# Distrubutions

In [None]:
# WP 2000 & 2015 Bar Plots Chunk by City Size

# drop FID
print(len(WP2015))
WP2015 = WP2015.drop_duplicates('FID', keep = 'first')
print(len(WP2015))

# drop <5000
WP2015 = WP2015[WP2015['PopTot'] > 5000]
print(len(WP2015))

In [None]:
WP2015_Nigeria = WP2015[WP2015['country']=='Nigeria']
WP2000_Nigeria = WP2000[WP2000['country']=='Nigeria']

print(len(WP2015_Nigeria))
print(len((WP2000_Nigeria)))

In [None]:
# 2015 Chunks
WP2015_Nigeria_50k = WP2015_Nigeria.loc[(WP2015_Nigeria['PopTot'] <= 5*10**4), 'PopTot'].sum()
print(WP2015_Nigeria_50k)

WP2015_Nigeria_100k = WP2015_Nigeria.loc[(WP2015_Nigeria['PopTot'] > 5*10**4) & (WP2015_Nigeria['PopTot'] <= 10**5), 'PopTot'].sum()
print(WP2015_Nigeria_100k)

WP2015_Nigeria_250k = WP2015_Nigeria.loc[(WP2015_Nigeria['PopTot'] > 10**5) & (WP2015_Nigeria['PopTot'] <= 2.5*10**5), 'PopTot'].sum()
print(WP2015_Nigeria_250k)

WP2015_Nigeria_500k = WP2015_Nigeria.loc[(WP2015_Nigeria['PopTot'] > 2.5*10**5) & (WP2015_Nigeria['PopTot'] <= 5*10**5), 'PopTot'].sum()
print(WP2015_Nigeria_500k)

WP2015_Nigeria_1m = WP2015_Nigeria.loc[(WP2015_Nigeria['PopTot'] > 5*10**5) & (WP2015_Nigeria['PopTot'] <= 10**6), 'PopTot'].sum()
print(WP2015_Nigeria_1m)

WP2015_Nigeria_5m = WP2015_Nigeria.loc[(WP2015_Nigeria['PopTot'] > 10**6) & (WP2015_Nigeria['PopTot'] <= 5*10**6), 'PopTot'].sum()
print(WP2015_Nigeria_5m)

WP2015_Nigeria_5mplus = WP2015_Nigeria.loc[(WP2015_Nigeria['PopTot'] > 5*10**6) & (WP2015_Nigeria['PopTot'] <= 10*10**6), 'PopTot'].sum()
print(WP2015_Nigeria_5mplus)

WP2000_Nigeria_50k = WP2000_Nigeria.loc[(WP2000_Nigeria['PopTot'] <= 5*10**4), 'PopTot'].sum()
print(WP2000_Nigeria_50k)

WP2000_Nigeria_100k = WP2000_Nigeria.loc[(WP2000_Nigeria['PopTot'] > 5*10**4) & (WP2000_Nigeria['PopTot'] <= 10**5), 'PopTot'].sum()
print(WP2000_Nigeria_100k)

WP2000_Nigeria_250k = WP2000_Nigeria.loc[(WP2000_Nigeria['PopTot'] > 10**5) & (WP2000_Nigeria['PopTot'] <= 2.5*10**5), 'PopTot'].sum()
print(WP2000_Nigeria_250k)

WP2000_Nigeria_500k = WP2000_Nigeria.loc[(WP2000_Nigeria['PopTot'] > 2.5*10**5) & (WP2000_Nigeria['PopTot'] <= 5*10**5), 'PopTot'].sum()
print(WP2000_Nigeria_500k)

WP2000_Nigeria_1m = WP2000_Nigeria.loc[(WP2000_Nigeria['PopTot'] > 5*10**5) & (WP2000_Nigeria['PopTot'] <= 10**6), 'PopTot'].sum()
print(WP2000_Nigeria_1m)

WP2000_Nigeria_5m = WP2000_Nigeria.loc[(WP2000_Nigeria['PopTot'] > 10**6) & (WP2000_Nigeria['PopTot'] <= 5*10**6), 'PopTot'].sum()
print(WP2000_Nigeria_5m)

WP2000_Nigeria_5mplus = WP2000_Nigeria.loc[(WP2000_Nigeria['PopTot'] > 5*10**6), 'PopTot'].sum()
print(WP2000_Nigeria_5mplus)

In [None]:
# 2015 Chunks
WP2015_50k = WP2015.loc[(WP2015['PopTot'] <= 5*10**4), 'PopTot'].sum()
print(WP2015_50k)

WP2015_100k = WP2015.loc[(WP2015['PopTot'] > 5*10**4) & (WP2015['PopTot'] <= 10**5), 'PopTot'].sum()
print(WP2015_100k)

WP2015_250k = WP2015.loc[(WP2015['PopTot'] > 10**5) & (WP2015['PopTot'] <= 2.5*10**5), 'PopTot'].sum()
print(WP2015_250k)

WP2015_500k = WP2015.loc[(WP2015['PopTot'] > 2.5*10**5) & (WP2015['PopTot'] <= 5*10**5), 'PopTot'].sum()
print(WP2015_500k)

WP2015_1m = WP2015.loc[(WP2015['PopTot'] > 5*10**5) & (WP2015['PopTot'] <= 10**6), 'PopTot'].sum()
print(WP2015_1m)

WP2015_5m = WP2015.loc[(WP2015['PopTot'] > 10**6) & (WP2015['PopTot'] <= 5*10**6), 'PopTot'].sum()
print(WP2015_5m)

WP2015_5mplus = WP2015.loc[(WP2015['PopTot'] > 5*10**6) & (WP2015['PopTot'] <= 10*10**6), 'PopTot'].sum()
print(WP2015_5mplus)

In [None]:
32201707.237106323 +25449623.476852417 +46911857.434947185 +34533143.84853259 +44698553.905130506 +112615952.7138133 + 168529234.33390665 

In [None]:
# 2000 Chunks
WP2000_50k = WP2000.loc[(WP2000['PopTot'] <= 5*10**4), 'PopTot'].sum()
print(WP2000_50k)

WP2000_100k = WP2000.loc[(WP2000['PopTot'] > 5*10**4) & (WP2000['PopTot'] <= 10**5), 'PopTot'].sum()
print(WP2000_100k)

WP2000_250k = WP2000.loc[(WP2000['PopTot'] > 10**5) & (WP2000['PopTot'] <= 2.5*10**5), 'PopTot'].sum()
print(WP2000_250k)

WP2000_500k = WP2000.loc[(WP2000['PopTot'] > 2.5*10**5) & (WP2000['PopTot'] <= 5*10**5), 'PopTot'].sum()
print(WP2000_500k)

WP2000_1m = WP2000.loc[(WP2000['PopTot'] > 5*10**5) & (WP2000['PopTot'] <= 10**6), 'PopTot'].sum()
print(WP2000_1m)

WP2000_5m = WP2000.loc[(WP2000['PopTot'] > 10**6) & (WP2000['PopTot'] <= 5*10**6), 'PopTot'].sum()
print(WP2000_5m)

WP2000_5mplus = WP2000.loc[(WP2000['PopTot'] > 5*10**6) (WP2000['PopTot'] <= 10*10**6), 'PopTot'].sum()
print(WP2000_5mplus)

In [None]:
# import plotly.plotly as py
# import plotly.tools as tls

import matplotlib.pyplot as plt

dictionary = plt.figure()

bar_leg = ['WorldPop 2015', 'WorldPop 2000']


WP2015_bar = {'<50K':WP2015_50k, '50-100K': WP2015_100k, '100-150K':WP2015_250k,
    '250-500K' : WP2015_500k, '500K-1m' : WP2015_1m, '1-5M' : WP2015_5m, '5-10M' : WP2015_5mplus
             }

WP2000_bar = {u'Label0':WP2000_50k, u'Label1': WP2000_100k, u'Label2':WP2000_250k,
    u'Label3' : WP2000_500k, u'Label4' : WP2000_1m, u'Label5' : WP2000_5m, u'Label6' : WP2000_5mplus
    }

# make plot
sns.set(font_scale=2.2)

plt.bar(range(len(WP2000_bar)), WP2000_bar.values(), align='center', alpha  = 0.5, color = 'Blue')
plt.bar(range(len(WP2015_bar)), WP2015_bar.values(), align='center', alpha = 0.5, color = 'Red')
plt.legend(bar_leg,loc=2)



#plt.bar(range(len(D)), D.values(), align='center')

plt.xticks(range(len(WP2015_bar)), WP2015_bar.keys())

fig = matplotlib.pyplot.gcf()
fig.set_size_inches(18.5, 10.5)
plt.gca().set_yscale('log')
fig.savefig('/Users/cascade/Desktop/'+'WP152000_bar.png', dpi=700)

In [None]:
# import plotly.plotly as py
# import plotly.tools as tls

import matplotlib.pyplot as plt

dictionary = plt.figure()

bar_leg = ['WorldPop 2015', 'WorldPop 2000']


WP2015_Nigeria_bar = {'<50K':WP2015_Nigeria_50k, '50-100K': WP2015_Nigeria_100k, '100-150K':WP2015_Nigeria_250k,
    '250-500K' : WP2015_Nigeria_500k, '500K-1m' : WP2015_Nigeria_1m, '1-5M' : WP2015_Nigeria_5m, '>5M' : WP2015_Nigeria_5mplus
             }

WP2000_Nigeria_bar = {u'Label0':WP2000_Nigeria_50k, u'Label1': WP2000_Nigeria_100k, u'Label2':WP2000_Nigeria_250k,
    u'Label3' : WP2000_Nigeria_500k, u'Label4' : WP2000_Nigeria_1m, u'Label5' : WP2000_Nigeria_5m, u'Label6' : WP2000_Nigeria_5mplus
    }

# make plot
sns.set(font_scale=2.2)

plt.bar(range(len(WP2000_Nigeria_bar)), WP2000_Nigeria_bar.values(), align='center', alpha  = 0.5, color = 'Purple')
plt.bar(range(len(WP2015_Nigeria_bar)), WP2015_Nigeria_bar.values(), align='center', alpha = 0.5, color = 'Orange')
plt.legend(bar_leg,loc=2)



#plt.bar(range(len(D)), D.values(), align='center')

plt.xticks(range(len(WP2015_Nigeria_bar)), WP2015_Nigeria_bar.keys())

fig = matplotlib.pyplot.gcf()
fig.set_size_inches(18.5, 10.5)
plt.gca().set_yscale('log')
fig.savefig('/Users/cascade/Desktop/'+'WP152000_Nigeria_bar.png', dpi=700)

In [9]:
# WP 2000 & 2000 Bar Plots Chunk by City Size

# drop FID
print(len(WP2000))
WP2000 = WP2000.drop_duplicates('FID', keep = 'first')
print(len(WP2000))

# drop <5000
WP2000 = WP2000[WP2000['PopTot'] > 5000]
print(len(WP2000))

3152
2259
2061


In [10]:
WP2000_aez = WP2000[WP2000['aez_class'] != '0']
WP2000_aez = WP2000_aez[WP2000_aez['aez_class'] != 'NoClass']

# ax = sns.boxplot(x = 'PopTot', y = 'aez_class', data = WP2000_aez)
# ax.set(xscale="log")

# fig = matplotlib.pyplot.gcf()
# fig.set_size_inches(18.5, 10.55)
# plt.xlabel('Population')
# plt.ylabel('')
# plt.title('WP2000 Distribution by AEZ')

# # fig.savefig('test2png.png', dpi=100)

In [12]:
round(WP2000_aez.groupby('aez_class')['PopTot'].count())

aez_class
Subtropic - cool / arid         18
Subtropic - cool / semiarid     96
Subtropic - cool / subhumid     33
Subtropic - warm / arid        142
Subtropic - warm / semiarid    125
Subtropic - warm / subhumid    119
Tropic - cool / arid             9
Tropic - cool / humid           55
Tropic - cool / semiarid       109
Tropic - cool / subhumid       154
Tropic - warm / arid           120
Tropic - warm / humid          212
Tropic - warm / semiarid       390
Tropic - warm / subhumid       475
Name: PopTot, dtype: int64

In [None]:
WP2000_aez = WP2000[WP2000['aez_class'] != '0']
WP2000_aez = WP2000_aez[WP2000_aez['aez_class'] != 'NoClass']

# ax = sns.boxplot(x = 'PopTot', y = 'aez_class', data = WP2000_aez)
# ax.set(xscale="log")

# fig = matplotlib.pyplot.gcf()
# fig.set_size_inches(18.5, 10.55)
# plt.xlabel('Population')
# plt.ylabel('')
# plt.title('WP2000 Distribution by AEZ')

#fig.savefig('/Users/cascade/Desktop/'+'WP2000_aez.png', dpi=700)


In [None]:
round(WP2015_aez.groupby('aez_class')['PopTot'].mean())

In [None]:
# add col for concat, drop FID duplicates

datasets_string = ['GHS2000', 'GHS2015', 'WP2000', 'WP2015', 'LS2015', 'WPE2016']

# for i, dataset in enumerate(datasets_in):
    
#     dataset['dataset'] = datasets_string[i]
#     dataset = dup_drop(dataset, 'FID', 'first')

WPE2016['dataset'] = 'WPE2016'
WPE2016 = dup_drop(WPE2016, 'FID', 'first')

In [None]:
# drop cities with less than 5000 
GHS2015 = GHS2015[GHS2015['PopTot']>=5000] 
WP2015 = WP2015[WP2015['PopTot']>=5000] 
LS2015 = LS2015[LS2015['PopTot']>=5000] 
WPE2016 = WPE2016[WPE2016['PopTot']>=5000] 


datasets2015 = [GHS2015, WP2015, LS2015, WPE2016]


datasets15_concat = pd.concat(datasets2015, ignore_index=True)

# datasets_concat = pd.concat(datasets_in, ignore_index=True)

In [None]:
datasets15_concat.shape

In [None]:
min(WPE2016['PopTot'])

In [None]:
sns.set(font_scale=3)
ax = sns.boxplot(x = 'PopTot', y = 'dataset', data = datasets15_concat)
ax.set(xscale="log") 
plt.xlabel('Population')
plt.ylabel('')

fig = matplotlib.pyplot.gcf()
fig.set_size_inches(18.5, 10.5)

#fig.savefig('/Users/cascade/Desktop/'+'data15-hist.png', dpi=700)

In [None]:
datasets2015 = [GHS2015, WP2015, LS2015, WPE2016]


GHS2015_sort = GHS2015['PopTot'].sort_values()
X0, y0 = p_X_gt_x(GHS2015_sort)

WP2015_sort = WP2015['PopTot'].sort_values()
X1, y1 = p_X_gt_x(WP2015_sort)

LS2015_sort = LS2015['PopTot'].sort_values()
X2, y2 = p_X_gt_x(LS2015_sort )

WPE2016_sort = WPE2016['PopTot'].sort_values()
X3, y3 = p_X_gt_x(WPE2016_sort)


In [None]:
fig = plt.figure()
ax = plt.subplot()
ax.plot(X0, y0, label ='GHS2015')
ax.plot(X1, y1, label ='WP2015')
ax.plot(X2, y2, label ='LS2015')
ax.plot(X3, y3, label ='WPE2016')
plt.xscale('log')
plt.title('Inverse cumulative distribution of city sizes by dataset')
plt.xlabel('x(Population)')
plt.ylabel('P[X>=x]')
ax.legend()
fig.set_size_inches(18.5, 10.5)


In [None]:
#GHS2000.loc[(GHS2000['PopTot'] > 5000) & (GHS2000['PopTot'] < 10000), 'PopTot'].sum()
test = GHS2015[GHS2015['PopTot'] <5000000]

a =test.loc[(test['PopTot'] > 1000000), 'PopTot'].sum()
                                         
                                          
#GHS2000.loc[GHS2000['PopTot'] <50000 & GHS2000['PopTot'] >5000, 'PopTot'].sum()

In [None]:
import numpy as np
import matplotlib.pyplot as pp
val = 0. # this is the value where you want the data to appear on the y-axis.
 # just as an example array
pp.plot(a, 'x')
pp.show()

In [None]:
WPE2016.head()

a = GHS2015[GHS2015['PopTot']>0]
a = GHS2015[GHS2015['PopTot']<1000000]
b = GHS2000[GHS2000['PopTot']>0]
b = GHS2000[GHS2000['PopTot']<1000000]


plt.hist(np.log10(b['PopTot']), alpha=0.5, label='WP2000')
plt.hist(np.log10(a['PopTot']), alpha=0.5, label='WP2015')
#plt.hist(np.log10(LS2015['PopTot']), alpha=0.5, label='x')
#plt.hist(np.log10(a['PopTot']), alpha=0.5, label='x')



plt.legend(loc='upper right')
# plt.show()


In [None]:
min(GHS2015['PopTot'])

# Old Code

In [None]:
# df_test_a = WP2015[['osm_id', 'PopTot']]


# df_test_b = GHS2015[['osm_id', 'PopTot']]
# print(df_test_b.shape)
# print(df_test_a.shape)

# df_test_a_drop = df_test_a.drop_duplicates('osm_id', keep=False)
# df_test_b_drop = df_test_b.drop_duplicates('osm_id', keep=False)
# print(df_test_b_drop.shape)
# print(df_test_a_drop.shape)

In [None]:
# find duplicates

# dupsA = pd.concat(g for _, g in test_df.groupby("str_id") if len(g) > 1)
# dupsA

In [None]:
# x = df_merge['PopTot_x']
# y = df_merge['PopTot_y']
# str_id_merge = df_merge['str_id']
# test_df = pd.DataFrame()
# test_df['x'] = x
# test_df['y'] = y
# test_df['str_id_merge'] = str_id_merge



# test_df['x_string'] = test_df.x.astype(str)
# test_df['y_string'] = test_df.y.astype(str)
# test_df['str_id'] = test_df.x_string.astype(str)+test_df.y_string.astype(str)

In [None]:
#axl = pd.scatter_matrix(df2015_merged, alpha = 0.3, figsize = (14,8), diagonal = 'kde')

# import seaborn as sns
# corr = df2015_merged.corr()
# sns.heatmap(corr, 
#             xticklabels=corr.columns.values,
#             yticklabels=corr.columns.values)

In [None]:
# import matplotlib.pyplot as plt

# #matplotlib.pyplot.scatter(df_merge['PopTot_x'], df_merge['PopTot_y'])

# fig = plt.figure()
# ax = plt.gca()
# ax.scatter(test_df_drop['PopTot_x'], test_df_drop['PopTot_y'] , c='blue', alpha=0.05, edgecolors='none')
# ax.set_yscale('log')
# ax.set_xscale('log')
# plt.xlabel('WPE 2016')
# plt.ylabel('World Pop 2015')

In [None]:
# fig = plt.figure()
# ax = plt.gca()
# ax.scatter(test_df_drop['x'], test_df_drop['y'], c='blue', alpha=0.1, edgecolors='none')
# ax.set_yscale('log')
# ax.set_xscale('log')
# plt.xlabel('WPE 2016')
# plt.ylabel('World Pop 2015')

In [None]:
# from scipy import stats
# xA = df_merge['PopTot_x']
# yA = df_merge['PopTot_y']

# result = stats.pearsonr(xA, yA) # return is (Pearson’s correlation coefficient, 2-tailed p-value)
# result

In [None]:
# from scipy import stats
# xR = test_df_drop['x']
# yR = test_df_drop['y']

# result = stats.pearsonr(xR, yR) # return is (Pearson’s correlation coefficient, 2-tailed p-value)
# result

In [None]:
# test_df_min = test_df_drop[test_df_drop['x']<=1000000]
# print(len(test_df_min))
# test_df_min = test_df_drop[test_df_drop['y']<=1000000]
# print(len(test_df_min))

In [None]:
# test_df_min[test_df_min['y']>=1000000]

In [None]:
# xW = test_df_min['x']
# yW = test_df_min['y']

# result = stats.pearsonr(xW, yW) # return is (Pearson’s correlation coefficient, 2-tailed p-value)
# result

In [None]:
# fig = plt.figure()
# ax = plt.gca()
# ax.scatter(test_df_min['x'], test_df_min['y'], c='blue', alpha=0.1, edgecolors='none')
# ax.set_yscale('log')
# ax.set_xscale('log')
# plt.xlabel('WPE 2016')
# plt.ylabel('World Pop 2015')