# ERL Analysis 2019-02-22

Code for fixing the South Sudan and missing GHS data. South Sudan is added, with duplicate FIDs and settlements of <5000 people dropped.
FIDs may be duplicated for GHS 2000 & 2015 because of the addition of the South Sudan data.

-- Cascade Tuholske 2019-02-22

## Dependencies 

In [1]:
from rasterstats import zonal_stats
import rasterio
import geopandas as gpd
import operator
import numpy as np
import pandas as pd
import matplotlib 
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from functools import reduce
import squarify

## Functions

In [2]:
def dup_drop(gpd_in, col, keep_dup):
    """Drop rows that share a value in ``col`` and report the size change.

    The shape of the frame is printed before and after de-duplication so
    the number of removed rows shows up in the notebook output.

    Parameters
    ----------
    gpd_in : pandas.DataFrame or geopandas.GeoDataFrame
        Frame to de-duplicate. It is not modified.
    col : label or sequence of labels
        Column(s) whose values identify duplicates; forwarded to
        ``drop_duplicates`` as ``subset``.
    keep_dup : {'first', 'last', False}
        Which duplicate to keep; forwarded to ``drop_duplicates`` as
        ``keep``. ``False`` drops every row that has a duplicate.

    Returns
    -------
    The new frame returned by ``gpd_in.drop_duplicates``.
    """
    print(gpd_in.shape)

    # drop_duplicates already returns a new frame, so there is no need to
    # create an empty GeoDataFrame up front.
    gpd_out = gpd_in.drop_duplicates(subset=col, keep=keep_dup)

    print(gpd_out.shape)

    return gpd_out

## Data

In [3]:
# File Paths
# Directory prefixes; cells below concatenate them with file names.

# Relative paths (resolved against the current working directory)
data_raw = '../../data/raw/'
data_temp = '../../temp_data/'
data_interim = '../../interim/'
# Absolute paths under /Users/cascade/ -- machine-specific, edit before
# running elsewhere.
data_analysis = '/Users/cascade/Github/NTL/temp_data/ERL_data/Data20190222/'
downloads = '/Users/cascade/Downloads/'
desktop = '/Users/cascade/Desktop/'

In [None]:
# Load data: ...20190222.shp files have FIDs removed, S Sudan added, rainfall zones, and regions
# All four layers are read from the `data_analysis` directory.

# Layer from the 20190222 files described above
GHS2000 = gpd.read_file(data_analysis+'GHS_POP_GPW42000_20190222.shp', driver = 'ESRI Shapefile')
# Same two files that the commented-out "Old Code" section loads as
# GHS2000 and GHS2000_Sudan.
GHS2000_New = gpd.read_file(data_analysis+'GHS_POP_GPW42000_urbanmerge_PopTot.shp')
GHS2000_Sudan = gpd.read_file(data_analysis+'GHS2000_1500c300_S_Sudan_polyoverlapPopTot.shp')

# Earlier file (name ends in final20190122); subset for one country in a later cell
GHS2000_old = gpd.read_file(data_analysis+'GHS_POP_GPW42000_final20190122.shp')


In [None]:
# Collect the four layers loaded above so they can be looped over together
datasets = [GHS2000, GHS2000_New, GHS2000_Sudan, GHS2000_old]

In [None]:
# Print the number of rows in each layer, in the order listed in `datasets`
for dataset in datasets:
    print(len(dataset))

In [None]:
# Subset the older (final20190122) GHS 2000 layer to one country, apply the
# population and duplicate-FID filters, and list the rows largest-first.
# NOTE(review): the variable is named "Sierra" but the filter selects 'Egypt';
# confirm which country is intended.
Sierra_old = GHS2000_old[GHS2000_old.country == 'Egypt']
# Keep PopTot >= 5000 so that only settlements below 5000 are dropped. This
# matches the ">= 5000" threshold applied to the merged datasets in the
# "Old Code" section of this notebook (previously this line used "> 5000").
Sierra_old = Sierra_old[Sierra_old.PopTot >= 5000]
Sierra_old = dup_drop(Sierra_old, 'FID', 'first')

# Trailing expression: the sorted table is the cell's displayed output
Sierra_old.sort_values('PopTot', ascending = False)

In [None]:
# Same country subset as the previous cell, taken from the current GHS2000 layer.
# NOTE(review): the variable is named "Sierra" but the filter selects 'Egypt';
# confirm which country is intended.
Sierra = GHS2000[GHS2000.country == 'Egypt']
# Trailing expression: the table sorted by PopTot (descending) is the cell output
Sierra.sort_values('PopTot', ascending = False)

In [None]:
# Read the urban-merge polygon shapefile from the temp-data directory.
# `poly_file` is the path without extension; '.shp' is appended on read.
poly_file = 'ERL_data/GHS_POP_GPW42000_urbanmerge'
poly_gpd = gpd.read_file(data_temp+poly_file+'.shp')

In [None]:
# Preview the first rows of the polygon layer
poly_gpd.head()

In [None]:
# Declare the layer's CRS as EPSG:4326, then reproject it to ESRI:54009.
# NOTE(review): assigning .crs only relabels the coordinates (no reprojection);
# confirm the shapefile really is in EPSG:4326.
# NOTE(review): the {'init': ...} dict form is deprecated in newer
# pyproj/geopandas releases; confirm installed versions before changing it.
poly_gpd.crs = {'init': 'epsg:4326'}
poly_gpd = poly_gpd.to_crs({'init': 'esri:54009'})
# Preview the reprojected layer
poly_gpd.head()

In [None]:
# Write the reprojected polygons to a shapefile in the `desktop` directory
poly_gpd.to_file(desktop+'GHS_POP_GPW42000_urbanmerge_54009.shp', driver='ESRI Shapefile')

### Stats for Tables

In [None]:
# country = 'Mali'
# dataset1 = GHS2000
# dataset2 = GHS2015

# test = dataset1[dataset1['country'] == country]
# test = test[test.PopTot <5*10**6]

# print(test.PopTot.count())
# print(test.PopTot.median())
# print(test.PopTot.mean())
# print(test.PopTot.sum())

# test = dataset2[dataset2['country'] == country]
# test = test[test.PopTot <5*10**6]

# print(test.PopTot.count())
# print(test.PopTot.median())
# print(test.PopTot.mean())
# print(test.PopTot.sum())

### Chunk Data

### Chunk Data

### Plots

## Old Code

### Functions

In [None]:
# # Load Data

# #GHS - Note, using the merge files PopTot because I had to re-do the zonal stats 
# GHS2000 = gpd.read_file(data_analysis+'GHS_POP_GPW42000_urbanmerge_PopTot.shp')
# GHS2000_Sudan = gpd.read_file(data_analysis+'GHS2000_1500c300_S_Sudan_polyoverlapPopTot.shp')
# GHS2015 = gpd.read_file(data_analysis+'GHS_POP_GPW42015_urbanmerge_PopTot.shp')
# GHS2015_Sudan = gpd.read_file(data_analysis+'GHS2015_1500c300_S_Sudan_polyoverlapPopTot.shp')

# #WP
# WP2000 = gpd.read_file(data_analysis+'AFR_PPP_2000_adj_v2_final20190122.shp')
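# # NOTE(review): WP2000_Sudan below loads the 2015 S. Sudan file (same path as WP2015_Sudan) -- confirm whether a 2000 file was intended
# WP2000_Sudan = gpd.read_file(data_analysis+'AFR_PPP_2015_adj_v2_S_Sudan_1500c300_polyoverlapPopTot.shp')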
# WP2015 = gpd.read_file(data_analysis+'AFR_PPP_2015_adj_v2_final20190122.shp')
# WP2015_Sudan = gpd.read_file(data_analysis+'AFR_PPP_2015_adj_v2_S_Sudan_1500c300_polyoverlapPopTot.shp')


# #WPE
# WPE2016 = gpd.read_file(data_analysis+'WPE_1KM_2016_final20190122.shp')
# WPE2016_Sudan = gpd.read_file(data_analysis+'WPE_1KM_2016_Pop_Clip_S_Sudan_1500c300_polyoverlapPopTot.shp')

# #LS 
# LS2015 = gpd.read_file(data_analysis+'LS15_final20190122.shp')
# LS2015_Sudan = gpd.read_file(data_analysis+'LS15_w001001_Clip_S_Sudan_1500c300_polyoverlapPopTot.shp')

# print(len(GHS2000))
# print(len(GHS2000_Sudan))
# print(len(GHS2015))
# print(len(GHS2015_Sudan))
# print(len(WP2000))
# print(len(WP2000_Sudan))
# print(len(WP2015))
# print(len(WP2015_Sudan))
# print(len(WPE2016))
# print(len(WPE2016_Sudan))
# print(len(LS2015))
# print(len(LS2015_Sudan))


In [None]:
# # Drop FID duplicates ... recall that FIDs for GHS will not be unique for South Sudan Dataset and GHS All 

# GHS2000 = dup_drop(GHS2000, 'FID', 'first')
# GHS2000_Sudan = dup_drop(GHS2000_Sudan, 'FID', 'first')

# GHS2015 = dup_drop(GHS2015, 'FID', 'first')
# GHS2015_Sudan = dup_drop(GHS2015_Sudan, 'FID', 'first')

# WP2000 = dup_drop(WP2000, 'FID', 'first')
# WP2000_Sudan = dup_drop(WP2000_Sudan, 'FID', 'first')

# WP2015 = dup_drop(WP2015, 'FID', 'first')
# WP2015_Sudan = dup_drop(WP2015_Sudan, 'FID', 'first')

# WPE2016 = dup_drop(WPE2016, 'FID', 'first')
# WPE2016_Sudan = dup_drop(WPE2016_Sudan, 'FID', 'first')

# LS2015 = dup_drop(LS2015, 'FID', 'first')
# LS2015_Sudan = dup_drop(LS2015_Sudan, 'FID', 'first')

# print(len(GHS2000))
# print(len(GHS2000_Sudan))
# print(len(GHS2015))
# print(len(GHS2015_Sudan))
# print(len(WP2000))
# print(len(WP2000_Sudan))
# print(len(WP2015))
# print(len(WP2015_Sudan))
# print(len(WPE2016))
# print(len(WPE2016_Sudan))
# print(len(LS2015))
# print(len(LS2015_Sudan))

In [None]:
# Merge in South Sudan ... do each 
# print(len(GHS2015))
# print(len(GHS2015_Sudan))
# frames = [GHS2015, GHS2015_Sudan]

# GHS2015_merge = pd.concat(frames)
# print(len(GHS2015_merge))

In [None]:
# # Drop cities with < 5,000 people

# print(len(GHS2000_merge))
# GHS2000_merge = GHS2000_merge[GHS2000_merge.PopTot >= 5000]
# print(len(GHS2000_merge))

# print(len(GHS2015_merge))
# GHS2015_merge = GHS2015_merge[GHS2015_merge.PopTot >= 5000]
# print(len(GHS2015_merge))

# print(len(WP2000_merge))
# WP2000_merge = WP2000_merge[WP2000_merge.PopTot >= 5000]
# print(len(WP2000_merge))

# print(len(WP2015_merge))
# WP2015_merge = WP2015_merge[WP2015_merge.PopTot >= 5000]
# print(len(WP2015_merge))

# print(len(LS2015_merge))
# LS2015_merge = LS2015_merge[LS2015_merge.PopTot >= 5000]
# print(len(LS2015_merge))

# print(len(WPE2016_merge))
# WPE2016_merge = WPE2016_merge[WPE2016_merge.PopTot >= 5000]
# print(len(WPE2016_merge))

In [None]:
# Add dataset column 

# GHS2000_merge['dataset'] = 'GHS-Pop 2000'
# GHS2015_merge['dataset'] = 'GHS-Pop 2015'
# WP2000_merge['dataset'] = 'WorldPop 2000'
# WP2015_merge['dataset'] = 'WorldPop 2015'
# LS2015_merge['dataset'] = 'LandScan 2015'
# WPE2016_merge['dataset'] = 'WPE2016'

In [None]:
# datasets = [GHS2000_merge, GHS2015_merge, WP2000_merge, WP2015_merge, LS2015_merge, WPE2016_merge]

In [None]:
# Add Regions

### List of African Countries from the UN in OSM wiki

# Northern_Africa = (['Algeria', 'Egypt', 'Libya', 'Morocco', 'Tunisia', 'Western Sahara'], 'Northern_Africa')

# Eastern_Africa = ([
#     'Burundi',
#     'Comoros',
#     'Djibouti',
#     'Eritrea',
#     'Ethiopia',
#     'Kenya',
#     'Madagascar',
#     'Malawi',
#     'Mauritius',
#     #Mayotte,
#     'Mozambique',
#     'Réunion',
#     'Rwanda',
#     'Somalia',
#     'Sudan',
#     'South Sudan',
#     'Uganda',
#     'Tanzania',
#     'Zambia',
#     'Zimbabwe'], 'Eastern_Africa')
    
# Middle_Africa = ([
#     'Angola',
#     'Cameroon',
#     'Central African Republic',
#     'Chad',
#     'Congo-Brazzaville',
#     'Congo-Kinshasa',
#     'Equatorial Guinea',
#     'Gabon',
#     'Sao Tome and Principe'], 'Middle_Africa')
    
# Southern_Africa = ([
#     'Botswana',
#     'Lesotho',
#     'Namibia',
#     'South Africa',
#     'Swaziland'], 'Southern_Africa')
    
# Western_Africa = ([
#     'Benin',
#     'Burkina Faso',
#     'Cape Verde',
#     'Côte d\'Ivoire',
#     'Gambia',
#     'Ghana',
#     'Guinea',
#     'Guinea-Bissau',
#     'Liberia',
#     'Mali',
#     'Mauritania',
#     'Niger',
#     'Nigeria',
#     'Senegal',
#     'Sierra Leone',
#     'Togo'], 'Western_Africa')

In [None]:
# regions = [Northern_Africa, Western_Africa, Eastern_Africa, Southern_Africa, Middle_Africa]

In [None]:
# def region_col(gpd_df, regions_tuple, col_name_in, new_col):
#     """
#     Function searches a col of a data frame and matches it with a list of 
#     tuples of which [0] contains a list of values to be cross referenced
#     & then makes a new col with the tuple [1] ... For example, you have a col with countries
#     and you want to make a new col listing the region that country is a part of
#     """
#     arr = []
#     for i, row in gpd_df.iterrows():
#         for region in regions_tuple:
#             for country in region[0]:
#                 if row[col_name_in] == country:
#                     name = region[1]
#                     arr.append(name)
#                     break
#     gpd_df[new_col] = arr
#     return gpd_df

In [None]:
# for dataset in datasets:
#     dataset = region_col(dataset, regions, 'country', 'region')

In [None]:
# Add Rainfall Zone

# Group by rainfall zone

# arid = (['Temperate / arid', 
#          'Subtropic - warm / arid', 
#          'Subtropic - cool / arid', 
#          'Tropic - warm / arid',
#          'Tropic - cool / arid'], 'Arid')

# semi_arid = (['Temperate / Semi-arid', 
#               'Subtropic - warm / semiarid', 
#               'Subtropic - cool / semiarid',
#               'Tropic - warm / semiarid', 
#               'Tropic - cool / semiarid'], 'Semi-arid')    

# sub_humid = (['Temperate / sub-humid', 
#               'Subtropic - warm / subhumid', 
#               'Subtropic - cool / subhumid',
#               'Tropic - warm / subhumid', 
#               'Tropic - cool / subhumid'], 'Sub-humid')

# humid = (['Temperate / humid', 
#           'Subtropic - warm / humid', 
#           'Subtropic - cool / humid', 
#           'Tropic - warm / humid',
#           'Tropic - cool / humid'], 'Humid')

# boreal = (['Boreal'], 'Boreal')

# na = (['NoClass', '0'], 'NA')

# rain_list = [arid, semi_arid, sub_humid, humid, boreal, na]

In [None]:
# for dataset in datasets:
#     dataset = region_col(dataset, rain_list, 'aez_class', 'rain_zone')