In [2]:
# Useful library imports
import pandas as pd
import geopandas as gpd
import folium
import os
import zipfile
from shapely import *
import numpy as np

# functions to process PeopleGroups datasets
from features import *

<hr style="border: 5px solid #003262;" />
<hr style="border: 1px solid #fdb515;" />

# Load Data

def validate_ppg(ppg_areas_gdf, country, adm1_name):
    """
    Determines if people group's has accurate population counts in a country.
    1: GeoDataFrame
    2: String
    3: String
    """
    
    # arbitrarily change Coordinate Reference System (CRS) to be the same (4326)
    ppg_areas_gdf = ppg_areas_gdf.to_crs(4326) 
    
    # LOADING DATA -- find data for a country and load shapes
    ppg_df = find_ppg_data(country)
    shapes = gpd.read_file('https://github.com/andrewjoc/ihs/raw/main/people_groups_verification/data/cgaz.topojson').set_crs("EPSG:4326")
    populations = pd.read_csv('https://github.com/andrewjoc/ihs/raw/main/people_groups_verification/data/subnationalPopulation.csv')
    
    # MERGE -- merge people points with people areas
    full_people_df = ppg_df.merge(ppg_areas_gdf, how='left', left_on='People Group', right_on='NmDisp')
    
    # MERGE -- merge full df with subnational population data
    # DO LATER!!!
    
   
    boundaries = []
    for people_polygon in ppg_gdf.geometry:
        # true false series
        overlapping_polygons = shapes.geometry.overlaps(people_polygon)
        
        # from stack overflow - select series indices based on True values
        indices = overlapping_polygons[overlapping_polygons].index.values
        
        # select the names of the boundaries
        all_boundaries_found = shapes.iloc[indices].shapeName.tolist()
        
        if len(all_boundaries_found) == 0:
            boundaries.append('NONE')
        else:
            # stored in a string to make it look nice in the DataFrame
            boundaries_string = ', '.join(all_boundaries_found)
            boundaries.append(boundaries_string)
            
    ppg_gdf[f'{adm1_name}s Present'] = boundaries
    # filter out the NONE values
    
    
    return ppg_gdf[ppg_gdf[f'{adm1_name}s Present'] != 'NONE']

cell below takes a minute or 2 to load

In [2]:
boundaries = gpd.read_file('https://github.com/andrewjoc/ihs/raw/main/people_groups_verification/data/cgaz.topojson')

populations = pd.read_csv('https://github.com/andrewjoc/ihs/raw/main/people_groups_verification/data/subnationalPopulation.csv')

people_points = pd.read_excel('https://github.com/andrewjoc/ihs/raw/main/people_groups_verification/data/People_Groups.xlsx')

people_areas = gpd.read_file('data/people_areas.geojson')

In [3]:
validate_ppg(people_areas, country="Indonesia", adm1_name="Regency")

TypeError: (<class 'geopandas.geoseries.GeoSeries'>, <class 'NoneType'>)

<br>

<hr style="border: 5px solid #003262;" />
<hr style="border: 1px solid #fdb515;" />

# Validation: Southeast Asia

## Countries 
* Brunei
* Cambodia
* Indonesia
* Laos
* Malaysia
* Myanmar
* Philippines
* Singapore
* Thailand
* Timor-Leste
* Vietnam

In [None]:
# arbitrarily change Coordinate Reference System (CRS) to be the same (4326)
people_areas = people_areas.to_crs(4326) 
    
# LOADING DATA -- find data for a country and load shapes
ppg_df = find_ppg_data("Indonesia")
shapes = gpd.read_file('https://github.com/andrewjoc/ihs/raw/main/people_groups_verification/data/cgaz.topojson').set_crs("EPSG:4326")
populations = pd.read_csv('https://github.com/andrewjoc/ihs/raw/main/people_groups_verification/data/subnationalPopulation.csv')    
    
    # MERGE -- merge people points with people areas
full_people_df = ppg_df.merge(people_areas, how='left', left_on='People Group', right_on='NmDisp')
full_people_df = gpd.GeoDataFrame(full_people_df, crs=4326)
full_people_df.dropna(subset=['geometry'], inplace=True)
    
# MERGE -- merge full df with subnational population data
# DO LATER!!!

boundaries = []
for people_polygon in full_people_df.geometry:
    # true false series
    overlapping_polygons = shapes.geometry.overlaps(people_polygon)
        
    # from stack overflow - select series indices based on True values
    indices = overlapping_polygons[overlapping_polygons].index.values
        
    # select the names of the boundaries
    all_boundaries_found = shapes.iloc[indices].shapeName.tolist()
        
    if len(all_boundaries_found) == 0:
        boundaries.append('NONE')
    else:
        # stored in a string to make it look nice in the DataFrame
        boundaries_string = ','.join(all_boundaries_found)
        boundaries.append(boundaries_string)

        
full_people_df[f'ADM1 overlap'] = boundaries
    # filter out the NONE values

NameError: name 'people_areas' is not defined

In [49]:
shapes.head()

Unnamed: 0,id,ISO_CODE,shapeName,LEVEL,Level,shapeID,shapeGroup,shapeType,geometry
0,,,,ADM1,,AFG-ADM1-8247537B41623266,AFG,ADM1,"POLYGON ((71.19492 36.03916, 71.17004 36.02439..."
1,,,,ADM1,,AFG-ADM1-8247537B78184625,AFG,ADM1,"POLYGON ((64.70638 35.21291, 64.72793 35.18542..."
2,,,,ADM1,,AFG-ADM1-8247537B74530288,AFG,ADM1,"POLYGON ((69.21919 36.30379, 69.21190 36.28759..."
3,,,,ADM1,,AFG-ADM1-8247537B30575277,AFG,ADM1,"POLYGON ((68.18924 36.56585, 68.18498 36.54288..."
4,,,,ADM1,,AFG-ADM1-8247537B29636249,AFG,ADM1,"POLYGON ((68.08847 35.46823, 68.08538 35.45314..."


In [50]:
import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML documents

wikiurl = 'https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes'
table_class="wikitable sortable jquery-tablesorter"
response=requests.get(wikiurl)

soup = BeautifulSoup(response.text, 'html.parser')
indiatable = soup.find('table',{'class':"wikitable"})

iso_country_codes = pd.read_html(str(indiatable))

# convert list to dataframe
iso_country_codes = pd.DataFrame(iso_country_codes[0]).droplevel(0, axis=1).rename({'Country name[5]':'Country', 'Alpha-3 code[5]':'Alpha 3 code'}, axis=1)

iso_country_codes = iso_country_codes[['Country', 'Alpha 3 code']]

In [59]:
shapes.merge(iso_country_codes, left_on='shapeGroup', right_on='Alpha 3 code', how='left').drop(['LEVEL', 'Alpha 3 code'], axis=1).dropna(subset='Country').drop('geometry', axis=1)

Unnamed: 0,id,ISO_CODE,shapeName,Level,shapeID,shapeGroup,shapeType,Country
0,,,,,AFG-ADM1-8247537B41623266,AFG,ADM1,Afghanistan
1,,,,,AFG-ADM1-8247537B78184625,AFG,ADM1,Afghanistan
2,,,,,AFG-ADM1-8247537B74530288,AFG,ADM1,Afghanistan
3,,,,,AFG-ADM1-8247537B30575277,AFG,ADM1,Afghanistan
4,,,,,AFG-ADM1-8247537B29636249,AFG,ADM1,Afghanistan
...,...,...,...,...,...,...,...,...
3204,,,Fct,,NGA-ADM1-2878026B56994386,NGA,ADM1,Nigeria
3205,,,Bayelsa,,NGA-ADM1-2878026B52218734,NGA,ADM1,Nigeria
3206,,,Enugu,,NGA-ADM1-2878026B43454599,NGA,ADM1,Nigeria
3207,,,Lagos,,NGA-ADM1-2878026B82395638,NGA,ADM1,Nigeria
