In [42]:
# Useful library imports
import pandas as pd
import geopandas as gpd
import folium
import os
import zipfile
from shapely import *
import numpy as np
import fiona


# functions to process PeopleGroups datasets
from features import *

<hr style="border: 5px solid #003262;" />
<hr style="border: 1px solid #fdb515;" />

# **Note**
# Please ignore all of the code below; it is still a work in progress.

<hr style="border: 5px solid #003262;" />
<hr style="border: 1px solid #fdb515;" />

# Revising Method

In [None]:
# Load the Morocco people-groups workbook and give its columns the names that
# the convert_to_geodataframe function expects.
morocco_ppg = pd.read_excel('./morocco_data/morocco-ppg.xlsx').rename(
    columns={
        'IMB_Affinity': 'IMB Affinity Group',
        'Affinity Bloc': 'Affinity Bloc',
        'People_Group': 'People Group',
        'People_Name': 'People Name',
    }
)

In [None]:
# Display the renamed people-groups table.
morocco_ppg

In [None]:
# NOTE(review): `morocco_ppg_areas` is not assigned anywhere in the visible part of
# this notebook. Unless it is provided by `from features import *`, this cell raises
# NameError on a fresh kernel -- confirm where it is meant to be loaded.
morocco_ppg_areas[morocco_ppg_areas['Ctry'] == 'Morocco']

In [None]:
# Left join: every row of morocco_ppg is kept; the area columns are matched by
# comparing 'People Group' against the areas table's 'NmDisp' column.
moroccan_ppgs = morocco_ppg.merge(morocco_ppg_areas, how='left', left_on='People Group', right_on='NmDisp')
moroccan_ppgs

In [None]:
# Read the geoBoundaries MAR ADM1 shapefile into a GeoDataFrame and preview its first rows.
shapes = gpd.read_file('./morocco_data/geodata/geoBoundaries-MAR-ADM1.shp')
shapes.head()

def adm1(ppg_df, shapes, adm1_name):
    '''
    Label each people group with the first ADM1 subdivision its geometry overlaps.

    Parameter(s): DataFrame (People groups data frame with a 'geometry' column.
    Should be the cleaned result of the find_ppg_data function.)
    Parameter(s): GeoDataFrame (GeoDataFrame from adm1 shapefile; must have
    'geometry' and 'shapeName' columns)
    Parameter(s): String (Name of the administrative level 1 subdivision name - e.g. Province)
    Returns: the same ppg_df object, with a new column named adm1_name added in place.

    Note: People groups whose geometry overlaps no administrative region have their
    ADM1 subdivision encoded as "MISSING". These groups need to have their
    information manually filled in.

    NOTE(review): `overlaps` is False when one geometry lies entirely inside the
    other, so a people group fully contained in a single subdivision is also
    reported as "MISSING". Confirm whether `intersects` is the intended predicate.
    '''
    region_geometries = shapes['geometry']
    region_names = shapes['shapeName']

    matches = []
    for people_geometry in ppg_df['geometry']:
        # Boolean Series aligned with `shapes`: True where the subdivision overlaps this group
        overlap_mask = region_geometries.overlaps(people_geometry)
        # Select names through the mask itself, so this works for any index on
        # `shapes` (no mixing of index labels with positional lookups)
        overlapping_names = region_names[overlap_mask]

        if len(overlapping_names) == 0:
            # Keep the sentinel as a value; it must never be used as a row lookup
            matches.append('MISSING')
        else:
            matches.append(overlapping_names.iloc[0])

    ppg_df[adm1_name] = matches
    print(f"Please check the generated {adm1_name} column for missing values, encoded as 'MISSING'")
    print(f"There are {(ppg_df[adm1_name] == 'MISSING').sum()} unassigned people groups.")
    return ppg_df

# The return value is discarded; adm1 adds the "Region" column to moroccan_ppgs in place.
# NOTE(review): the markdown further down records different CRSs for the people-group
# geometries (3857) and `shapes` (4326); if that already holds here, the overlap test
# compares coordinates in different reference systems -- confirm.
adm1(moroccan_ppgs, shapes, "Region")

In [None]:
# Wrap the merged table in a GeoDataFrame, using its 'geometry' column as the active geometry.
morocco_gdf = gpd.GeoDataFrame(moroccan_ppgs, geometry='geometry')

In [None]:
# Interactive map of the people-group geometries.
morocco_gdf.explore()

In [None]:
# Geometry of the first row whose 'People Group' is "Saharawi".
t = morocco_gdf.loc[morocco_gdf['People Group'] == "Saharawi", 'geometry'].iloc[0]

In [None]:
# Inspect which geometry class that row holds.
type(t)

In [None]:
# Trailing semicolons suppress the rich display; these two lines have no effect.
morocco_gdf;
shapes;

`morocco_gdf CRS`: 3857  
`shapes CRS`: 4326

In [None]:
# change crs of morocco_gdf to 4326 so it matches the CRS noted for `shapes`

g = morocco_gdf.to_crs(4326)

In [None]:
# NOTE: `t` is rebound here -- earlier it held the Saharawi geometry, now it holds
# the object returned by shapes.explore (the boundaries drawn in red).
t = shapes.explore(
    color='red'
)

In [None]:
# Draw the reprojected people groups onto the existing map `t`.
g.explore(
    m=t
)

In [None]:
# Geometry of the row at position 2 (presumably the Tachelhit people group -- verify).
tachelhit = g.iloc[2]['geometry']

In [None]:
# Boolean Series: which boundaries overlap that geometry.
shapes.geometry.overlaps(tachelhit)

In [None]:
# NOTE(review): hard-coded row positions, presumably copied from the True rows of the
# overlaps output above -- they go stale if the data or row order changes.
shapes.iloc[[3,4,5,7,11]]

In [None]:
# Geometry of the row at position 1; not used again in the visible part of the notebook.
riffis = g.iloc[1]['geometry']

In [None]:
g

In [None]:
#  FIRST WORKING TEST
# Consolidates the exploratory cells above into a single cell.

# Trailing semicolons suppress display; these two lines have no effect.
morocco_gdf;
shapes;
# morocco_gdf CRS: 3857
# shapes CRS: 4326

# Reproject the people groups to EPSG:4326 so both layers share a CRS.
g = morocco_gdf.to_crs(4326)
# do a test of the tachelhit people group (row position 2)
tachelhit = g.iloc[2]['geometry']
# shapes where tachelhit overlaps the geometry column (boolean Series)
k =  shapes.geometry.overlaps(tachelhit)
# select those boundaries
# NOTE(review): the selection step was never written; `k` is not used afterwards in
# the visible part of the notebook.


In [None]:
# Check the CRS of the un-reprojected GeoDataFrame.
morocco_gdf.crs

In [None]:
def find_all_adm1(ppg_gdf, shapes, adm1_name):
    """
    List, for every people group, all ADM1 subdivisions its geometry overlaps.

    Parameter(s): GeoDataFrame (people groups; one geometry per row)
    Parameter(s): GeoDataFrame (ADM1 boundaries; must have a 'shapeName' column)
    Parameter(s): String (singular name of the subdivision level, e.g. 'Region';
    the new column is named '<adm1_name>s Present')
    Returns: a reprojected copy of ppg_gdf with the new column holding a
    comma-separated string of subdivision names. Rows that overlap no
    subdivision are dropped. The inputs are not modified.

    NOTE(review): `overlaps` is False when one geometry lies entirely inside the
    other, so a people group fully contained in one subdivision is dropped as
    'NONE'. Confirm whether `intersects` is the intended predicate.
    """
    # arbitrarily chose Coordinate Reference System (CRS) to be 4326;
    # to_crs returns new objects, so the caller's frames are left untouched
    ppg_gdf = ppg_gdf.to_crs(4326)
    shapes = shapes.to_crs(4326)
    column_name = f'{adm1_name}s Present'
    boundaries = []

    for people_polygon in ppg_gdf.geometry:
        # Boolean Series aligned with `shapes`
        overlap_mask = shapes.geometry.overlaps(people_polygon)

        # Select by the mask (label-aligned) rather than feeding index labels to
        # the positional `.iloc`, which breaks on any non-default index
        all_boundaries_found = shapes.loc[overlap_mask, 'shapeName'].tolist()

        if all_boundaries_found:
            # stored in a string to make it look nice in the DataFrame
            boundaries.append(', '.join(all_boundaries_found))
        else:
            boundaries.append('NONE')

    ppg_gdf[column_name] = boundaries
    # filter out the NONE values
    return ppg_gdf[ppg_gdf[column_name] != 'NONE']

In [None]:
# Regions overlapped by each Moroccan people group; groups with no overlap are dropped
# by find_all_adm1. Adds a 'Regions Present' column to the returned frame.
y = find_all_adm1(morocco_gdf, shapes, 'Region')
y

---

In [2]:
import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML documents
from io import StringIO # file-like wrapper for the table HTML handed to pd.read_html

wikiurl = 'https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes'
table_class="wikitable sortable jquery-tablesorter"
# A timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status stops here on an HTTP error instead of parsing an error page.
response=requests.get(wikiurl, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')
# first table on the page carrying the "wikitable" class
indiatable = soup.find('table',{'class':"wikitable"})
if indiatable is None:
    raise ValueError(f"No table with class 'wikitable' found at {wikiurl}")

# pd.read_html returns a list of DataFrames; passing literal HTML as a plain
# string is deprecated, so wrap it in StringIO
iso_country_codes = pd.read_html(StringIO(str(indiatable)))

# convert list to dataframe: drop the top header level and normalise the two columns we need
# NOTE(review): the '[5]' footnote markers are part of the scraped header text and
# will change if the article is edited; the column selection below then raises KeyError.
iso_country_codes = pd.DataFrame(iso_country_codes[0]).droplevel(0, axis=1).rename({'Country name[5]':'Country', 'Alpha-3 code[5]':'Alpha 3 code'}, axis=1)

iso_country_codes = iso_country_codes[['Country', 'Alpha 3 code']]

In [None]:
iso_country_codes

In [None]:
cgaz = gpd.read_file('./cgaz/geoBoundariesCGAZ_ADM1.shp', encoding='utf-8')

cgaz_named = cgaz.merge(iso_country_codes, left_on='shapeGroup', right_on='Alpha 3 code', how='left').drop(['LEVEL', 'Alpha 3 code'], axis=1).drop([3209, 3210], axis=0)

In [None]:
cgaz_named.to_csv('cgaz_named.csv')

---

<hr style="border: 5px solid #003262;" />
<hr style="border: 1px solid #fdb515;" />

# Connecting to MongoDB Cluster

In [5]:
from pymongo import MongoClient

In [6]:
# function for reading collections from mongodb
def read_database(db_name, collection_name):
    """
    Load every document of one MongoDB collection into a pandas DataFrame.

    db_name: name of the database inside the cluster.
    collection_name: name of the collection to read from that database.

    Relies on a module-level `client` (an already established connection to
    the cluster) being defined before the function is called.
    """
    # cluster -> database -> collection -> cursor over all documents
    documents = client.get_database(db_name)[collection_name].find()

    # materialise the cursor so pandas can build the frame in one go
    return pd.DataFrame(list(documents))

In [11]:
# Credentials must never be stored in the notebook. Supply the full connection
# string (mongodb+srv://<user>:<password>@<host>/<database>) through the
# MONGODB_URI environment variable instead.
# NOTE(review): the password that used to be written in this cell has been
# committed in plain text and should be rotated.
mongodb_uri = os.environ.get('MONGODB_URI')
if not mongodb_uri:
    raise RuntimeError('Set the MONGODB_URI environment variable to the MongoDB connection string.')

# establish a connection
client = MongoClient(mongodb_uri)


# read_database uses the module-level `client` created above
subnational_boundaries = read_database('dataValidation', 'subnationalBoundaries')
subnational_population = read_database('dataValidation', 'subnationalPopulation')
people_points = read_database('peopleGroups', 'peoplePoints')
people_areas = read_database('peopleGroups', 'peopleAreas')

In [13]:
# Rebinds `people_areas` (previously read from MongoDB) to the local './people_areas.gdb' dataset.
people_areas = gpd.read_file('./people_areas.gdb')

Unnamed: 0,PEID,Name,NmDisp,NmAlt,GENC0,Ctry,Regn,RegnSub,ROG,AffCd,...,Photo,LvlBible,YrPub,Conf,Contrib,Src,areaSqKm,SHAPE_Length,SHAPE_Area,geometry
0,42895,Adamawa Fulani,Adamawa Fulani,"Fellata, Bagirmi Fulani",CMR,Cameroon,2,17,CM,AG300,...,Y,1,2018.0,2,Jim Courson,ESRI admin / WLMS 10 (fub),51979.61,1939687.0,53389100000.0,"MULTIPOLYGON (((1475282.684 830493.826, 148321..."
1,41289,Balmiki (Awadhi),Balmiki (Awadhi),"Balmiki, Bhangi, Chuhra, Chura, Chuhre, Valmik...",IND,India,142,34,IN,AG500,...,N,1,2005.0,2,Jim Courson,Spatial Join PEID to gadm36_2,3865.275,494250.4,4970038000.0,"MULTIPOLYGON (((8686675.887 3222330.931, 86866..."
2,6802,"Han Chinese, Mandarin",Han Chinese (Mandarin),,CHN,China,142,30,CH,AG650,...,Y,1,2018.0,3,Jim Courson,ESRI admin / WLMS 10 (cmn) / China Census 2010,4160287.0,56842380.0,6793303000000.0,"MULTIPOLYGON (((13267899.829 4121726.660, 1326..."
3,43366,Canadian,Canadians,,CAN,Canada,19,21,CA,AG100,...,Y,1,2020.0,2,Jim Courson,GADM adm2 PopDensity Dissolve Simplify 500 m,1338429.0,73979680.0,3121534000000.0,"MULTIPOLYGON (((-13595040.591 6305199.857, -13..."
4,17324,Argentine,Argentines,,ARG,Argentina,19,5,AR,AG800,...,Y,1,2017.0,3,Jim Courson,ESRI admin - Argentina SimPoly250m,2778565.0,24749930.0,4310571000000.0,"MULTIPOLYGON (((-7279640.212 -5557825.548, -72..."


In [17]:
# Download the geoBoundaries CGAZ ADM1 GeoJSON straight from GitHub.
# `topo` is not used again in the visible part of the notebook.
topo = gpd.read_file('https://github.com/wmgeolab/geoBoundaries/raw/main/releaseData/CGAZ/geoBoundariesCGAZ_ADM1.geojson')

In [20]:
# NOTE: rebinds `g`, which earlier held the reprojected Morocco GeoDataFrame.
# NOTE(review): this URL has no 'data/' path segment, unlike the cgaz.topojson URL
# used in the "Loading data remotely" section -- confirm which location is current.
g = gpd.read_file('https://github.com/andrewjoc/ihs/raw/main/people_groups_verification/cgaz.topojson')

In [21]:
g.head()

Unnamed: 0,id,ISO_CODE,shapeName,LEVEL,Level,shapeID,shapeGroup,shapeType,geometry
0,,,,ADM1,,AFG-ADM1-8247537B41623266,AFG,ADM1,"POLYGON ((71.19492 36.03916, 71.17004 36.02439..."
1,,,,ADM1,,AFG-ADM1-8247537B78184625,AFG,ADM1,"POLYGON ((64.70638 35.21291, 64.72793 35.18542..."
2,,,,ADM1,,AFG-ADM1-8247537B74530288,AFG,ADM1,"POLYGON ((69.21919 36.30379, 69.21190 36.28759..."
3,,,,ADM1,,AFG-ADM1-8247537B30575277,AFG,ADM1,"POLYGON ((68.18924 36.56585, 68.18498 36.54288..."
4,,,,ADM1,,AFG-ADM1-8247537B29636249,AFG,ADM1,"POLYGON ((68.08847 35.46823, 68.08538 35.45314..."


In [25]:
# Keep only the boundaries whose shapeGroup code is 'WSM' (presumably Samoa -- verify).
sam = g[g['shapeGroup'] == 'WSM']

In [26]:
# Interactive map of that subset.
sam.explore()

<hr style="border: 5px solid #003262;" />
<hr style="border: 1px solid #fdb515;" />

# Loading data remotely

In [32]:
# The three reads below rebind names that were first filled from MongoDB,
# this time loading the data from files hosted on GitHub.
subnational_boundaries = gpd.read_file('https://github.com/andrewjoc/ihs/raw/main/people_groups_verification/data/cgaz.topojson')

In [34]:
subnational_population = pd.read_csv('https://github.com/andrewjoc/ihs/raw/main/people_groups_verification/data/subnationalPopulation.csv')

In [39]:
people_points = pd.read_excel('https://github.com/andrewjoc/ihs/raw/main/people_groups_verification/data/People_Groups.xlsx')

### people_areas comes from ArcGIS and is ~130 MB, which is too large for GitHub.
### Git LFS was attempted but did not appear to work, so interested users must download the data themselves from https://go-imb.opendata.arcgis.com/datasets/imb::apg-people-group-areas/explore?location=43.103723%2C70.948650%2C4.70

In [46]:
# Read the manually downloaded people-areas dataset from the local 'data/' directory.
# NOTE(review): a directory path is passed rather than a file; presumably it holds
# the downloaded ArcGIS dataset -- confirm the expected contents.
people_areas = gpd.read_file('data/')

In [47]:
people_areas

Unnamed: 0,PEID,Name,NmDisp,NmAlt,GENC0,Ctry,Regn,RegnSub,ROG,AffCd,...,Photo,LvlBible,YrPub,Conf,Contrib,Src,areaSqKm,SHAPE_Length,SHAPE_Area,geometry
0,42895,Adamawa Fulani,Adamawa Fulani,"Fellata, Bagirmi Fulani",CMR,Cameroon,002,017,CM,AG300,...,Y,1,2018.0,2,Jim Courson,ESRI admin / WLMS 10 (fub),5.197961e+04,1.939687e+06,5.338910e+10,"MULTIPOLYGON (((1475282.684 830493.826, 148321..."
1,41289,Balmiki (Awadhi),Balmiki (Awadhi),"Balmiki, Bhangi, Chuhra, Chura, Chuhre, Valmik...",IND,India,142,034,IN,AG500,...,N,1,2005.0,2,Jim Courson,Spatial Join PEID to gadm36_2,3.865275e+03,4.942504e+05,4.970038e+09,"MULTIPOLYGON (((8686675.887 3222330.931, 86866..."
2,6802,"Han Chinese, Mandarin",Han Chinese (Mandarin),,CHN,China,142,030,CH,AG650,...,Y,1,2018.0,3,Jim Courson,ESRI admin / WLMS 10 (cmn) / China Census 2010,4.160287e+06,5.684238e+07,6.793303e+12,"MULTIPOLYGON (((13267899.829 4121726.660, 1326..."
3,43366,Canadian,Canadians,,CAN,Canada,019,021,CA,AG100,...,Y,1,2020.0,2,Jim Courson,GADM adm2 PopDensity Dissolve Simplify 500 m,1.338429e+06,7.397968e+07,3.121534e+12,"MULTIPOLYGON (((-13595040.591 6305199.857, -13..."
4,17324,Argentine,Argentines,,ARG,Argentina,019,005,AR,AG800,...,Y,1,2017.0,3,Jim Courson,ESRI admin - Argentina SimPoly250m,2.778565e+06,2.474993e+07,4.310571e+12,"MULTIPOLYGON (((-7279640.212 -5557825.548, -72..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12011,50310,Part-Indian,Part-Indian,,PAN,Panama,019,013,PM,AG800,...,Y,1,2017.0,2,Jim Courson,aPrvn5m - Colon / Panama / Embera-Wounaan,1.767374e+04,2.284774e+06,1.822423e+10,"MULTIPOLYGON (((-8895808.824 1045747.010, -889..."
12012,50309,American,Americans,,PAN,Panama,019,013,PM,AG800,...,N,1,2020.0,2,Jim Courson,aPrvn5m - Colon / Panama / Panama Oeste,1.599098e+04,2.355586e+06,1.650267e+10,"MULTIPOLYGON (((-8895808.824 1045747.010, -889..."
12013,101,Dane,Danes,,FRO,Faroe Islands,150,154,FO,AG100,...,Y,1,2015.0,2,Jim Courson,Freehand / ESRI admin - Torshavn area,1.944459e+01,4.832500e+04,-8.800084e+07,"MULTIPOLYGON (((-757420.116 8871467.102, -7579..."
12014,15470,French,French,Metropolitan,REU,Reunion,002,014,RE,AG100,...,Y,1,2020.0,2,Jim Courson,ESRI admin - Reunion,2.513119e+03,2.290045e+05,2.903034e+09,"MULTIPOLYGON (((6173060.568 -2376729.560, 6173..."
