# Obtain geographical data for cohorts

Here we obtain the boundary geometry and a representative point (longitude and latitude) for each cohort's administrative unit, and add them to the cohorts dataframe.

In [3]:
import pandas as pd
from pyprojroot import here
import geopandas as gpd
import iso3166
import functools

In [4]:
# load the cohorts table from the build directory, indexed by cohort identifier
df_cohorts = pd.read_csv(
    here() / "build" / "cohorts.csv",
    index_col="cohort_id",
)
df_cohorts.head()

Unnamed: 0_level_0,cohort_size,country,admin1_iso,admin1_name,admin2_name,taxon,year,quarter,cohort_label,sample_query
cohort_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AO-LUA_Luanda_colu_2009_Q2,77,Angola,AO-LUA,Luanda,Luanda,coluzzii,2009,2,Angola / Luanda / coluzzii / 2009 / Q2,cohort_admin2_quarter == 'AO-LUA_Luanda_colu_2...
BF-09_Houet_colu_2012_Q3,78,Burkina Faso,BF-09,Hauts-Bassins,Houet,coluzzii,2012,3,Burkina Faso / Houet / coluzzii / 2012 / Q3,cohort_admin2_quarter == 'BF-09_Houet_colu_201...
BF-09_Houet_colu_2014_Q3,32,Burkina Faso,BF-09,Hauts-Bassins,Houet,coluzzii,2014,3,Burkina Faso / Houet / coluzzii / 2014 / Q3,cohort_admin2_quarter == 'BF-09_Houet_colu_201...
BF-09_Houet_gamb_2012_Q3,73,Burkina Faso,BF-09,Hauts-Bassins,Houet,gambiae,2012,3,Burkina Faso / Houet / gambiae / 2012 / Q3,cohort_admin2_quarter == 'BF-09_Houet_gamb_201...
BF-09_Houet_gamb_2014_Q3,41,Burkina Faso,BF-09,Hauts-Bassins,Houet,gambiae,2014,3,Burkina Faso / Houet / gambiae / 2014 / Q3,cohort_admin2_quarter == 'BF-09_Houet_gamb_201...


## Add ISO3166-1 3-letter country codes

We need 3-letter country codes in order to locate geoboundaries data.

In [5]:
# distinct country names present in the cohorts dataframe
countries = df_cohorts["country"].unique()
countries

array(['Angola', 'Burkina Faso', 'Democratic Republic of the Congo',
       'Central African Republic', "Cote d'Ivoire", 'Cameroon', 'Gabon',
       'Ghana', 'Gambia, The', 'Guinea', 'Guinea-Bissau', 'Kenya', 'Mali',
       'Malawi', 'Mozambique', 'Tanzania', 'Uganda'], dtype=object)

In [6]:
# number of distinct countries
len(countries)

17

In [7]:
# Some countries appear in the cohorts dataframe under a name that differs
# from the one used as a key in the iso3166 lookup, so register those
# spellings against the matching ISO record (looked up by alpha-3 code).
# Work on a copy so that the iso3166 module's own lookup table is not
# modified for the rest of the kernel session.
countries_by_name = dict(iso3166.countries_by_name)
countries_by_name.update(
    {
        name.upper(): iso3166.countries_by_alpha3[alpha3]
        for name, alpha3 in {
            "Democratic Republic of the Congo": "COD",
            "Cote d'Ivoire": "CIV",
            "Gambia, The": "GMB",
            "Tanzania": "TZA",
        }.items()
    }
)

In [8]:
# add a column of ISO 3166-1 alpha-3 codes to the cohorts dataframe;
# names are upper-cased before lookup because the aliases registered in
# countries_by_name are keyed by upper-case name
df_cohorts["country_alpha3"] = df_cohorts["country"].map(
    lambda name: countries_by_name[name.upper()].alpha3
)

In [9]:
# spot-check the country name to alpha-3 code mapping
df_cohorts[["country", "country_alpha3"]].head()

Unnamed: 0_level_0,country,country_alpha3
cohort_id,Unnamed: 1_level_1,Unnamed: 2_level_1
AO-LUA_Luanda_colu_2009_Q2,Angola,AGO
BF-09_Houet_colu_2012_Q3,Burkina Faso,BFA
BF-09_Houet_colu_2014_Q3,Burkina Faso,BFA
BF-09_Houet_gamb_2012_Q3,Burkina Faso,BFA
BF-09_Houet_gamb_2014_Q3,Burkina Faso,BFA


## Join with geoboundaries data

In [10]:
# URL template for the simplified admin 2 (ADM2) GeoJSON of one country in
# the geoBoundaries v5.0.0 open release; fill in with the alpha-3 code
adm2_geojson_url_template = (
    "https://github.com/wmgeolab/geoBoundaries/raw/v5.0.0/releaseData/gbOpen/"
    "{country_alpha3}/ADM2/"
    "geoBoundaries-{country_alpha3}-ADM2_simplified.geojson"
)

In [11]:
@functools.lru_cache(maxsize=None)
def read_adm2(country_alpha3):
    """Read the geoboundaries admin 2 units for one country.

    The URL is built from `adm2_geojson_url_template` using the given
    alpha-3 country code. Results are cached per country code, so a
    repeated call with the same code returns the cached dataframe
    instead of reading the file again.
    """
    url = adm2_geojson_url_template.format(country_alpha3=country_alpha3)
    return gpd.read_file(url)

In [12]:
def country_cohorts_geo(country_alpha3, alias=None):
    """Join cohorts dataframe with admin 2 unit geometry.

    Parameters
    ----------
    country_alpha3 : str
        ISO 3166-1 alpha-3 country code. Used both to select rows from
        `df_cohorts` and to read the matching geoboundaries file.
    alias : dict, optional
        Mapping from admin 2 names as they appear in the cohorts
        dataframe to the names used in the geoboundaries data. Applied
        to the `admin2_name` column before joining.

    Returns
    -------
    geopandas.GeoDataFrame
        The selected cohorts left-joined with the geoboundaries columns,
        plus `representative_lon` and `representative_lat` columns.

    Notes
    -----
    Cohorts whose admin 2 unit cannot be matched are reported with a
    printed "ERROR" line rather than raising, so all mismatches can be
    seen in a single run.
    """

    # read the geoboundaries data
    gdf_geo = read_adm2(country_alpha3)

    # subset cohorts to country; compare directly against the column
    # rather than building a query string from the argument
    df_coh = df_cohorts.loc[df_cohorts["country_alpha3"] == country_alpha3].copy()

    # deal with admin2 name aliases; the result is assigned back because
    # replace(inplace=True) on a column selection is not guaranteed to
    # update the parent dataframe (it does not under pandas Copy-on-Write)
    if alias is not None:
        df_coh["admin2_name"] = df_coh["admin2_name"].replace(alias)

    # join dataframes
    # N.B., a column-on-column merge ignores the index of both frames,
    # so the cohort_id index is not carried through to the result
    gdf_ret = gpd.GeoDataFrame(
        df_coh.merge(
            gdf_geo,
            left_on=["country_alpha3", "admin2_name"],
            right_on=["shapeGroup", "shapeName"],
            how="left",
        )
    )

    # confirm no missing data; rows without a shapeID had no matching
    # admin 2 unit in the geoboundaries data
    loc_missing = gdf_ret["shapeID"].isna()
    if loc_missing.any():
        print(
            "ERROR",
            country_alpha3,
            gdf_ret.loc[loc_missing, "admin2_name"].unique()
        )

    # add representative point
    point = gdf_ret.representative_point()
    gdf_ret["representative_lon"] = point.x
    gdf_ret["representative_lat"] = point.y

    return gdf_ret

In [13]:
# distinct alpha-3 country codes present in the cohorts dataframe
country_codes = df_cohorts["country_alpha3"].unique()
country_codes

array(['AGO', 'BFA', 'COD', 'CAF', 'CIV', 'CMR', 'GAB', 'GHA', 'GMB',
       'GIN', 'GNB', 'KEN', 'MLI', 'MWI', 'MOZ', 'TZA', 'UGA'],
      dtype=object)

In [14]:
# The cohorts were originally defined using geoboundaries v3, but here
# we join against geoboundaries v5, so some admin 2 names need mapping
# from the name used in the cohorts to the name used in v5.
adm2_alias = {

    # CMR - Cameroon
    "Lom-Et-Djérem": "Lom-Et-Djerem",

    # GHA - Ghana
    "Twifu Heman/Lower Denkyira": "Twifo Hemang Lower Denkyira",
    # N.B., hack: New Juaben has been split into two districts.
    # In Ag3.0 we only have data from Koforidua, which is the
    # capital of New Juaben South.
    "New Juaben": "New Juaben South Municipal",
    # N.B., hack: Shama Ahanta East has been split into Shama and
    # Sekondi Takoradi. In Ag3.0 we only have data from Takoradi.
    "Shama Ahanta East": "Sekondi Takoradi Metropolis",

    # GMB - The Gambia
    # N.B., hack: in Ag3.0 the location is Wali Kunda, checked on
    # a map that it is in Lower Fuladu West.
    "Fulladu West": "Lower Fuladu West",
    "Central Baddibu": "Central Badibu",

    # GNB - Guinea-Bissau
    "Setor De Safim": "Setor de Safim",

    # KEN - Kenya
    # N.B., hack: in Ag3.0, checked the coordinates and the
    # location is in Kilifi North.
    "Kilifi": "Kilifi North",

}

# join each country's cohorts with its admin 2 geometry, then stack
# the per-country results into a single dataframe
gdf_cohorts = pd.concat(
    [
        country_cohorts_geo(code, alias=adm2_alias)
        for code in country_codes
    ]
)
gdf_cohorts.head()

Unnamed: 0,cohort_size,country,admin1_iso,admin1_name,admin2_name,taxon,year,quarter,cohort_label,sample_query,country_alpha3,shapeName,shapeISO,shapeID,shapeGroup,shapeType,geometry,representative_lon,representative_lat
0,77,Angola,AO-LUA,Luanda,Luanda,coluzzii,2009,2,Angola / Luanda / coluzzii / 2009 / Q2,cohort_admin2_quarter == 'AO-LUA_Luanda_colu_2...,AGO,Luanda,,91424787B29298874255409,AGO,ADM2,"POLYGON ((13.26098 -8.91283, 13.26098 -8.91133...",13.263677,-8.83307
0,78,Burkina Faso,BF-09,Hauts-Bassins,Houet,coluzzii,2012,3,Burkina Faso / Houet / coluzzii / 2012 / Q3,cohort_admin2_quarter == 'BF-09_Houet_colu_201...,BFA,Houet,,67063806B43041068804188,BFA,ADM2,"POLYGON ((-3.97513 11.93914, -4.00000 11.93960...",-4.311852,11.389151
1,32,Burkina Faso,BF-09,Hauts-Bassins,Houet,coluzzii,2014,3,Burkina Faso / Houet / coluzzii / 2014 / Q3,cohort_admin2_quarter == 'BF-09_Houet_colu_201...,BFA,Houet,,67063806B43041068804188,BFA,ADM2,"POLYGON ((-3.97513 11.93914, -4.00000 11.93960...",-4.311852,11.389151
2,73,Burkina Faso,BF-09,Hauts-Bassins,Houet,gambiae,2012,3,Burkina Faso / Houet / gambiae / 2012 / Q3,cohort_admin2_quarter == 'BF-09_Houet_gamb_201...,BFA,Houet,,67063806B43041068804188,BFA,ADM2,"POLYGON ((-3.97513 11.93914, -4.00000 11.93960...",-4.311852,11.389151
3,41,Burkina Faso,BF-09,Hauts-Bassins,Houet,gambiae,2014,3,Burkina Faso / Houet / gambiae / 2014 / Q3,cohort_admin2_quarter == 'BF-09_Houet_gamb_201...,BFA,Houet,,67063806B43041068804188,BFA,ADM2,"POLYGON ((-3.97513 11.93914, -4.00000 11.93960...",-4.311852,11.389151


In [16]:
# write the joined cohorts to the build directory as GeoJSON; the driver
# is given explicitly rather than relying on inference from the extension
gdf_cohorts.to_file(here() / "build" / "cohorts.geojson", driver="GeoJSON")

## Check

Check the file reads back in.

In [17]:
# read the whole file back (not just a preview) so the round trip can be
# verified, then show the first few rows
gdf_check = gpd.read_file(here() / "build" / "cohorts.geojson")
assert len(gdf_check) == len(gdf_cohorts), "row count changed on round trip"
gdf_check.head()

Unnamed: 0,cohort_size,country,admin1_iso,admin1_name,admin2_name,taxon,year,quarter,cohort_label,sample_query,country_alpha3,shapeName,shapeISO,shapeID,shapeGroup,shapeType,representative_lon,representative_lat,geometry
0,77,Angola,AO-LUA,Luanda,Luanda,coluzzii,2009,2,Angola / Luanda / coluzzii / 2009 / Q2,cohort_admin2_quarter == 'AO-LUA_Luanda_colu_2...,AGO,Luanda,,91424787B29298874255409,AGO,ADM2,13.263677,-8.83307,"POLYGON ((13.26098 -8.91283, 13.26098 -8.91133..."
1,78,Burkina Faso,BF-09,Hauts-Bassins,Houet,coluzzii,2012,3,Burkina Faso / Houet / coluzzii / 2012 / Q3,cohort_admin2_quarter == 'BF-09_Houet_colu_201...,BFA,Houet,,67063806B43041068804188,BFA,ADM2,-4.311852,11.389151,"POLYGON ((-3.97513 11.93914, -4.00000 11.93960..."
2,32,Burkina Faso,BF-09,Hauts-Bassins,Houet,coluzzii,2014,3,Burkina Faso / Houet / coluzzii / 2014 / Q3,cohort_admin2_quarter == 'BF-09_Houet_colu_201...,BFA,Houet,,67063806B43041068804188,BFA,ADM2,-4.311852,11.389151,"POLYGON ((-3.97513 11.93914, -4.00000 11.93960..."
3,73,Burkina Faso,BF-09,Hauts-Bassins,Houet,gambiae,2012,3,Burkina Faso / Houet / gambiae / 2012 / Q3,cohort_admin2_quarter == 'BF-09_Houet_gamb_201...,BFA,Houet,,67063806B43041068804188,BFA,ADM2,-4.311852,11.389151,"POLYGON ((-3.97513 11.93914, -4.00000 11.93960..."
4,41,Burkina Faso,BF-09,Hauts-Bassins,Houet,gambiae,2014,3,Burkina Faso / Houet / gambiae / 2014 / Q3,cohort_admin2_quarter == 'BF-09_Houet_gamb_201...,BFA,Houet,,67063806B43041068804188,BFA,ADM2,-4.311852,11.389151,"POLYGON ((-3.97513 11.93914, -4.00000 11.93960..."


## Debug

Code cells below were used to debug the aliases required to join admin2 units.

In [160]:
# import malariagen_data
# ag3 = malariagen_data.Ag3()

In [161]:
# (
#     ag3
#     .sample_metadata()
#     .query("admin2_name == 'Kilifi'")
#     .groupby(["location", "latitude", "longitude"])
#     .size()
# )

In [162]:
# read_adm2("KEN").explore()

In [163]:
# with pd.option_context("display.max_rows", None):
#     display(read_adm2("KEN"))