In [None]:
# Build identifier: selects the build/<analysis_version>/ directory that this
# notebook reads its input from and writes its output to.
analysis_version = "dev"

# Obtain geographical data for cohorts

Here we obtain a representative point (longitude and latitude) for the administrative level 2 unit of each cohort, and add these to the cohorts dataframe.

In [None]:
import pandas as pd
from pyprojroot import here
import geopandas as gpd
import iso3166
import functools

In [None]:
# Read the final cohorts table for this analysis version; the columns
# computed below are attached to this dataframe.
df_cohorts = pd.read_csv(
    here().joinpath("build", analysis_version, "final_cohorts.csv")
)
df_cohorts.head()

## Add ISO3166-1 3-letter country codes

We need 3-letter country codes in order to locate geoboundaries data.

In [None]:
# distinct country names as they appear in the cohorts table
countries = df_cohorts["country"].unique()
countries

In [None]:
# number of distinct countries
len(countries)

In [None]:
# Lookup from upper-case country name to iso3166 country record.
# Work on a copy so that the extra spellings added below are not written
# into the iso3166 package's own module-level index.
countries_by_name = dict(iso3166.countries_by_name)

# Some countries are known by a different name than the one iso3166 indexes
# them under; register those spellings against the intended alpha-3 code.
countries_by_name.update(
    {
        name.upper(): iso3166.countries_by_alpha3[alpha3]
        for name, alpha3 in {
            "Democratic Republic of the Congo": "COD",
            "Cote d'Ivoire": "CIV",
            "Gambia, The": "GMB",
            "Tanzania": "TZA",
        }.items()
    }
)

In [None]:
# Add ISO 3166-1 alpha-2 and alpha-3 code columns to the cohorts dataframe.
# Only the "country" column is needed, so map over that column instead of
# applying a function to every row of the frame. The lookup is keyed by
# upper-case name; an unrecognised name raises KeyError rather than silently
# producing a missing code.
df_cohorts["country_alpha2"] = df_cohorts["country"].map(
    lambda name: countries_by_name[name.upper()].alpha2
)

df_cohorts["country_alpha3"] = df_cohorts["country"].map(
    lambda name: countries_by_name[name.upper()].alpha3
)

In [None]:
# spot-check the country name -> code columns added above
df_cohorts[["country", "country_alpha2", "country_alpha3"]].head()

## Join with geoboundaries data

In [None]:
# URL of the simplified ADM2 GeoJSON in the geoBoundaries v5.0.0 gbOpen
# release; the {country_alpha3} placeholder is filled in per country.
adm2_geojson_url_template = (
    "https://github.com/wmgeolab/geoBoundaries/raw/v5.0.0/releaseData/gbOpen/"
    "{country_alpha3}/ADM2/"
    "geoBoundaries-{country_alpha3}-ADM2_simplified.geojson"
)

In [None]:
@functools.lru_cache(maxsize=None)
def read_adm2(country_alpha3):
    """Read the geoboundaries ADM2 file for one country.

    Results are cached per country code without a size limit, so repeated
    calls with the same code return the same cached object instead of
    reading the URL again.

    Parameters
    ----------
    country_alpha3 : str
        Country code substituted into ``adm2_geojson_url_template``.

    Returns
    -------
    The object returned by ``gpd.read_file`` for the resulting URL.
    """
    url = adm2_geojson_url_template.format(country_alpha3=country_alpha3)
    return gpd.read_file(url)

In [None]:
def country_cohorts_geo(country_alpha3, alias=None):
    """Join cohorts dataframe with admin 2 unit geometry.

    Parameters
    ----------
    country_alpha3 : str
        Alpha-3 code of the country to process. Used both to select rows of
        the module-level ``df_cohorts`` and to read the geoboundaries data.
    alias : dict, optional
        Mapping from ``admin2_name`` values in the cohorts dataframe to the
        ``shapeName`` to join on in the geoboundaries data. Applied to the
        country subset before joining; ``df_cohorts`` itself is not modified.

    Returns
    -------
    geopandas.GeoDataFrame
        The cohorts for this country, left-joined to the geoboundaries
        columns, with ``representative_lon`` and ``representative_lat``
        columns added. Cohorts whose admin 2 name has no match are kept and
        their names are reported in an "ERROR" line on stdout.
    """

    # read the geoboundaries data
    gdf_geo = read_adm2(country_alpha3)

    # subset cohorts to country; copy so the edits below stay local
    in_country = df_cohorts["country_alpha3"] == country_alpha3
    df_coh = df_cohorts.loc[in_country].copy()

    # deal with admin2 name aliases
    # N.B., assign the result back instead of calling replace(inplace=True)
    # on the column selection: the in-place form acts on an intermediate
    # object and does not update df_coh when pandas Copy-on-Write is active.
    if alias is not None:
        df_coh["admin2_name"] = df_coh["admin2_name"].replace(alias)

    # join dataframes; a left join keeps every cohort even when no
    # admin 2 unit matches
    gdf_ret = gpd.GeoDataFrame(
        df_coh.merge(
            gdf_geo,
            left_on=["country_alpha3", "admin2_name"],
            right_on=["shapeGroup", "shapeName"],
            how="left",
        )
    )

    # confirm no missing data: a missing shapeID means the join found no
    # admin 2 unit for that cohort, so report the unmatched names
    loc_missing = gdf_ret["shapeID"].isna()
    if loc_missing.any():
        print(
            "ERROR",
            country_alpha3,
            gdf_ret.loc[loc_missing, "admin2_name"].unique()
        )

    # add representative point
    point = gdf_ret.representative_point()
    gdf_ret["representative_lon"] = point.x
    gdf_ret["representative_lat"] = point.y

    return gdf_ret

In [None]:
# distinct alpha-3 country codes present in the cohorts table
country_codes = df_cohorts["country_alpha3"].unique()
country_codes

In [None]:
# The cohorts were originally defined against geoboundaries v3, whereas the
# boundaries read here are v5. This mapping translates the admin 2 names
# used in the cohorts into the names to join on.
adm2_alias = {

    # CMR - Cameroon
    "Lom-Et-Djérem": "Lom-Et-Djerem",

    # GHA - Ghana
    "Twifu Heman/Lower Denkyira": "Twifo Hemang Lower Denkyira",
    # N.B., hack: New Juaben has been split into two districts. In Ag3.0
    # the only data are from Koforidua, the capital of New Juaben South.
    "New Juaben": "New Juaben South Municipal",
    # N.B., hack: Shama Ahanta East has been split into Shama and
    # Sekondi Takoradi. In Ag3.0 the only data are from Takoradi.
    "Shama Ahanta East": "Sekondi Takoradi Metropolis",

    # GMB - The Gambia
    # N.B., hack: the Ag3.0 location is Wali Kunda, which was checked on
    # a map to be in Lower Fuladu West.
    "Fulladu West": "Lower Fuladu West",
    "Central Baddibu": "Central Badibu",

    # GNB - Guinea-Bissau
    "Setor De Safim": "Setor de Safim",

    # KEN - Kenya
    # N.B., hack: checked the Ag3.0 coordinates, the location is in
    # Kilifi North.
    "Kilifi": "Kilifi North",

}

# build one joined frame per country, then stack them
gdf_cohorts = pd.concat(
    [
        country_cohorts_geo(country_code, alias=adm2_alias)
        for country_code in country_codes
    ]
)
gdf_cohorts.head()

In [None]:
# write the cohorts with geometry into the same build directory as the input
gdf_cohorts.to_file(
    here().joinpath("build", analysis_version, "final_cohorts.geojson")
)

## Check

Check the file reads back in.

In [None]:
# Read the whole file back (not just the first rows) so that the check
# covers every cohort that was written, then confirm nothing was dropped.
gdf_check = gpd.read_file(here() / "build" / analysis_version / "final_cohorts.geojson")
assert len(gdf_check) == len(gdf_cohorts), "row count changed on round trip"
gdf_check.head()

## Debug

Code cells below were used to debug the aliases required to join admin2 units.

In [None]:
# import malariagen_data
# ag3 = malariagen_data.Ag3()

In [None]:
# (
#     ag3
#     .sample_metadata()
#     .query("admin2_name == 'Kilifi'")
#     .groupby(["location", "latitude", "longitude"])
#     .size()
# )

In [None]:
# read_adm2("KEN").explore()

In [None]:
# with pd.option_context("display.max_rows", None):
#     display(read_adm2("KEN"))