# Set up cohorts

Here we create an initial table of candidate cohorts that could be included in the site. Each cohort is given a human-readable label and a sample query, and only cohorts meeting the minimum cohort size (counting female samples only) are kept.

In [1]:
# Notebook parameters. Values here are for development only and
# will be overridden when running via snakemake and papermill.

# Path to the atlas configuration file; passed to AtlasSetup in the next cell.
config_file = "../../../config/agam.yaml"

In [2]:
from selection_atlas.setup import AtlasSetup

# Initialise the atlas setup from the configuration file. Later cells read
# the sample metadata, the minimum cohort size and the cohorts output path
# from this object.
setup = AtlasSetup(config_file)

In [3]:
# Load the sample metadata. The cells below use the sex_call, sample_id,
# country, admin1_iso, admin1_name, admin2_name, taxon, year and quarter
# columns, plus the cohort column named by `cohorts_col`.
# NOTE(review): presumably one row per sample -- confirm against AtlasSetup.
df_samples = setup.sample_metadata()

                                     

In [4]:
# Name of the sample metadata column whose values define the cohorts. It is
# used as the groupby key and embedded in each cohort's sample query.
cohorts_col = "cohort_admin2_quarter"

In [5]:
def make_cohort_label(row):
    """Build the human-readable label for a single cohort.

    Parameters
    ----------
    row
        Any object exposing ``country``, ``admin2_name``, ``taxon``, ``year``
        and ``quarter`` attributes, e.g. one row of the cohorts table.

    Returns
    -------
    str
        ``"<country> / <admin2_name> / <taxon> / <year>"``, with
        ``" / Q<quarter>"`` appended when ``quarter`` is greater than zero.
    """
    # N.B., some cohorts have no quarter defined, because the samples were
    # provided without a collection month in the metadata. Those cohorts are
    # labelled down to the year only.
    has_quarter = row.quarter > 0
    label = f"{row.country} / {row.admin2_name} / {row.taxon} / {row.year}"
    if has_quarter:
        label = f"{label} / Q{row.quarter}"
    return label

In [6]:
# N.B., only include females, otherwise data on X contig will be wonky.
# The filter is defined once so that the cohort-size count and the per-cohort
# sample queries are guaranteed to select the same samples.
female_query = "sex_call == 'F'"

# Build the table of selected cohorts in a single chain:
#   1. keep female samples only;
#   2. group samples by the cohort column and summarise each cohort;
#   3. keep cohorts with at least `setup.min_cohort_size` samples;
#   4. add the display label and the query that selects the cohort's samples.
df_cohorts_selected = (
    df_samples
    .query(female_query)
    .groupby(cohorts_col)
    .agg(
        {
            # Number of (female) samples with a sample_id in the cohort.
            "sample_id": "count",
            # Descriptive fields: take the first value found in the cohort.
            # NOTE(review): presumably constant within a cohort -- confirm.
            "country": "first",
            "admin1_iso": "first",
            "admin1_name": "first",
            "admin2_name": "first",
            "taxon": "first",
            "year": "first",
            "quarter": "first",
        }
    )
    .reset_index()
    .rename(
        columns={
            "sample_id": "cohort_size",
            cohorts_col: "cohort_id",
        }
    )
    .query(f"cohort_size >= {setup.min_cohort_size}")
    # Derive the extra columns with .assign() rather than by item assignment
    # on the filtered frame: .assign() works on a copy, so it cannot trigger
    # pandas' SettingWithCopyWarning and keeps this cell safe to re-run.
    .assign(
        cohort_label=lambda cohorts: cohorts.apply(
            make_cohort_label,
            axis="columns",
        ),
        sample_query=lambda cohorts: cohorts["cohort_id"].map(
            lambda cohort_id: f"{cohorts_col} == '{cohort_id}' and {female_query}"
        ),
    )
)
df_cohorts_selected

Unnamed: 0,cohort_id,cohort_size,country,admin1_iso,admin1_name,admin2_name,taxon,year,quarter,cohort_label,sample_query
0,AO-LUA_Luanda_colu_2009_Q2,77,Angola,AO-LUA,Luanda,Luanda,coluzzii,2009,2,Angola / Luanda / coluzzii / 2009 / Q2,cohort_admin2_quarter == 'AO-LUA_Luanda_colu_2...
4,BF-02_Comoe_colu_2011,18,Burkina Faso,BF-02,Cascades,Comoe,coluzzii,2011,-1,Burkina Faso / Comoe / coluzzii / 2011,cohort_admin2_quarter == 'BF-02_Comoe_colu_201...
5,BF-02_Comoe_colu_2012,63,Burkina Faso,BF-02,Cascades,Comoe,coluzzii,2012,-1,Burkina Faso / Comoe / coluzzii / 2012,cohort_admin2_quarter == 'BF-02_Comoe_colu_201...
6,BF-02_Comoe_colu_2015,33,Burkina Faso,BF-02,Cascades,Comoe,coluzzii,2015,-1,Burkina Faso / Comoe / coluzzii / 2015,cohort_admin2_quarter == 'BF-02_Comoe_colu_201...
7,BF-02_Comoe_colu_2016,53,Burkina Faso,BF-02,Cascades,Comoe,coluzzii,2016,-1,Burkina Faso / Comoe / coluzzii / 2016,cohort_admin2_quarter == 'BF-02_Comoe_colu_201...
...,...,...,...,...,...,...,...,...,...,...,...
196,UG-E_Busia_gamb_2016_Q2,24,Uganda,UG-E,Eastern Region,Busia,gambiae,2016,2,Uganda / Busia / gambiae / 2016 / Q2,cohort_admin2_quarter == 'UG-E_Busia_gamb_2016...
199,UG-E_Mayuge_gamb_2017_Q2,21,Uganda,UG-E,Eastern Region,Mayuge,gambiae,2017,2,Uganda / Mayuge / gambiae / 2017 / Q2,cohort_admin2_quarter == 'UG-E_Mayuge_gamb_201...
200,UG-E_Tororo_arab_2012_Q4,81,Uganda,UG-E,Eastern Region,Tororo,arabiensis,2012,4,Uganda / Tororo / arabiensis / 2012 / Q4,cohort_admin2_quarter == 'UG-E_Tororo_arab_201...
201,UG-E_Tororo_gamb_2012_Q4,112,Uganda,UG-E,Eastern Region,Tororo,gambiae,2012,4,Uganda / Tororo / gambiae / 2012 / Q4,cohort_admin2_quarter == 'UG-E_Tororo_gamb_201...


In [8]:
# Write the selected cohorts table to the path given by `setup.cohorts_file`,
# leaving out the DataFrame index.
df_cohorts_selected.to_csv(setup.cohorts_file, index=False)