# Set up cohorts

Here we create an initial table of cohorts that could be included in the site, adding some useful metadata for each cohort and keeping only cohorts that meet the minimum cohort size.

In [None]:
# Notebook parameters. Values here are for development only and
# will be overridden when running via snakemake and papermill.

# Path to the atlas configuration file, passed to AtlasSetup.
config_file = "../../../config/agam.2025.03.05.yaml"

In [None]:
from selection_atlas.setup import AtlasSetup

# Initialise the atlas setup from the configuration file. This object
# supplies the sample metadata, the minimum cohort size and the output
# path for the cohorts table used in the rest of this notebook.
setup = AtlasSetup(config_file)

In [None]:
# Fetch the sample metadata table (one row per sample, presumably a pandas
# DataFrame given the query/groupby calls applied to it later).
df_samples = setup.sample_metadata()

In [None]:
# Name of the sample metadata column whose values identify each cohort.
# It is used as the group-by key and inside each cohort's sample query.
cohorts_col = "cohort_admin2_quarter"

In [None]:
def make_cohort_label(row):
    """Build the human-readable label for a single cohort.

    The label joins country, admin2 name, taxon and year with " / " and,
    when a quarter is available, appends it as "Q<quarter>".

    Parameters
    ----------
    row
        Record exposing ``country``, ``admin2_name``, ``taxon``, ``year``
        and ``quarter`` attributes, e.g. one row of the cohorts table.

    Returns
    -------
    str
        The cohort label.
    """
    # N.B., some cohorts have no quarter defined, because the samples were
    # not provided with a collection month in the metadata. Any quarter that
    # is not greater than zero therefore falls back to a year-only label.
    has_quarter = row.quarter > 0

    label = f"{row.country} / {row.admin2_name} / {row.taxon} / {row.year}"
    if has_quarter:
        label += f" / Q{row.quarter}"
    return label

In [None]:
# Per-cohort summary: the number of samples, plus the first value found
# within the cohort for each descriptive column.
cohort_aggregations = {
    "sample_id": "count",
    "country": "first",
    "admin1_iso": "first",
    "admin1_name": "first",
    "admin2_name": "first",
    "taxon": "first",
    "year": "first",
    "quarter": "first",
}

# Output column names for the sample count and the cohort identifier.
cohort_column_names = {
    "sample_id": "cohort_size",
    cohorts_col: "cohort_id",
}


def make_sample_query(row):
    """Return the query string that selects the samples of one cohort."""
    # N.B., only include females, otherwise data on X contig will be wonky
    return f"{cohorts_col} == '{row.cohort_id}' and sex_call == 'F'"


df_cohorts_selected = (
    # N.B., only include females, otherwise data on X contig will be wonky
    df_samples.query("sex_call == 'F'")
    .groupby(cohorts_col)
    .agg(cohort_aggregations)
    .reset_index()
    .rename(columns=cohort_column_names)
    # Keep only cohorts with at least the configured minimum number of samples.
    .query(f"cohort_size >= {setup.min_cohort_size}")
)

# Add the display label and the sample query, computed row by row.
df_cohorts_selected["cohort_label"] = df_cohorts_selected.apply(
    make_cohort_label, axis=1
)
df_cohorts_selected["sample_query"] = df_cohorts_selected.apply(
    make_sample_query, axis=1
)
df_cohorts_selected

In [None]:
# Write the selected cohorts table as CSV to the path given by
# setup.cohorts_file, leaving out the DataFrame index.
df_cohorts_selected.to_csv(setup.cohorts_file, index=False)