In [17]:
import yaml
import pandas as pd
import malariagen_data
from pyprojroot import here
import numpy as np

In [2]:
with open("workflow/config.yaml") as config_file:
    config = yaml.safe_load(config_file)
config

{'min_cohort_size': 20,
 'max_cohort_size': 50,
 'ag': {'sample_sets': ['AG1000G-BF-A'],
  'contigs': ['2R'],
  'cohorts_analysis': '20230223',
  'h12_calibration_contig': '3L'}}

In [3]:
ag3 = malariagen_data.Ag3(
    # TODO in production build, remove use of simplecache if running inside google cloud
    # url = "gs://vo_agam_release",
    url="simplecache::gs://vo_agam_release",
    # pin the version of the cohorts analysis for reproducibility
    cohorts_analysis=config["ag"]["cohorts_analysis"],
    # TODO remove simplecache config in production
    simplecache=dict(cache_storage=(here() / "gcs_cache").as_posix()),
    results_cache=(here() / "malariagen_data_cache").as_posix(),
)
ag3

MalariaGEN Ag3 API client,MalariaGEN Ag3 API client
"Please note that data are subject to terms of use,  for more information see the MalariaGEN website or contact data@malariagen.net.  See also the Ag3 API docs.","Please note that data are subject to terms of use,  for more information see the MalariaGEN website or contact data@malariagen.net.  See also the Ag3 API docs..1"
Storage URL,simplecache::gs://vo_agam_release
Data releases available,3.0
Results cache,/home/sanj/projects/selection-atlas/malariagen_data_cache
Cohorts analysis,20230223
Species analysis,aim_20220528
Site filters analysis,dt_20200416
Software version,malariagen_data 7.0.0.post80+85511cd
Client location,"England, GB"


In [4]:
df_samples = ag3.sample_metadata(sample_sets=config["ag"]["sample_sets"])

Load sample metadata:   0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
def month_to_quarter(row):
    return ((row.month - 1) // 3) + 1 if row.month > 0 else -1

In [6]:
# add a "quarter" column for convenience
df_samples["quarter"] = df_samples.apply(
    month_to_quarter,
    axis="columns"
)
df_samples

Unnamed: 0,sample_id,partner_sample_id,contributor,country,location,year,month,latitude,longitude,sex_call,...,admin1_iso,admin2_name,taxon,cohort_admin1_year,cohort_admin1_month,cohort_admin1_quarter,cohort_admin2_year,cohort_admin2_month,cohort_admin2_quarter,quarter
0,AB0085-Cx,BF2-4,Austin Burt,Burkina Faso,Pala,2012,7,11.151,-4.235,F,...,BF-09,Houet,gambiae,BF-09_gamb_2012,BF-09_gamb_2012_07,BF-09_gamb_2012_Q3,BF-09_Houet_gamb_2012,BF-09_Houet_gamb_2012_07,BF-09_Houet_gamb_2012_Q3,3
1,AB0086-Cx,BF2-6,Austin Burt,Burkina Faso,Pala,2012,7,11.151,-4.235,F,...,BF-09,Houet,gambiae,BF-09_gamb_2012,BF-09_gamb_2012_07,BF-09_gamb_2012_Q3,BF-09_Houet_gamb_2012,BF-09_Houet_gamb_2012_07,BF-09_Houet_gamb_2012_Q3,3
2,AB0087-C,BF3-3,Austin Burt,Burkina Faso,Bana Village,2012,7,11.233,-4.472,F,...,BF-09,Houet,coluzzii,BF-09_colu_2012,BF-09_colu_2012_07,BF-09_colu_2012_Q3,BF-09_Houet_colu_2012,BF-09_Houet_colu_2012_07,BF-09_Houet_colu_2012_Q3,3
3,AB0088-C,BF3-5,Austin Burt,Burkina Faso,Bana Village,2012,7,11.233,-4.472,F,...,BF-09,Houet,coluzzii,BF-09_colu_2012,BF-09_colu_2012_07,BF-09_colu_2012_Q3,BF-09_Houet_colu_2012,BF-09_Houet_colu_2012_07,BF-09_Houet_colu_2012_Q3,3
4,AB0089-Cx,BF3-8,Austin Burt,Burkina Faso,Bana Village,2012,7,11.233,-4.472,F,...,BF-09,Houet,coluzzii,BF-09_colu_2012,BF-09_colu_2012_07,BF-09_colu_2012_Q3,BF-09_Houet_colu_2012,BF-09_Houet_colu_2012_07,BF-09_Houet_colu_2012_Q3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176,AB0280-Cx,BF12-31,Austin Burt,Burkina Faso,Pala,2012,7,11.151,-4.235,F,...,BF-09,Houet,gambiae,BF-09_gamb_2012,BF-09_gamb_2012_07,BF-09_gamb_2012_Q3,BF-09_Houet_gamb_2012,BF-09_Houet_gamb_2012_07,BF-09_Houet_gamb_2012_Q3,3
177,AB0281-Cx,BF12-32,Austin Burt,Burkina Faso,Pala,2012,7,11.151,-4.235,F,...,BF-09,Houet,gambiae,BF-09_gamb_2012,BF-09_gamb_2012_07,BF-09_gamb_2012_Q3,BF-09_Houet_gamb_2012,BF-09_Houet_gamb_2012_07,BF-09_Houet_gamb_2012_Q3,3
178,AB0282-Cx,BF12-33,Austin Burt,Burkina Faso,Pala,2012,7,11.151,-4.235,F,...,BF-09,Houet,coluzzii,BF-09_colu_2012,BF-09_colu_2012_07,BF-09_colu_2012_Q3,BF-09_Houet_colu_2012,BF-09_Houet_colu_2012_07,BF-09_Houet_colu_2012_Q3,3
179,AB0283-C,BF10-12,Austin Burt,Burkina Faso,Pala,2012,7,11.151,-4.235,F,...,BF-09,Houet,gambiae,BF-09_gamb_2012,BF-09_gamb_2012_07,BF-09_gamb_2012_Q3,BF-09_Houet_gamb_2012,BF-09_Houet_gamb_2012_07,BF-09_Houet_gamb_2012_Q3,3


In [7]:
# check the quarter logic
df_samples.groupby("quarter").agg({'month': lambda v: set(v)})

Unnamed: 0_level_0,month
quarter,Unnamed: 1_level_1
3,{7}


In [8]:
cohorts_col = "cohort_admin2_quarter"

In [9]:
def make_cohort_label(row):
    # N.B., not all cohorts have a quarter defined, because samples were not provided
    # with collection month in the metadata. In this case we expect to fall back to
    # year.
    if row.quarter > 0:
        return f"{row.country} / {row.admin2_name} / {row.taxon} / {row.year} / Q{row.quarter}"
    else:
        return f"{row.country} / {row.admin2_name} / {row.taxon} / {row.year}"

In [10]:
df_cohorts_selected = (
    df_samples
    # N.B., only include females, otherwise data on X chromosome will be wonky
    .query("sex_call == 'F'")
    .groupby(cohorts_col).agg({
        'sample_id': 'count',
        'country': 'first',
        'admin1_iso': 'first',
        'admin1_name': 'first',
        'admin2_name': 'first',
        'taxon': 'first',
        'year': 'first',
        'quarter': 'first',
    })
    .reset_index()
    .rename(columns={
        'sample_id': 'cohort_size',
        cohorts_col: 'cohort_id',
    })
    .query(f'cohort_size >= {config["min_cohort_size"]}')
)
df_cohorts_selected['cohort_label'] = df_cohorts_selected.apply(
    make_cohort_label,
    axis="columns",
)
df_cohorts_selected['sample_query'] = df_cohorts_selected.apply(
    # N.B., only include females, otherwise data on X chromosome will be wonky
    lambda row: f"{cohorts_col} == '{row.cohort_id}' and sex_call == 'F'",
    axis="columns",
)
df_cohorts_selected

Unnamed: 0,cohort_id,cohort_size,country,admin1_iso,admin1_name,admin2_name,taxon,year,quarter,cohort_label,sample_query
0,BF-09_Houet_colu_2012_Q3,78,Burkina Faso,BF-09,Hauts-Bassins,Houet,coluzzii,2012,3,Burkina Faso / Houet / coluzzii / 2012 / Q3,cohort_admin2_quarter == 'BF-09_Houet_colu_201...
1,BF-09_Houet_gamb_2012_Q3,73,Burkina Faso,BF-09,Hauts-Bassins,Houet,gambiae,2012,3,Burkina Faso / Houet / gambiae / 2012 / Q3,cohort_admin2_quarter == 'BF-09_Houet_gamb_201...


In [20]:
## Add average latitude and longitude for each cohort for plotting.
## May want to use different approach, but mean OK for very small scales (which our cohorts usually are)

for idx, row in df_cohorts_selected.iterrows():
    print(row['cohort_id'])
    
    df = df_samples.query(f"cohort_admin2_quarter == '{row['cohort_id']}'")
    lat_mean = df['latitude'].mean()
    long_mean = df['longitude'].mean()
    df_cohorts_selected.loc[idx, 'latitude'] = lat_mean
    df_cohorts_selected.loc[idx, 'longitude'] = long_mean

BF-09_Houet_colu_2012_Q3
BF-09_Houet_gamb_2012_Q3


In [23]:
df_cohorts_selected.to_csv("build/cohorts.csv", index=False)