In [1]:
import yaml
import pandas as pd
import malariagen_data
from pyprojroot import here

In [2]:
# Load the workflow parameters from the project's params.yaml file.
# The encoding is given explicitly so that reading the file does not depend
# on the platform's default locale encoding.
with open(here() / "workflow" / "params.yaml", encoding="utf-8") as params_file:
    params = yaml.safe_load(params_file)
params

{'min_cohort_size': 20,
 'max_cohort_size': 50,
 'ag': {'sample_sets': ['3.0'],
  'contigs': ['2R', '3R', 'X'],
  'cohorts_analysis': '20230223',
  'h12_calibration_contig': '3L'}}

In [3]:
# Create the MalariaGEN Ag3 API client used throughout this notebook.
ag3 = malariagen_data.Ag3(
    # TODO in production build, remove use of simplecache if running inside
    # google cloud and point directly at the bucket instead:
    # url = "gs://vo_agam_release",
    url="simplecache::gs://vo_agam_release",
    # TODO remove simplecache config in production
    simplecache={"cache_storage": (here() / "gcs_cache").as_posix()},
    # location of the results cache, inside the project directory
    results_cache=(here() / "malariagen_data_cache").as_posix(),
    # pin the version of the cohorts analysis for reproducibility
    cohorts_analysis=params["ag"]["cohorts_analysis"],
)
ag3

MalariaGEN Ag3 API client,MalariaGEN Ag3 API client
"Please note that data are subject to terms of use,  for more information see the MalariaGEN website or contact data@malariagen.net.  See also the Ag3 API docs.","Please note that data are subject to terms of use,  for more information see the MalariaGEN website or contact data@malariagen.net.  See also the Ag3 API docs..1"
Storage URL,simplecache::gs://vo_agam_release
Data releases available,3.0
Results cache,/home/aliman/github/anopheles-genomics-surveillance/selection-atlas/malariagen_data_cache
Cohorts analysis,20230223
Species analysis,aim_20220528
Site filters analysis,dt_20200416
Software version,malariagen_data 7.3.0
Client location,"England, GB"


In [4]:
# Fetch sample metadata for the sample sets listed in the workflow parameters.
df_samples = ag3.sample_metadata(
    sample_sets=params["ag"]["sample_sets"],
)

Load sample metadata:   0%|          | 0/28 [00:00<?, ?it/s]

In [5]:
def month_to_quarter(row):
    """Return the quarter of the year for a row's collection month.

    Parameters
    ----------
    row
        Any object exposing a numeric ``month`` attribute.

    Returns
    -------
    ``((month - 1) // 3) + 1`` when the month is greater than zero (so
    months 1-3 give 1, 4-6 give 2, and so on); otherwise ``-1``.
    """
    month = row.month
    if month > 0:
        return ((month - 1) // 3) + 1
    return -1

In [6]:
# add a "quarter" column for convenience
# Derived row by row from each sample's "month" value.
df_samples["quarter"] = df_samples.apply(month_to_quarter, axis=1)
df_samples

Unnamed: 0,sample_id,partner_sample_id,contributor,country,location,year,month,latitude,longitude,sex_call,...,admin1_iso,admin2_name,taxon,cohort_admin1_year,cohort_admin1_month,cohort_admin1_quarter,cohort_admin2_year,cohort_admin2_month,cohort_admin2_quarter,quarter
0,AR0047-C,LUA047,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,...,AO-LUA,Luanda,coluzzii,AO-LUA_colu_2009,AO-LUA_colu_2009_04,AO-LUA_colu_2009_Q2,AO-LUA_Luanda_colu_2009,AO-LUA_Luanda_colu_2009_04,AO-LUA_Luanda_colu_2009_Q2,2
1,AR0049-C,LUA049,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,...,AO-LUA,Luanda,coluzzii,AO-LUA_colu_2009,AO-LUA_colu_2009_04,AO-LUA_colu_2009_Q2,AO-LUA_Luanda_colu_2009,AO-LUA_Luanda_colu_2009_04,AO-LUA_Luanda_colu_2009_Q2,2
2,AR0051-C,LUA051,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,...,AO-LUA,Luanda,coluzzii,AO-LUA_colu_2009,AO-LUA_colu_2009_04,AO-LUA_colu_2009_Q2,AO-LUA_Luanda_colu_2009,AO-LUA_Luanda_colu_2009_04,AO-LUA_Luanda_colu_2009_Q2,2
3,AR0061-C,LUA061,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,...,AO-LUA,Luanda,coluzzii,AO-LUA_colu_2009,AO-LUA_colu_2009_04,AO-LUA_colu_2009_Q2,AO-LUA_Luanda_colu_2009,AO-LUA_Luanda_colu_2009_04,AO-LUA_Luanda_colu_2009_Q2,2
4,AR0078-C,LUA078,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,...,AO-LUA,Luanda,coluzzii,AO-LUA_colu_2009,AO-LUA_colu_2009_04,AO-LUA_colu_2009_Q2,AO-LUA_Luanda_colu_2009,AO-LUA_Luanda_colu_2009_04,AO-LUA_Luanda_colu_2009_Q2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3076,AD0494-C,80-2-o-16,Martin Donnelly,Lab Cross,LSTM,-1,-1,53.409,-2.969,F,...,,,unassigned,,,,,,,-1
3077,AD0495-C,80-2-o-17,Martin Donnelly,Lab Cross,LSTM,-1,-1,53.409,-2.969,M,...,,,unassigned,,,,,,,-1
3078,AD0496-C,80-2-o-18,Martin Donnelly,Lab Cross,LSTM,-1,-1,53.409,-2.969,M,...,,,unassigned,,,,,,,-1
3079,AD0497-C,80-2-o-19,Martin Donnelly,Lab Cross,LSTM,-1,-1,53.409,-2.969,F,...,,,unassigned,,,,,,,-1


In [7]:
# Sanity check of the quarter logic: show the set of distinct months that
# ended up in each quarter.
(
    df_samples
    .groupby("quarter")
    .agg({"month": lambda months: set(months)})
)

Unnamed: 0_level_0,month
quarter,Unnamed: 1_level_1
-1,{-1}
1,"{1, 2, 3}"
2,"{4, 5, 6}"
3,"{8, 9, 7}"
4,"{10, 11, 12}"


In [8]:
# Name of the sample metadata column whose values identify the cohort each
# sample belongs to; samples are grouped on this column to build the cohorts.
cohorts_col = "cohort_admin2_quarter"

In [9]:
def make_cohort_label(row):
    """Build a human-readable label for a cohort.

    Parameters
    ----------
    row
        Any object exposing ``country``, ``admin2_name``, ``taxon``, ``year``
        and ``quarter`` attributes.

    Returns
    -------
    A string of the form ``country / admin2_name / taxon / year``, with a
    trailing `` / Q<quarter>`` appended only when ``quarter`` is positive.
    """
    label = f"{row.country} / {row.admin2_name} / {row.taxon} / {row.year}"
    # N.B., not all cohorts have a quarter defined, because samples were not
    # provided with collection month in the metadata. Those cohorts fall back
    # to the year-level label.
    if row.quarter > 0:
        label = f"{label} / Q{row.quarter}"
    return label

In [10]:
# N.B., only include females, otherwise data on X chromosome will be wonky.
# The filter is defined once and used both when counting cohort sizes and when
# building each cohort's sample query, so the two always select the same samples.
female_query = "sex_call == 'F'"

# Build one row per cohort: count the female samples in each cohort, carry
# along the descriptive metadata, and keep only cohorts that reach the
# configured minimum size.
df_cohorts_selected = (
    df_samples
    .query(female_query)
    .groupby(cohorts_col)
    .agg({
        'sample_id': 'count',
        'country': 'first',
        'admin1_iso': 'first',
        'admin1_name': 'first',
        'admin2_name': 'first',
        'taxon': 'first',
        'year': 'first',
        'quarter': 'first',
    })
    .reset_index()
    .rename(columns={
        'sample_id': 'cohort_size',
        cohorts_col: 'cohort_id',
    })
    .query(f'cohort_size >= {params["min_cohort_size"]}')
    .assign(
        # human-readable label for each cohort
        cohort_label=lambda df: df.apply(make_cohort_label, axis="columns"),
        # query string selecting the samples of each cohort; the cohort id is
        # emitted with !r so that it is always a correctly quoted and escaped
        # string literal, even if an id contains an apostrophe
        sample_query=lambda df: df.apply(
            lambda row: f"{cohorts_col} == {row.cohort_id!r} and {female_query}",
            axis="columns",
        ),
    )
)
df_cohorts_selected

Unnamed: 0,cohort_id,cohort_size,country,admin1_iso,admin1_name,admin2_name,taxon,year,quarter,cohort_label,sample_query
0,AO-LUA_Luanda_colu_2009_Q2,77,Angola,AO-LUA,Luanda,Luanda,coluzzii,2009,2,Angola / Luanda / coluzzii / 2009 / Q2,cohort_admin2_quarter == 'AO-LUA_Luanda_colu_2...
4,BF-09_Houet_colu_2012_Q3,78,Burkina Faso,BF-09,Hauts-Bassins,Houet,coluzzii,2012,3,Burkina Faso / Houet / coluzzii / 2012 / Q3,cohort_admin2_quarter == 'BF-09_Houet_colu_201...
5,BF-09_Houet_colu_2014_Q3,32,Burkina Faso,BF-09,Hauts-Bassins,Houet,coluzzii,2014,3,Burkina Faso / Houet / coluzzii / 2014 / Q3,cohort_admin2_quarter == 'BF-09_Houet_colu_201...
6,BF-09_Houet_gamb_2012_Q3,73,Burkina Faso,BF-09,Hauts-Bassins,Houet,gambiae,2012,3,Burkina Faso / Houet / gambiae / 2012 / Q3,cohort_admin2_quarter == 'BF-09_Houet_gamb_201...
7,BF-09_Houet_gamb_2014_Q3,41,Burkina Faso,BF-09,Hauts-Bassins,Houet,gambiae,2014,3,Burkina Faso / Houet / gambiae / 2014 / Q3,cohort_admin2_quarter == 'BF-09_Houet_gamb_201...
8,CD-NU_Gbadolite_gamb_2015_Q3,44,Democratic Republic of the Congo,CD-NU,Nord-Ubangi,Gbadolite,gambiae,2015,3,Democratic Republic of the Congo / Gbadolite /...,cohort_admin2_quarter == 'CD-NU_Gbadolite_gamb...
12,CF-BGF_Bangui_gamb_1994_Q1,53,Central African Republic,CF-BGF,Bangui,Bangui,gambiae,1994,1,Central African Republic / Bangui / gambiae / ...,cohort_admin2_quarter == 'CF-BGF_Bangui_gamb_1...
13,CI-LG_Agneby-Tiassa_colu_2012,80,Cote d'Ivoire,CI-LG,Lagunes,Agneby-Tiassa,coluzzii,2012,-1,Cote d'Ivoire / Agneby-Tiassa / coluzzii / 2012,cohort_admin2_quarter == 'CI-LG_Agneby-Tiassa_...
27,CM-ES_Haut-Nyong_gamb_2009_Q3,95,Cameroon,CM-ES,East,Haut-Nyong,gambiae,2009,3,Cameroon / Haut-Nyong / gambiae / 2009 / Q3,cohort_admin2_quarter == 'CM-ES_Haut-Nyong_gam...
28,CM-ES_Lom-Et-Djérem_gamb_2009_Q3,163,Cameroon,CM-ES,East,Lom-Et-Djérem,gambiae,2009,3,Cameroon / Lom-Et-Djérem / gambiae / 2009 / Q3,cohort_admin2_quarter == 'CM-ES_Lom-Et-Djérem_...


In [11]:
# Write the selected cohorts table to build/cohorts.csv under the project root.
# The output directory is created first if needed, so the notebook also runs
# on a fresh checkout where it does not exist yet.
cohorts_path = here() / "build/cohorts.csv"
cohorts_path.parent.mkdir(parents=True, exist_ok=True)
df_cohorts_selected.to_csv(cohorts_path, index=False)