In [1]:
import yaml
import pandas as pd
import malariagen_data
from pyprojroot import here

In [2]:
# Load the workflow parameters from workflow/params.yaml under the project root.
# The encoding is passed explicitly so that reading the file does not depend on
# the platform's default locale encoding.
with open(here() / "workflow" / "params.yaml", encoding="utf-8") as params_file:
    params = yaml.safe_load(params_file)
params

{'min_cohort_size': 20, 'releases': {'ag': ['3.0']}}

In [3]:
# Directories under the project root that are handed to the client as the
# simplecache storage location and the results cache location.
project_root = here()
gcs_cache_dir = (project_root / "gcs_cache").as_posix()
results_cache_dir = (project_root / "malariagen_data_cache").as_posix()

# TODO in production build, remove use of simplecache if running inside google cloud
ag3 = malariagen_data.Ag3(
    url="simplecache::gs://vo_agam_release",
    simplecache={"cache_storage": gcs_cache_dir},
    results_cache=results_cache_dir,
)
ag3

MalariaGEN Ag3 API client,MalariaGEN Ag3 API client
"Please note that data are subject to terms of use,  for more information see the MalariaGEN website or contact data@malariagen.net.  See also the Ag3 API docs.","Please note that data are subject to terms of use,  for more information see the MalariaGEN website or contact data@malariagen.net.  See also the Ag3 API docs..1"
Storage URL,simplecache::gs://vo_agam_release
Data releases available,3.0
Results cache,/home/aliman/github/anopheles-genomics-surveillance/selection-atlas/malariagen_data_cache
Cohorts analysis,20220608
Species analysis,aim_20220528
Site filters analysis,dt_20200416
Software version,malariagen_data 7.3.0
Client location,"England, GB"


In [4]:
# The sample sets to load are the "ag" releases listed in the workflow params.
ag_releases = params["releases"]["ag"]
df_samples = ag3.sample_metadata(sample_sets=ag_releases)
df_samples

Load sample metadata:   0%|          | 0/28 [00:00<?, ?it/s]

Unnamed: 0,sample_id,partner_sample_id,contributor,country,location,year,month,latitude,longitude,sex_call,...,aim_species,country_iso,admin1_name,admin1_iso,admin2_name,taxon,cohort_admin1_year,cohort_admin1_month,cohort_admin2_year,cohort_admin2_month
0,AR0047-C,LUA047,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,...,coluzzii,AGO,Luanda,AO-LUA,Luanda,coluzzii,AO-LUA_colu_2009,AO-LUA_colu_2009_04,AO-LUA_Luanda_colu_2009,AO-LUA_Luanda_colu_2009_04
1,AR0049-C,LUA049,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,...,coluzzii,AGO,Luanda,AO-LUA,Luanda,coluzzii,AO-LUA_colu_2009,AO-LUA_colu_2009_04,AO-LUA_Luanda_colu_2009,AO-LUA_Luanda_colu_2009_04
2,AR0051-C,LUA051,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,...,coluzzii,AGO,Luanda,AO-LUA,Luanda,coluzzii,AO-LUA_colu_2009,AO-LUA_colu_2009_04,AO-LUA_Luanda_colu_2009,AO-LUA_Luanda_colu_2009_04
3,AR0061-C,LUA061,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,...,coluzzii,AGO,Luanda,AO-LUA,Luanda,coluzzii,AO-LUA_colu_2009,AO-LUA_colu_2009_04,AO-LUA_Luanda_colu_2009,AO-LUA_Luanda_colu_2009_04
4,AR0078-C,LUA078,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,...,coluzzii,AGO,Luanda,AO-LUA,Luanda,coluzzii,AO-LUA_colu_2009,AO-LUA_colu_2009_04,AO-LUA_Luanda_colu_2009,AO-LUA_Luanda_colu_2009_04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3076,AD0494-C,80-2-o-16,Martin Donnelly,Lab Cross,LSTM,-1,-1,53.409,-2.969,F,...,intermediate_gambiae_coluzzii,,,,,intermediate_gambiae_coluzzii,,,,
3077,AD0495-C,80-2-o-17,Martin Donnelly,Lab Cross,LSTM,-1,-1,53.409,-2.969,M,...,intermediate_gambiae_coluzzii,,,,,intermediate_gambiae_coluzzii,,,,
3078,AD0496-C,80-2-o-18,Martin Donnelly,Lab Cross,LSTM,-1,-1,53.409,-2.969,M,...,intermediate_gambiae_coluzzii,,,,,intermediate_gambiae_coluzzii,,,,
3079,AD0497-C,80-2-o-19,Martin Donnelly,Lab Cross,LSTM,-1,-1,53.409,-2.969,F,...,intermediate_gambiae_coluzzii,,,,,intermediate_gambiae_coluzzii,,,,


In [5]:
# Name of the sample metadata column whose values define the cohorts: samples
# are grouped by this column, and the same column name is used when building
# each cohort's sample query string.
# TODO change this to quarter when available
cohorts_col = "cohort_admin2_year"

In [6]:
# Build one row per cohort (one per distinct value of `cohorts_col`), keep only
# cohorts with at least `min_cohort_size` samples, and attach a display label
# and a sample query string to each.
min_cohort_size = params["min_cohort_size"]

df_cohorts_selected = (
    df_samples
    .groupby(cohorts_col)
    .agg({
        # Number of samples in the cohort; renamed to `cohort_size` below.
        "sample_id": "count",
        # Descriptive attributes, taken from the first sample of each cohort.
        "country": "first",
        "admin1_iso": "first",
        "admin1_name": "first",
        "admin2_name": "first",
        "taxon": "first",
        "year": "first",
    })
    .reset_index()
    .rename(columns={
        "sample_id": "cohort_size",
        cohorts_col: "cohort_id",
    })
    # Boolean mask instead of a string-built query expression.
    .loc[lambda df: df["cohort_size"] >= min_cohort_size]
    .assign(
        # Label of the form "country / admin2_name / taxon / year".
        cohort_label=lambda df: [
            f"{country} / {admin2_name} / {taxon} / {year}"
            for country, admin2_name, taxon, year in zip(
                df["country"], df["admin2_name"], df["taxon"], df["year"]
            )
        ],
        # Query string selecting the cohort's samples. The id is quoted with
        # repr() rather than hard-coded single quotes, so an id containing an
        # apostrophe still produces a valid expression; ids without quotes
        # are rendered exactly as before.
        sample_query=lambda df: [
            f"{cohorts_col} == {str(cohort_id)!r}"
            for cohort_id in df["cohort_id"]
        ],
    )
)
df_cohorts_selected

Unnamed: 0,cohort_id,cohort_size,country,admin1_iso,admin1_name,admin2_name,taxon,year,cohort_label,sample_query
0,AO-LUA_Luanda_colu_2009,81,Angola,AO-LUA,Luanda,Luanda,coluzzii,2009,Angola / Luanda / coluzzii / 2009,cohort_admin2_year == 'AO-LUA_Luanda_colu_2009'
3,BF-09_Houet_colu_2012,82,Burkina Faso,BF-09,Hauts-Bassins,Houet,coluzzii,2012,Burkina Faso / Houet / coluzzii / 2012,cohort_admin2_year == 'BF-09_Houet_colu_2012'
4,BF-09_Houet_colu_2014,53,Burkina Faso,BF-09,Hauts-Bassins,Houet,coluzzii,2014,Burkina Faso / Houet / coluzzii / 2014,cohort_admin2_year == 'BF-09_Houet_colu_2014'
5,BF-09_Houet_gamb_2012,98,Burkina Faso,BF-09,Hauts-Bassins,Houet,gambiae,2012,Burkina Faso / Houet / gambiae / 2012,cohort_admin2_year == 'BF-09_Houet_gamb_2012'
6,BF-09_Houet_gamb_2014,46,Burkina Faso,BF-09,Hauts-Bassins,Houet,gambiae,2014,Burkina Faso / Houet / gambiae / 2014,cohort_admin2_year == 'BF-09_Houet_gamb_2014'
7,CD-NU_Gbadolite_gamb_2015,76,Democratic Republic of the Congo,CD-NU,Nord-Ubangi,Gbadolite,gambiae,2015,Democratic Republic of the Congo / Gbadolite /...,cohort_admin2_year == 'CD-NU_Gbadolite_gamb_2015'
11,CF-BGF_Bangui_gamb_1994,53,Central African Republic,CF-BGF,Bangui,Bangui,gambiae,1994,Central African Republic / Bangui / gambiae / ...,cohort_admin2_year == 'CF-BGF_Bangui_gamb_1994'
12,CI-LG_Agneby-Tiassa_colu_2012,80,Cote d'Ivoire,CI-LG,Lagunes,Agneby-Tiassa,coluzzii,2012,Cote d'Ivoire / Agneby-Tiassa / coluzzii / 2012,cohort_admin2_year == 'CI-LG_Agneby-Tiassa_col...
26,CM-ES_Haut-Nyong_gamb_2009,110,Cameroon,CM-ES,East,Haut-Nyong,gambiae,2009,Cameroon / Haut-Nyong / gambiae / 2009,cohort_admin2_year == 'CM-ES_Haut-Nyong_gamb_2...
27,CM-ES_Lom-Et-Djérem_gamb_2009,193,Cameroon,CM-ES,East,Lom-Et-Djérem,gambiae,2009,Cameroon / Lom-Et-Djérem / gambiae / 2009,cohort_admin2_year == 'CM-ES_Lom-Et-Djérem_gam...


In [7]:
# Write the selected cohorts table to CSV, leaving out the DataFrame index.
# NOTE(review): this path is relative to the current working directory, unlike
# the here()-based paths used earlier in this notebook — confirm the intended
# output location.
cohorts_csv_path = "../cohorts.csv"
df_cohorts_selected.to_csv(cohorts_csv_path, index=False)