In [1]:
# parameters
# Identifier of the cohort analysed by this notebook; it is used further down
# to locate the cohort's calibration file and its row in the cohorts table.
cohort_id = "AO-LUA_Luanda_colu_2009"

In [2]:
import yaml
import pandas as pd
import malariagen_data
from pyprojroot import here
import numpy as np
import os

In [4]:
# Read the workflow configuration (YAML) relative to the project root.
config_path = here() / "workflow" / "config.yaml"
config = yaml.safe_load(config_path.read_text())
config

{'min_cohort_size': 20,
 'max_cohort_size': 50,
 'ag': {'sample_sets': ['3.0'],
  'contigs': ['2R', '3R', 'X'],
  'h12_calibration_contig': '3L'}}

In [3]:
# Set up the Ag3 API client.

# Local cache locations, both resolved against the project root.
# TODO remove simplecache config in production
gcs_cache_dir = (here() / "gcs_cache").as_posix()
results_cache_dir = (here() / "malariagen_data_cache").as_posix()

# Pin the version of the cohorts analysis for reproducibility.
# NOTE(review): this requires an `ag.cohorts_analysis` entry in
# workflow/config.yaml; a missing key raises KeyError here.
cohorts_analysis = config["ag"]["cohorts_analysis"]

ag3 = malariagen_data.Ag3(
    # TODO in production build, remove use of simplecache if running inside google cloud
    # url = "gs://vo_agam_release",
    url="simplecache::gs://vo_agam_release",
    cohorts_analysis=cohorts_analysis,
    simplecache={"cache_storage": gcs_cache_dir},
    results_cache=results_cache_dir,
)
ag3

MalariaGEN Ag3 API client,MalariaGEN Ag3 API client
"Please note that data are subject to terms of use,  for more information see the MalariaGEN website or contact data@malariagen.net.  See also the Ag3 API docs.","Please note that data are subject to terms of use,  for more information see the MalariaGEN website or contact data@malariagen.net.  See also the Ag3 API docs..1"
Storage URL,simplecache::gs://vo_agam_release
Data releases available,3.0
Results cache,/home/aliman/github/anopheles-genomics-surveillance/selection-atlas/malariagen_data_cache
Cohorts analysis,20220608
Species analysis,aim_20220528
Site filters analysis,dt_20200416
Software version,malariagen_data 7.3.0
Client location,"England, GB"


In [5]:
# Read the calibration output for this cohort; it holds the H12 window size
# that is passed to the scan further down.
calibration_dir = here() / "build" / "h12-calibration"
calibration_path = calibration_dir / f"{cohort_id}.yaml"
calibration_params = yaml.safe_load(calibration_path.read_text())
calibration_params

{'h12_window_size': 5000}

In [6]:
# Read the cohorts table, index it by cohort id, and pull out the row for
# this cohort (it carries the sample query used later on).
cohorts_path = here() / "build" / "cohorts.csv"
df_cohorts = pd.read_csv(cohorts_path).set_index("cohort_id")
cohort = df_cohorts.loc[cohort_id]
cohort

cohort_size                                                    77
country                                                    Angola
admin1_iso                                                 AO-LUA
admin1_name                                                Luanda
admin2_name                                                Luanda
taxon                                                    coluzzii
year                                                         2009
cohort_label                    Angola / Luanda / coluzzii / 2009
sample_query    cohort_admin2_year == 'AO-LUA_Luanda_colu_2009...
Name: AO-LUA_Luanda_colu_2009, dtype: object

In [7]:
# Query string selecting this cohort's samples, taken from the cohort row.
sample_query = cohort["sample_query"]
sample_query

"cohort_admin2_year == 'AO-LUA_Luanda_colu_2009' and sex_call == 'F'"

In [8]:
# Pick the phasing analysis name from the cohort taxon:
# 'arab' for arabiensis, 'gamb_colu' for everything else.
phasing_analysis = 'arab' if cohort.taxon == 'arabiensis' else 'gamb_colu'
phasing_analysis

'gamb_colu'

In [9]:
# Cap the cohort at the configured maximum (downsampling for computational
# efficiency). None means the cohort is within the limit, so no downsampling.
max_cohort_size = config['max_cohort_size']
cohort_size = max_cohort_size if cohort.cohort_size > max_cohort_size else None
cohort_size

50

In [11]:
# Produce one H12 GWSS plot per configured contig. Every argument except the
# contig is the same for each call, so collect them once up front.
h12_gwss_params = dict(
    window_size=calibration_params['h12_window_size'],
    analysis=phasing_analysis,
    sample_sets=config['ag']['sample_sets'],
    sample_query=sample_query,
    cohort_size=cohort_size,
)
for contig in config['ag']['contigs']:
    ag3.plot_h12_gwss(contig=contig, **h12_gwss_params)