In [1]:
# parameters
# Identifier of the cohort this notebook calibrates; it is used to look up a
# row of build/cohorts.csv and to name the output YAML file.
# NOTE(review): presumably overridden per run by the workflow — confirm.
cohort_id = "AO-LUA_Luanda_colu_2009"

In [2]:
import yaml
import pandas as pd
import malariagen_data
from pyprojroot import here
import numpy as np
import os

In [3]:
# Read the workflow settings from workflow/params.yaml, resolved against the
# project root returned by here().
params_path = here() / "workflow" / "params.yaml"
with params_path.open() as params_file:
    params = yaml.safe_load(params_file)
params

{'min_cohort_size': 20,
 'max_cohort_size': 50,
 'ag': {'sample_sets': ['3.0'],
  'contigs': ['2R', '3R', 'X'],
  'h12_calibration_contig': '3L'}}

In [4]:
# Both cache locations live under the project root and are passed to the
# client as POSIX-style path strings.
gcs_cache_dir = (here() / "gcs_cache").as_posix()
results_cache_dir = (here() / "malariagen_data_cache").as_posix()

# TODO in production build, remove use of simplecache if running inside google cloud
ag3 = malariagen_data.Ag3(
    url="simplecache::gs://vo_agam_release",
    simplecache={"cache_storage": gcs_cache_dir},
    results_cache=results_cache_dir,
)
ag3

MalariaGEN Ag3 API client,MalariaGEN Ag3 API client
"Please note that data are subject to terms of use,  for more information see the MalariaGEN website or contact data@malariagen.net.  See also the Ag3 API docs.","Please note that data are subject to terms of use,  for more information see the MalariaGEN website or contact data@malariagen.net.  See also the Ag3 API docs..1"
Storage URL,simplecache::gs://vo_agam_release
Data releases available,3.0
Results cache,/home/aliman/github/anopheles-genomics-surveillance/selection-atlas/malariagen_data_cache
Cohorts analysis,20220608
Species analysis,aim_20220528
Site filters analysis,dt_20200416
Software version,malariagen_data 7.3.0
Client location,"England, GB"


In [5]:
# Load the cohort table from build/cohorts.csv and index it by cohort_id so
# that a single cohort can be looked up by its identifier.
cohorts_path = here() / "build" / "cohorts.csv"
df_cohorts = pd.read_csv(cohorts_path).set_index("cohort_id")
df_cohorts.head()

Unnamed: 0_level_0,cohort_size,country,admin1_iso,admin1_name,admin2_name,taxon,year,cohort_label,sample_query
cohort_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AO-LUA_Luanda_colu_2009,77,Angola,AO-LUA,Luanda,Luanda,coluzzii,2009,Angola / Luanda / coluzzii / 2009,cohort_admin2_year == 'AO-LUA_Luanda_colu_2009...
BF-09_Houet_colu_2012,78,Burkina Faso,BF-09,Hauts-Bassins,Houet,coluzzii,2012,Burkina Faso / Houet / coluzzii / 2012,cohort_admin2_year == 'BF-09_Houet_colu_2012' ...
BF-09_Houet_colu_2014,32,Burkina Faso,BF-09,Hauts-Bassins,Houet,coluzzii,2014,Burkina Faso / Houet / coluzzii / 2014,cohort_admin2_year == 'BF-09_Houet_colu_2014' ...
BF-09_Houet_gamb_2012,73,Burkina Faso,BF-09,Hauts-Bassins,Houet,gambiae,2012,Burkina Faso / Houet / gambiae / 2012,cohort_admin2_year == 'BF-09_Houet_gamb_2012' ...
BF-09_Houet_gamb_2014,41,Burkina Faso,BF-09,Hauts-Bassins,Houet,gambiae,2014,Burkina Faso / Houet / gambiae / 2014,cohort_admin2_year == 'BF-09_Houet_gamb_2014' ...


In [6]:
# Select the metadata row (all columns) for the cohort being calibrated.
cohort = df_cohorts.loc[cohort_id, :]
cohort

cohort_size                                                    77
country                                                    Angola
admin1_iso                                                 AO-LUA
admin1_name                                                Luanda
admin2_name                                                Luanda
taxon                                                    coluzzii
year                                                         2009
cohort_label                    Angola / Luanda / coluzzii / 2009
sample_query    cohort_admin2_year == 'AO-LUA_Luanda_colu_2009...
Name: AO-LUA_Luanda_colu_2009, dtype: object

In [7]:
# Show the cohort's taxon; the phasing analysis is chosen from this value.
cohort["taxon"]

'coluzzii'

In [8]:
# Cohorts whose taxon is 'arabiensis' use the 'arab' phasing analysis; every
# other taxon uses 'gamb_colu'.
phasing_analysis = 'arab' if cohort.taxon == 'arabiensis' else 'gamb_colu'
phasing_analysis

'gamb_colu'

In [9]:
# Cohorts larger than max_cohort_size are downsampled to that size for
# computational efficiency; otherwise None is used, meaning no downsampling.
max_cohort_size = params['max_cohort_size']
cohort_size = max_cohort_size if cohort.cohort_size > max_cohort_size else None
cohort_size

50

In [10]:
# Candidate H12 window sizes to calibrate over, listed in ascending order.
# The selection step walks them in this order and keeps the first that passes.
window_sizes = (
    100, 200, 500,
    1000, 2000, 5000,
    10000, 20000,
)

In [11]:
# Plot the H12 calibration for this cohort across every candidate window
# size, using the calibration contig and sample sets from the workflow params.
calibration_contig = params['ag']['h12_calibration_contig']
sample_sets = params['ag']['sample_sets']
ag3.plot_h12_calibration(
    contig=calibration_contig,
    analysis=phasing_analysis,
    sample_sets=sample_sets,
    sample_query=cohort.sample_query,
    cohort_size=cohort_size,
    window_sizes=window_sizes,
)

Load sample metadata:   0%|          | 0/28 [00:00<?, ?it/s]

Load haplotypes:   0%|          | 0/176 [00:00<?, ?it/s]

Compute H12:   0%|          | 0/8 [00:00<?, ?it/s]

In [13]:
# Fetch the H12 calibration results for the same inputs as the plot. The
# result is later indexed by window size converted to a string.
calibration_contig = params['ag']['h12_calibration_contig']
sample_sets = params['ag']['sample_sets']
calibration_runs = ag3.h12_calibration(
    contig=calibration_contig,
    analysis=phasing_analysis,
    sample_sets=sample_sets,
    sample_query=cohort.sample_query,
    cohort_size=cohort_size,
    window_sizes=window_sizes,
)
calibration_runs

<numpy.lib.npyio.NpzFile at 0x7f92b6128850>

In [14]:
# Choose the first window size, in window_sizes order, whose 95th-percentile
# calibration value is below 0.1. The generator is lazy, so later window sizes
# are not evaluated once a match is found. If no candidate qualifies the
# result stays None.
selected_window_size = next(
    (
        size
        for size in window_sizes
        if np.percentile(calibration_runs[str(size)], 95) < 0.1
    ),
    None,
)
selected_window_size

5000

In [15]:
# Ensure the calibration output directory exists, creating any missing parent
# directories; an already-existing directory is not an error.
outdir = here() / "build" / "h12-calibration"
outdir.mkdir(parents=True, exist_ok=True)

In [16]:
# Write the selected window size to <cohort_id>.yaml inside the output
# directory. The value is None when no window size was selected.
output = dict(h12_window_size=selected_window_size)
output_path = outdir / f"{cohort_id}.yaml"
with output_path.open(mode="w") as output_file:
    yaml.safe_dump(output, output_file)
    