# H12 window size calibration

## Notebook parameters

In [1]:
# Identifier of the cohort to calibrate; used to look up the cohort row and to
# name the output file.
# NOTE(review): this looks like a parameters cell whose value is overridden per
# cohort by the workflow runner — confirm.
cohort_id = 'BF-09_Houet_colu_2012_Q3'

## Setup

In [2]:
import yaml
import pandas as pd
import malariagen_data
from pyprojroot import here
import numpy as np
import os

In [3]:
# Read the workflow configuration and display it so the executed notebook
# records the settings it ran with.
config_path = "workflow/config.yaml"
with open(config_path) as config_file:
    workflow_config = yaml.safe_load(config_file)
workflow_config

{'min_cohort_size': 20,
 'max_cohort_size': 50,
 'ag': {'sample_sets': ['AG1000G-BF-A'],
  'contigs': ['2R'],
  'cohorts_analysis': '20230223',
  'h12_calibration_contig': '3L'}}

In [4]:
# Sample sets to analyse, taken from the "ag" section of the workflow config.
sample_sets = workflow_config["ag"]["sample_sets"]
sample_sets

['AG1000G-BF-A']

In [5]:
# Local cache locations, both rooted at the directory returned by `here()`.
gcs_cache_dir = (here() / "gcs_cache").as_posix()
results_cache_dir = (here() / "malariagen_data_cache").as_posix()

ag3 = malariagen_data.Ag3(
    # TODO in production build, remove use of simplecache if running inside google cloud
    # url = "gs://vo_agam_release",
    url="simplecache::gs://vo_agam_release",
    # pin the version of the cohorts analysis for reproducibility
    cohorts_analysis=workflow_config["ag"]["cohorts_analysis"],
    # TODO remove simplecache config in production
    simplecache={"cache_storage": gcs_cache_dir},
    results_cache=results_cache_dir,
)
ag3

MalariaGEN Ag3 API client,MalariaGEN Ag3 API client
"Please note that data are subject to terms of use,  for more information see the MalariaGEN website or contact data@malariagen.net.  See also the Ag3 API docs.","Please note that data are subject to terms of use,  for more information see the MalariaGEN website or contact data@malariagen.net.  See also the Ag3 API docs..1"
Storage URL,simplecache::gs://vo_agam_release
Data releases available,3.0
Results cache,/home/aliman/github/anopheles-genomics-surveillance/selection-atlas/malariagen_data_cache
Cohorts analysis,20230223
Species analysis,aim_20220528
Site filters analysis,dt_20200416
Software version,malariagen_data 7.3.0
Client location,"England, GB"


In [6]:
# Load the cohort table and key it by cohort identifier so that individual
# cohorts can be selected with `.loc`.
cohorts_table = pd.read_csv("build/cohorts.csv")
df_cohorts = cohorts_table.set_index("cohort_id")
df_cohorts.head()

Unnamed: 0_level_0,cohort_size,country,admin1_iso,admin1_name,admin2_name,taxon,year,quarter,cohort_label,sample_query
cohort_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AO-LUA_Luanda_colu_2009_Q2,77,Angola,AO-LUA,Luanda,Luanda,coluzzii,2009,2,Angola / Luanda / coluzzii / 2009 / Q2,cohort_admin2_quarter == 'AO-LUA_Luanda_colu_2...
BF-09_Houet_colu_2012_Q3,78,Burkina Faso,BF-09,Hauts-Bassins,Houet,coluzzii,2012,3,Burkina Faso / Houet / coluzzii / 2012 / Q3,cohort_admin2_quarter == 'BF-09_Houet_colu_201...
BF-09_Houet_colu_2014_Q3,32,Burkina Faso,BF-09,Hauts-Bassins,Houet,coluzzii,2014,3,Burkina Faso / Houet / coluzzii / 2014 / Q3,cohort_admin2_quarter == 'BF-09_Houet_colu_201...
BF-09_Houet_gamb_2012_Q3,73,Burkina Faso,BF-09,Hauts-Bassins,Houet,gambiae,2012,3,Burkina Faso / Houet / gambiae / 2012 / Q3,cohort_admin2_quarter == 'BF-09_Houet_gamb_201...
BF-09_Houet_gamb_2014_Q3,41,Burkina Faso,BF-09,Hauts-Bassins,Houet,gambiae,2014,3,Burkina Faso / Houet / gambiae / 2014 / Q3,cohort_admin2_quarter == 'BF-09_Houet_gamb_201...


In [7]:
# Select the metadata row for the cohort being calibrated.
cohort = df_cohorts.loc[cohort_id]
cohort

cohort_size                                                    78
country                                              Burkina Faso
admin1_iso                                                  BF-09
admin1_name                                         Hauts-Bassins
admin2_name                                                 Houet
taxon                                                    coluzzii
year                                                         2012
quarter                                                         3
cohort_label          Burkina Faso / Houet / coluzzii / 2012 / Q3
sample_query    cohort_admin2_quarter == 'BF-09_Houet_colu_201...
Name: BF-09_Houet_colu_2012_Q3, dtype: object

In [8]:
# Show the cohort's taxon; the phasing analysis is chosen from it in a later cell.
cohort.taxon

'coluzzii'

In [14]:
# Query expression identifying this cohort's samples, as recorded in the cohort
# table; passed as `sample_query` to the H12 calibration calls.
sample_query = cohort.sample_query
sample_query

"cohort_admin2_quarter == 'BF-09_Houet_colu_2012_Q3' and sex_call == 'F'"

In [9]:
# Arabiensis cohorts use the 'arab' phasing analysis; every other taxon uses
# 'gamb_colu'.
phasing_analysis = 'arab' if cohort.taxon == 'arabiensis' else 'gamb_colu'
phasing_analysis

'gamb_colu'

In [10]:
# Cohorts larger than the configured maximum are downsampled to that maximum
# for computational efficiency; None means no downsampling.
max_cohort_size = workflow_config['max_cohort_size']
cohort_size = max_cohort_size if cohort.cohort_size > max_cohort_size else None
cohort_size

50

In [11]:
# Contig on which the calibration is run, as set in the workflow config.
contig = workflow_config["ag"]["h12_calibration_contig"]
contig

'3L'

In [12]:
# Candidate H12 window sizes, in ascending order. The selection cell walks them
# in this order and keeps the first one that passes its threshold.
# NOTE(review): presumably measured in number of SNPs per window — confirm
# against the malariagen_data documentation.
window_sizes = (100, 200, 500, 1000, 2000, 5000, 10000, 20000)

## Run calibration

In [15]:
# Plot the H12 calibration for this cohort on the calibration contig, across
# every candidate window size, for visual inspection.
ag3.plot_h12_calibration(
    contig=contig,
    analysis=phasing_analysis,
    sample_sets=sample_sets,
    sample_query=sample_query,
    cohort_size=cohort_size,
    window_sizes=window_sizes,
)

Load sample metadata:   0%|          | 0/1 [00:00<?, ?it/s]

Load haplotypes:   0%|          | 0/264 [00:00<?, ?it/s]

Compute H12:   0%|          | 0/8 [00:00<?, ?it/s]

In [16]:
# Fetch the calibration data itself, using the same parameters as the plot.
# The result is indexed by the window size converted to a string (see the
# window-selection cell).
calibration_runs = ag3.h12_calibration(
    contig=contig,
    analysis=phasing_analysis,
    sample_sets=sample_sets,
    sample_query=sample_query,
    cohort_size=cohort_size,
    window_sizes=window_sizes,
)
calibration_runs

<numpy.lib.npyio.NpzFile at 0x7f85041433a0>

In [17]:
# Pick the first candidate window size (window_sizes is ascending, so the
# smallest) whose H12 values are low across most of the contig: the given
# percentile of its H12 values must fall below the threshold.
h12_percentile = 95
h12_threshold = 0.1

# Reset first so a value left over from an earlier run of this cell can never
# be mistaken for the current result.
selected_window_size = None
for window_size in window_sizes:
    h12_values = calibration_runs[str(window_size)]
    if np.percentile(h12_values, h12_percentile) < h12_threshold:
        selected_window_size = window_size
        break

if selected_window_size is None:
    # Fail here with a clear message rather than writing a null window size
    # to the output file for a later step to trip over.
    raise ValueError(
        f"No H12 window size in {window_sizes} has a "
        f"{h12_percentile}th-percentile H12 below {h12_threshold} "
        f"for cohort {cohort_id!r} on contig {contig!r}."
    )
selected_window_size

1000

## Write outputs

In [18]:
# Directory for the per-cohort calibration results; created if it does not
# already exist.
outdir = "build/h12-calibration"
os.makedirs(outdir, exist_ok=True)

In [19]:
# Write the selected window size to a YAML file named after the cohort.
output_path = os.path.join(outdir, f"{cohort_id}.yaml")
output = {"h12_window_size": selected_window_size}
with open(output_path, mode="w") as output_file:
    yaml.safe_dump(output, output_file)
    