# G123 window size calibration

In [1]:
# Notebook parameters. The values below are development defaults only;
# they are overridden when the notebook runs via snakemake and papermill.

# Cohort selection.
cohort_id = "BF-09_Houet_colu_2012_Q3"
cohorts_analysis = "20230223"
sample_sets = "3.0"
min_cohort_size = 20
max_cohort_size = 50

# Genome regions. The calibration contig parameter carries an "h12" prefix
# but is the one passed to the G123 calibration calls in this notebook.
contigs = ["2L"]
h12_calibration_contig = "3L"

# Infrastructure.
use_gcs_cache = False
dask_scheduler = "threads"

## Setup

In [2]:
import yaml
import pandas as pd
import malariagen_data
from pyprojroot import here
import numpy as np
import os
import dask
# Configure dask with the scheduler named by the dask_scheduler parameter.
# The trailing semicolon keeps the call's return value out of the cell output.
dask.config.set(scheduler=dask_scheduler);

In [3]:
# Echo the sample sets parameter so the value in effect is recorded in the
# executed notebook.
sample_sets

'3.0'

In [4]:
# Optional keyword arguments for the Ag3 client. These stay empty unless
# use_gcs_cache is switched on in the parameters cell.
extra_params = {}
if use_gcs_cache:
    gcs_cache_dir = (here() / "gcs_cache").as_posix()
    extra_params.update(
        url="simplecache::gs://vo_agam_release",
        simplecache={"cache_storage": gcs_cache_dir},
    )

# Both cache directories are resolved relative to the project root.
results_cache_dir = (here() / "malariagen_data_cache").as_posix()

ag3 = malariagen_data.Ag3(
    # pinning the cohorts analysis version keeps runs reproducible
    cohorts_analysis=cohorts_analysis,
    results_cache=results_cache_dir,
    **extra_params,
)
ag3

MalariaGEN Ag3 API client,MalariaGEN Ag3 API client
"Please note that data are subject to terms of use,  for more information see the MalariaGEN website or contact data@malariagen.net.  See also the Ag3 API docs.","Please note that data are subject to terms of use,  for more information see the MalariaGEN website or contact data@malariagen.net.  See also the Ag3 API docs..1"
Storage URL,gs://vo_agam_release/
Data releases available,3.0
Results cache,/home/sanj/projects/selection-atlas/malariagen_data_cache
Cohorts analysis,20230223
Species analysis,aim_20220528
Site filters analysis,dt_20200416
Software version,malariagen_data 7.5.0
Client location,unknown


In [5]:
# Load the cohort table from the build directory and index it by cohort ID
# so that individual cohorts can be looked up by label.
cohorts_path = here() / "build" / "cohorts.csv"
cohorts_table = pd.read_csv(cohorts_path)
df_cohorts = cohorts_table.set_index("cohort_id")
df_cohorts.head()

Unnamed: 0_level_0,cohort_size,country,admin1_iso,admin1_name,admin2_name,taxon,year,quarter,cohort_label,sample_query,latitude,longitude
cohort_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ML-2_Kati_colu_2014_Q3,27,Mali,ML-2,Koulikouro,Kati,coluzzii,2014,3,Mali / Kati / coluzzii / 2014 / Q3,cohort_admin2_quarter == 'ML-2_Kati_colu_2014_...,12.875556,-8.137778
ML-2_Kati_gamb_2014_Q3,24,Mali,ML-2,Koulikouro,Kati,gambiae,2014,3,Mali / Kati / gambiae / 2014 / Q3,cohort_admin2_quarter == 'ML-2_Kati_gamb_2014_...,12.888788,-8.149091


In [6]:
# Look up the metadata row for the requested cohort.
# A missing cohort would otherwise surface as a bare KeyError carrying only
# the label; raise the same exception type with a message that says where
# the lookup failed and shows a few valid identifiers.
if cohort_id not in df_cohorts.index:
    raise KeyError(
        f"cohort_id {cohort_id!r} not found in build/cohorts.csv "
        f"({len(df_cohorts)} cohorts available, e.g. "
        f"{df_cohorts.index[:5].tolist()}); check the cohort_id parameter "
        f"or rebuild the cohorts table"
    )
cohort = df_cohorts.loc[cohort_id]
cohort

KeyError: 'BF-09_Houet_colu_2012_Q3'

In [None]:
# Show the cohort's taxon; it is what the site mask is chosen from in this
# notebook (no phasing analysis is referenced here).
cohort.taxon

In [None]:
# Sample query recorded for this cohort in the cohort table; it is passed
# on to the calibration calls.
sample_query = cohort["sample_query"]
sample_query

In [None]:
# arabiensis cohorts use the "arab" site mask; every other taxon falls back
# to "gamb_colu".
site_mask = "arab" if cohort.taxon == "arabiensis" else "gamb_colu"
site_mask

In [None]:
# The contig comes from the h12_calibration_contig parameter, which this
# G123 notebook reuses for its own calibration.
contig = h12_calibration_contig
contig

In [None]:
# Candidate window sizes to calibrate over, listed in increasing order.
window_sizes = (
    100,
    200,
    500,
    1000,
    2000,
    5000,
    10000,
    20000,
)
window_sizes

## Run calibration

In [None]:
# Gather the calibration arguments first, then plot the G123 calibration for
# every candidate window size. The trailing semicolon suppresses the call's
# return value in the cell output.
plot_kwargs = {
    "contig": h12_calibration_contig,
    "site_mask": site_mask,
    "sample_sets": sample_sets,
    "sample_query": sample_query,
    "min_cohort_size": min_cohort_size,
    "max_cohort_size": max_cohort_size,
    "window_sizes": window_sizes,
}
ag3.plot_g123_calibration(**plot_kwargs);

In [None]:
# Compute the calibration data with the same arguments used for the plot;
# the result is indexed below by window size (as a string).
calibration_kwargs = {
    "contig": h12_calibration_contig,
    "site_mask": site_mask,
    "sample_sets": sample_sets,
    "sample_query": sample_query,
    "min_cohort_size": min_cohort_size,
    "max_cohort_size": max_cohort_size,
    "window_sizes": window_sizes,
}
calibration_runs = ag3.g123_calibration(**calibration_kwargs)
calibration_runs

In [None]:
# Walk the candidate window sizes in order and keep the first one whose
# 95th percentile of calibration values is below 0.1. The generator is
# evaluated lazily, so later candidates are not examined once one
# qualifies. The result is None when no candidate meets the threshold.
selected_window_size = next(
    (
        candidate
        for candidate in window_sizes
        if np.percentile(calibration_runs[str(candidate)], 95) < 0.1
    ),
    None,
)
selected_window_size

## Write outputs

In [None]:
# Anchor the output directory at the project root, the same way the cohort
# table is located with here(), so results land in the project's build
# directory regardless of the working directory the notebook runs from.
# Kept as a string so it can be joined with os.path.join.
outdir = (here() / "build" / "g123-calibration").as_posix()
os.makedirs(outdir, exist_ok=True)

In [None]:
# Write the selected window size to a per-cohort YAML file in outdir.
# The value is whatever the selection step produced, including None.
output_path = os.path.join(outdir, f"{cohort_id}.yaml")
output = {"g123_window_size": selected_window_size}
with open(output_path, mode="w") as output_file:
    yaml.safe_dump(output, output_file)