# H12 window size calibration

In [1]:
# Notebook parameters. Values here are for development only and
# will be overridden when running via snakemake and papermill.

# ID of the cohort to calibrate: used to look up the cohort's row in
# cohorts.csv and to name the output YAML file.
cohort_id = "BF-09_Houet_colu_2012_Q3"
# Version of the cohorts analysis passed to malariagen_data.Ag3.
cohorts_analysis = "20230223"
# NOTE(review): not referenced by any other cell visible in this notebook.
contigs = ["2L"]
# Sample sets forwarded to the H12 calibration calls.
sample_sets = "3.0"
# Cohort size bounds forwarded to the H12 calibration calls.
min_cohort_size = 20
max_cohort_size = 50
# Contig on which the H12 calibration is run.
h12_calibration_contig = "3L"
# Scheduler name handed to dask.config.set.
dask_scheduler = "threads"
# Path component selecting the results/<analysis_version>/ directory that
# is read from and written to.
analysis_version = "dev"

## Setup

In [None]:
import yaml
import pandas as pd
import malariagen_data
from pyprojroot import here
import numpy as np
import os
import dask

# Use the scheduler named by the notebook parameter for dask computations.
# The trailing semicolon keeps Jupyter from echoing the return value.
dask.config.set(scheduler=dask_scheduler);

In [None]:
# Echo the sample_sets parameter so the executed notebook records its value.
sample_sets

In [None]:
# Keep the malariagen_data results cache under the project's results directory.
results_cache_dir = here() / "results" / "malariagen_data_cache"
ag3 = malariagen_data.Ag3(
    results_cache=results_cache_dir.as_posix(),
    # pinned so that reruns use the same version of the cohorts analysis
    cohorts_analysis=cohorts_analysis,
)
ag3

In [None]:
# Load the cohorts table for this analysis version and key it by cohort ID.
cohorts_path = here() / "results" / analysis_version / "analysis" / "cohorts.csv"
df_cohorts = pd.read_csv(cohorts_path)
df_cohorts = df_cohorts.set_index("cohort_id")
df_cohorts.head()

In [None]:
# Select the row for the cohort being analysed; a KeyError here means
# cohort_id is not present in the cohorts table.
cohort = df_cohorts.loc[cohort_id]
cohort

In [None]:
# Show the cohort's taxon, which decides the phasing analysis to use.
cohort.taxon

In [None]:
# Sample query recorded for this cohort in the cohorts table; passed as
# sample_query to the H12 calibration calls.
sample_query = cohort.sample_query
sample_query

In [None]:
# Only arabiensis cohorts use the "arab" phasing analysis; every other
# taxon uses "gamb_colu".
phasing_analysis = "arab" if cohort.taxon == "arabiensis" else "gamb_colu"
phasing_analysis

In [None]:
# Alias the calibration contig parameter under a shorter name and display it.
contig = h12_calibration_contig
contig

In [None]:
# Candidate window sizes to evaluate, listed in ascending order. The
# selection step later in the notebook walks them in this order and keeps
# the first one that meets its threshold.
window_sizes = (100, 200, 500, 1000, 2000, 5000, 10000, 20000)
window_sizes

## Run calibration

In [None]:
# Plot the H12 calibration for this cohort on the calibration contig,
# covering every candidate window size.
calibration_plot_params = {
    "contig": h12_calibration_contig,
    "analysis": phasing_analysis,
    "sample_sets": sample_sets,
    "sample_query": sample_query,
    "min_cohort_size": min_cohort_size,
    "max_cohort_size": max_cohort_size,
    "window_sizes": window_sizes,
}
# The trailing semicolon keeps Jupyter from echoing the return value.
ag3.plot_h12_calibration(**calibration_plot_params);

In [None]:
# Compute the calibration data itself, using the same settings as the
# plot. The result is indexed by window size rendered as a string (see the
# selection step, which looks up str(window_size)).
calibration_runs = ag3.h12_calibration(
    # which data to analyse
    contig=h12_calibration_contig,
    analysis=phasing_analysis,
    # which samples make up the cohort
    sample_sets=sample_sets,
    sample_query=sample_query,
    min_cohort_size=min_cohort_size,
    max_cohort_size=max_cohort_size,
    # which window sizes to try
    window_sizes=window_sizes,
)
calibration_runs

In [None]:
# Walk the candidate window sizes in order and keep the first one whose
# calibration values have a 95th percentile below 0.1.
# NOTE: if no candidate qualifies, selected_window_size stays None.
selected_window_size = None
for window_size in window_sizes:
    x95 = np.percentile(calibration_runs[str(window_size)], 95)
    # Written as "not <" so that a NaN percentile is skipped as well.
    if not x95 < 0.1:
        continue
    selected_window_size = window_size
    break
selected_window_size

## Write outputs

In [None]:
# Resolve the output directory against the project root, the same way the
# cohorts table and the results cache are located, so the output location
# does not depend on the kernel's current working directory.
outdir = here() / "results" / analysis_version / "analysis" / "h12-calibration"
# exist_ok=True makes reruns of the notebook safe.
os.makedirs(outdir, exist_ok=True)

In [None]:
# Persist the chosen window size as a YAML file named after the cohort.
output_path = os.path.join(outdir, f"{cohort_id}.yaml")
with open(output_path, mode="w") as output_file:
    yaml.safe_dump({"h12_window_size": selected_window_size}, output_file)