# H12 and G123 window size calibration

In [None]:
# Notebook parameters. Values here are for development only and
# will be overridden when running via snakemake and papermill.

# Path to the atlas configuration file; passed to AtlasSetup below.
config_file = "../../../config/agam.yaml"
# Identifier of the cohort to calibrate; used to look up the cohort's
# row in the cohorts table and to name the output calibration file.
cohort_id = "BF-09_Houet_colu_2012_Q3"

## Setup

In [None]:
import pandas as pd
import numpy as np
import yaml
from selection_atlas.setup import AtlasSetup

# Initialise the atlas setup from the configuration file. The resulting
# `setup` object supplies the file paths, the API handle and the
# calibration settings read by the rest of this notebook.
setup = AtlasSetup(config_file)

In [None]:
# Load the cohorts table, then key it by cohort identifier so that a
# single cohort can be looked up by label. Preview the first rows.
df_cohorts = pd.read_csv(setup.cohorts_file)
df_cohorts = df_cohorts.set_index("cohort_id")
df_cohorts.head()

In [None]:
# Select the row of the cohorts table for the cohort being calibrated
# and display it.
cohort = df_cohorts.loc[cohort_id]
cohort

In [None]:
# Display the cohort's taxon. The taxon is the key used below to look up
# both the phasing analysis and the site mask for this cohort.
cohort.taxon

In [None]:
# Sample query for this cohort, taken from the cohorts table. It is
# passed as `sample_query` to the calibration calls below.
cohort_query = cohort.sample_query
cohort_query

In [None]:
# Look up the phasing analysis configured for this cohort's taxon.
phasing_analysis = setup.taxon_phasing_analysis[cohort.taxon]
phasing_analysis

In [None]:
# Look up the site mask configured for this cohort's taxon.
site_mask = setup.taxon_site_mask[cohort.taxon]
site_mask

## H12 calibration

In [None]:
# Plot the H12 calibration for this cohort over the configured candidate
# window sizes. The arguments are gathered first so the call itself stays
# short; the trailing semicolon stops the notebook from echoing the
# call's return value.
h12_plot_kwargs = {
    "contig": setup.h12_calibration_contig,
    "analysis": phasing_analysis,
    "sample_sets": setup.sample_sets,
    "sample_query": cohort_query,
    "min_cohort_size": setup.min_cohort_size,
    "max_cohort_size": setup.max_cohort_size,
    "window_sizes": setup.h12_calibration_window_sizes,
}
setup.malariagen_api.plot_h12_calibration(**h12_plot_kwargs);

In [None]:
# Run the H12 calibration for this cohort over the configured candidate
# window sizes and keep the results for the window size selection step
# below, which indexes them by the string form of each window size.
h12_run_kwargs = {
    "contig": setup.h12_calibration_contig,
    "analysis": phasing_analysis,
    "sample_sets": setup.sample_sets,
    "sample_query": cohort_query,
    "min_cohort_size": setup.min_cohort_size,
    "max_cohort_size": setup.max_cohort_size,
    "window_sizes": setup.h12_calibration_window_sizes,
}
h12_calibration_runs = setup.malariagen_api.h12_calibration(**h12_run_kwargs)
h12_calibration_runs

In [None]:
# Walk the candidate window sizes in their configured order and keep the
# first one whose 95th percentile of calibration values is strictly below
# the configured H12 threshold. The result is None when no candidate
# qualifies.
h12_selected_window_size = next(
    (
        candidate
        for candidate in setup.h12_calibration_window_sizes
        if np.percentile(h12_calibration_runs[str(candidate)], 95)
        < setup.h12_calibration_threshold
    ),
    None,
)
h12_selected_window_size

## G123 calibration

In [None]:
# Plot the G123 calibration for this cohort over the configured candidate
# window sizes. The arguments are gathered first so the call itself stays
# short; the trailing semicolon stops the notebook from echoing the
# call's return value.
g123_plot_kwargs = {
    "contig": setup.g123_calibration_contig,
    "site_mask": site_mask,
    "sites": phasing_analysis,
    "sample_sets": setup.sample_sets,
    "sample_query": cohort_query,
    "min_cohort_size": setup.min_cohort_size,
    "max_cohort_size": setup.max_cohort_size,
    "window_sizes": setup.g123_calibration_window_sizes,
}
setup.malariagen_api.plot_g123_calibration(**g123_plot_kwargs);

In [None]:
# Run the G123 calibration for this cohort over the configured candidate
# window sizes and keep the results for the window size selection step
# below, which indexes them by the string form of each window size.
g123_run_kwargs = {
    "contig": setup.g123_calibration_contig,
    "site_mask": site_mask,
    "sites": phasing_analysis,
    "sample_sets": setup.sample_sets,
    "sample_query": cohort_query,
    "min_cohort_size": setup.min_cohort_size,
    "max_cohort_size": setup.max_cohort_size,
    "window_sizes": setup.g123_calibration_window_sizes,
}
g123_calibration_runs = setup.malariagen_api.g123_calibration(**g123_run_kwargs)
g123_calibration_runs

In [None]:
# Walk the candidate window sizes in their configured order and keep the
# first one whose 95th percentile of calibration values is strictly below
# the configured G123 threshold. The result is None when no candidate
# qualifies.
g123_selected_window_size = next(
    (
        candidate
        for candidate in setup.g123_calibration_window_sizes
        if np.percentile(g123_calibration_runs[str(candidate)], 95)
        < setup.g123_calibration_threshold
    ),
    None,
)
g123_selected_window_size

## Write outputs

In [None]:
# Resolve the per-cohort output path by filling the cohort identifier
# into the configured calibration file template.
calibration_file = setup.calibration_files.as_posix().format(cohort=cohort_id)

# Record the window size chosen for each statistic. A value is None when
# no candidate window size passed the corresponding threshold.
output = {
    "h12_window_size": h12_selected_window_size,
    "g123_window_size": g123_selected_window_size,
}

# Serialise the selection as YAML for downstream steps.
with open(calibration_file, "w") as stream:
    yaml.safe_dump(output, stream)