# H12 genome-wide selection scans

In [None]:
# Notebook parameters. Values here are for development only and
# will be overridden when running via snakemake and papermill.
# Identifier of the cohort to scan; it selects both the calibration
# YAML file and the row of final_cohorts.csv that are read below.
cohort_id = "BF-09_Houet_gamb_2012_Q3"
# Version of the cohorts analysis, passed to malariagen_data.Ag3.
cohorts_analysis = "20230223"
# Contigs to scan; the GWSS is run once per entry.
contigs = ["3L"]
# Sample sets selector, forwarded unchanged to ag3.plot_h12_gwss.
sample_sets = "3.0"
# Cohort size bounds, forwarded unchanged to ag3.plot_h12_gwss.
min_cohort_size = 20
max_cohort_size = 50
# Scheduler name handed to dask.config.set.
dask_scheduler = "threads"
# Name of the results/<analysis_version>/ directory inputs are read from.
analysis_version = "dev"

## Setup

In [None]:
import yaml
import pandas as pd
import malariagen_data
from pyprojroot import here
import dask

# Apply the scheduler named by the dask_scheduler parameter.
# The trailing semicolon stops the notebook from echoing the return value.
dask.config.set(scheduler=dask_scheduler);

In [None]:
# Echo the sample_sets parameter so its value appears in the notebook output.
sample_sets

In [None]:
# directory under the project root where malariagen_data caches results
results_cache_dir = here() / "results" / "malariagen_data_cache"

# set up the Ag3 API; the cohorts analysis version is pinned for
# reproducibility
ag3 = malariagen_data.Ag3(
    cohorts_analysis=cohorts_analysis,
    results_cache=results_cache_dir.as_posix(),
)
ag3

In [None]:
# load the H12 window size recorded in this cohort's calibration file
calibration_dir = f"results/{analysis_version}/analysis/h12-calibration"
calibration_path = here() / calibration_dir / f"{cohort_id}.yaml"
# read as UTF-8 explicitly rather than relying on the platform's default
# text encoding, which differs between machines
with open(calibration_path, encoding="utf-8") as calibration_file:
    calibration_params = yaml.safe_load(calibration_file)
window_size = calibration_params["h12_window_size"]
window_size

In [None]:
# read the table of final cohorts, keyed by cohort_id, and pick out the
# row for the cohort being analysed (its sample query is used further down)
cohorts_csv = here() / "results" / analysis_version / "analysis" / "final_cohorts.csv"
df_cohorts = pd.read_csv(cohorts_csv)
df_cohorts = df_cohorts.set_index("cohort_id")
cohort = df_cohorts.loc[cohort_id]
cohort

In [None]:
# query string selecting this cohort's samples, taken from the cohort row
sample_query = cohort["sample_query"]
sample_query

In [None]:
# choose the phasing analysis identifier from the cohort's taxon:
# "arab" for arabiensis, "gamb_colu" for every other taxon
phasing_analysis = "arab" if cohort.taxon == "arabiensis" else "gamb_colu"
phasing_analysis

## Run GWSS

In [None]:
# arguments that are identical for every contig
gwss_params = dict(
    window_size=window_size,
    analysis=phasing_analysis,
    sample_sets=sample_sets,
    sample_query=sample_query,
    min_cohort_size=min_cohort_size,
    max_cohort_size=max_cohort_size,
)

# run the H12 scan for each requested contig in turn
for contig in contigs:
    print(f"running {contig}")
    ag3.plot_h12_gwss(contig=contig, **gwss_params)

N.B. The results of the selection scans are saved automatically into the malariagen_data results cache (the `results_cache` directory configured above).