# G123 genome-wide selection scans

In [None]:
# Notebook parameters. Values here are for development only and
# will be overridden when running via snakemake and papermill.

# General setup parameters. These are all forwarded to AtlasSetup
# in the setup cell below.
atlas_id = "ag"
analysis_version = "dev"
cohorts_analysis = "20240924"
# Contigs to scan; the GWSS cell runs once per entry in this list.
contigs = ["3L"]
dask_scheduler = "single-threaded"

# Other parameters.
# sample_sets and the two cohort size bounds are passed straight
# through to plot_g123_gwss.
sample_sets = "AG1000G-BF-A"
# NOTE: placeholder only; this name is reassigned further down from the
# selected cohort's sample_query field before the scan is run.
sample_query = None
min_cohort_size = 20
max_cohort_size = 50
# Identifier of the cohort to analyse; used to locate the G123
# calibration file and to select the cohort's row in the cohorts table.
cohort_id = "BF-09_Houet_colu_2012_Q3"

## Setup

In [None]:
import yaml
import pandas as pd
from selection_atlas.setup import AtlasSetup

# Collect the general setup parameters and hand them to AtlasSetup,
# which provides the file locations and API handle used below.
atlas_setup_kwargs = {
    "atlas_id": atlas_id,
    "analysis_version": analysis_version,
    "cohorts_analysis": cohorts_analysis,
    "contigs": contigs,
    "dask_scheduler": dask_scheduler,
}
setup = AtlasSetup(**atlas_setup_kwargs)

In [None]:
# Display the sample_sets value in use for this run.
sample_sets

In [None]:
# Load the calibrated G123 window size for this cohort.
# The calibration path is a template containing a "{cohort}" placeholder,
# which is filled in with the cohort identifier.
g123_calibration_file = setup.g123_calibration_files.as_posix().format(cohort=cohort_id)
# Read with an explicit encoding so that parsing does not depend on the
# locale of the machine executing the notebook.
with open(g123_calibration_file, encoding="utf-8") as calibration_file:
    calibration_params = yaml.safe_load(calibration_file)
window_size = calibration_params["g123_window_size"]
window_size

In [None]:
# Read the final cohorts table, key it by cohort identifier, and pull out
# the record for the cohort being analysed (needed for its sample query).
df_cohorts = pd.read_csv(setup.final_cohorts_file)
df_cohorts = df_cohorts.set_index("cohort_id")
cohort = df_cohorts.loc[cohort_id]
cohort

In [None]:
# Replace the placeholder parameter with the query stored for this cohort.
sample_query = cohort["sample_query"]
sample_query

In [None]:
# Arabiensis cohorts use the "arab" identifier; every other taxon uses
# "gamb_colu". The same identifier serves as both the site mask and the
# sites argument for the scan.
site_mask = "arab" if cohort.taxon == "arabiensis" else "gamb_colu"
sites = site_mask
site_mask

## Run GWSS

In [None]:
# Arguments that are identical for every contig.
gwss_kwargs = {
    "window_size": window_size,
    "sites": sites,
    "site_mask": site_mask,
    "sample_sets": sample_sets,
    "sample_query": sample_query,
    "min_cohort_size": min_cohort_size,
    "max_cohort_size": max_cohort_size,
}

# Run the G123 scan once per requested contig.
for contig in contigs:
    print(f"running {contig}")
    setup.malariagen_api.plot_g123_gwss(contig=contig, **gwss_kwargs)

N.B. The results of the selection scans are saved automatically to the malariagen_data results cache.