# H12 genome-wide selection scans

## Notebook parameters

In [1]:
# Cohort to scan; used to locate the calibration file and the cohorts-table row.
cohort_id = "BF-09_Houet_colu_2012_Q3"
# Version of the cohorts analysis handed to the Ag3 client.
cohorts_analysis = "20230223"
# Sample sets forwarded to the H12 scan.
sample_sets = "AG1000G-BF-A"
# Contigs to scan; one scan is run per entry.
contigs = ["2R"]
# Cohorts larger than this are downsampled to this many samples.
max_cohort_size = 50

## Setup

In [3]:
import yaml
import pandas as pd
import malariagen_data
from pyprojroot import here
import numpy as np
import os

In [5]:
# Display the sample sets in effect for this run.
# NOTE(review): the output recorded below is a list, whereas the parameters
# cell assigns a plain string — presumably the value was overridden when the
# notebook was executed; confirm.
sample_sets

['AG1000G-BF-A']

In [6]:
# Local cache locations, both built under the path returned by here().
# TODO remove simplecache config in production
gcs_cache_dir = (here() / "gcs_cache").as_posix()
results_cache_dir = (here() / "malariagen_data_cache").as_posix()

ag3 = malariagen_data.Ag3(
    # TODO in production build, remove use of simplecache if running inside google cloud
    # url = "gs://vo_agam_release",
    url="simplecache::gs://vo_agam_release",
    # pin the version of the cohorts analysis for reproducibility
    cohorts_analysis=cohorts_analysis,
    simplecache={"cache_storage": gcs_cache_dir},
    results_cache=results_cache_dir,
)
ag3

MalariaGEN Ag3 API client,MalariaGEN Ag3 API client
"Please note that data are subject to terms of use,  for more information see the MalariaGEN website or contact data@malariagen.net.  See also the Ag3 API docs.","Please note that data are subject to terms of use,  for more information see the MalariaGEN website or contact data@malariagen.net.  See also the Ag3 API docs..1"
Storage URL,simplecache::gs://vo_agam_release
Data releases available,3.0
Results cache,/home/aliman/github/anopheles-genomics-surveillance/selection-atlas/malariagen_data_cache
Cohorts analysis,20230223
Species analysis,aim_20220528
Site filters analysis,dt_20200416
Software version,malariagen_data 7.3.0
Client location,"England, GB"


In [9]:
# Read the H12 window size for this cohort from its calibration YAML file.
calibration_dir = "build/h12-calibration"
calibration_path = here() / calibration_dir / f"{cohort_id}.yaml"
with open(calibration_path) as calibration_file:
    calibration_params = yaml.safe_load(calibration_file)

window_size = calibration_params["h12_window_size"]
window_size

1000

In [10]:
# Look up this cohort's row in the cohorts table; the sample query is read from it.
cohorts_path = here() / "build" / "final_cohorts.csv"
df_cohorts = (
    pd.read_csv(cohorts_path)
    .set_index("cohort_id")
)
cohort = df_cohorts.loc[cohort_id]
cohort

cohort_size                                                    78
country                                              Burkina Faso
admin1_iso                                                  BF-09
admin1_name                                         Hauts-Bassins
admin2_name                                                 Houet
taxon                                                    coluzzii
year                                                         2012
quarter                                                         3
cohort_label          Burkina Faso / Houet / coluzzii / 2012 / Q3
sample_query    cohort_admin2_quarter == 'BF-09_Houet_colu_201...
Name: BF-09_Houet_colu_2012_Q3, dtype: object

In [11]:
# Query string stored with the cohort record; passed to the scan as `sample_query`.
sample_query = cohort["sample_query"]
sample_query

"cohort_admin2_quarter == 'BF-09_Houet_colu_2012_Q3' and sex_call == 'F'"

In [12]:
# Pick the phasing analysis from the cohort's taxon:
# 'arab' for arabiensis, 'gamb_colu' for everything else.
phasing_analysis = 'arab' if cohort.taxon == 'arabiensis' else 'gamb_colu'
phasing_analysis

'gamb_colu'

In [14]:
# Downsample to max_cohort_size for computational efficiency when the cohort
# is larger than that; otherwise None, meaning no downsampling.
cohort_size = max_cohort_size if cohort.cohort_size > max_cohort_size else None
cohort_size

50

## Run GWSS

In [16]:
# Settings shared by every scan; only the contig changes between calls.
gwss_params = dict(
    window_size=window_size,
    analysis=phasing_analysis,
    sample_sets=sample_sets,
    sample_query=sample_query,
    cohort_size=cohort_size,
)

# Run and plot one H12 scan per requested contig.
for contig in contigs:
    ag3.plot_h12_gwss(contig=contig, **gwss_params)

N.B., results of the selection scans will be automatically saved into the malariagen_data results cache.