In [None]:
# Notebook parameters. Values here are for development only and
# will be overridden when running via snakemake and papermill.

# Path to the atlas YAML configuration file. It is handed to AtlasSetup
# when the atlas setup is initialised.
config_file = "../../../config/agam.yaml"
# Alternative development configuration; swap with the line above to
# develop against it instead.
# config_file = "../../../config/afun.yaml"

In [None]:
from bokeh.io import output_notebook
import malariagen_data
from IPython.display import Markdown
from selection_atlas.setup import AtlasSetup
from selection_atlas.page_utils import AtlasPageUtils

# Initialise the atlas setup from the configuration file named by the
# `config_file` notebook parameter.
setup = AtlasSetup(config_file)
# Page helper bound to this setup. Later cells read the cohorts table
# from it and use it to render the styled tables.
page_utils = AtlasPageUtils(setup=setup)

# N.B., do not add the "remove-output" tag to this cell!!! If you do,
# the bokeh javascript libraries will not get loaded in the generated
# HTML page. The call to output_notebook() injects javascript in the
# cell output which triggers the bokeh javascript libraries to be loaded
# in the page.
output_notebook(hide_banner=True)

# Methods

## Data sources

In [None]:
# Sample metadata obtained from the atlas setup. The summary text below
# reports len(df_samples) as the number of mosquitoes, so this presumably
# holds one row per sample — TODO confirm.
df_samples = setup.sample_metadata()
# Distinct values of the "country" column; only their count is used below.
# NOTE(review): assuming df_samples is a pandas DataFrame, .unique() keeps a
# missing value as its own entry, which would inflate the country count —
# confirm that "country" is never null.
countries = df_samples["country"].unique()

In [None]:
# Summary paragraph for the data sources section. The doubled braces in
# `{{term}}` produce a literal `{term}` in the rendered text (presumably a
# glossary role for the site builder — confirm), and the `:,` format spec
# adds thousands separators to the sample count.
Markdown(f"""
This report analyses genome variation data from the 
{{term}}`Malaria Vector Genome Observatory`. See Table 1 below for a 
complete list of the sample sets used in the current analysis version, 
with information about the corresponding contributors, data releases 
and citations. These sample sets provide data for a total of
{len(df_samples):,} mosquitoes sampled from {len(countries)} countries. 
""")

In [None]:
# Render Table 1 (the data sources referred to in the paragraph above)
# from the sample metadata, using the page helper's styling routine.
page_utils.style_data_sources(
    df_samples=df_samples,
    caption="Table 1. Data sources included in the current analysis version.",
)

In [None]:
# Paragraph describing how the data were retrieved. The version string is
# read from the installed malariagen_data package at execution time, so the
# rendered text always reflects the environment the notebook was run in.
Markdown(f"""
Sample metadata, unphased SNP calls, and phased SNP haplotypes were retrieved from 
the {{term}}`Malaria Vector Genome Observatory` cloud data repository hosted in 
Google Cloud Storage (GCS) via the {{term}}`MalariaGEN Python API` version 
{malariagen_data.__version__}.
""")

## Sample inclusion and grouping into cohorts

Samples were considered for inclusion if they met the following criteria:

In [None]:
def human_readable_list(x, conjunction="or"):
    """Join items into a human-readable phrase such as "a, b or c".

    Parameters
    ----------
    x : iterable of str
        Items to join. Any iterable is accepted (list, tuple, array,
        generator, ...).
    conjunction : str, default "or"
        Word placed between the last two items.

    Returns
    -------
    str
        An empty string if there are no items, the sole item if there is
        exactly one, otherwise all items separated by ", " with the
        conjunction before the final item.
    """
    # Materialise once so that non-sliceable iterables are supported too.
    items = list(x)
    if not items:
        # Previously an empty input raised IndexError; an empty phrase is
        # the natural result and keeps the page rendering.
        return ""
    if len(items) == 1:
        return items[0]
    return ", ".join(items[:-1]) + f" {conjunction} " + items[-1]


# Taxa taken from the atlas setup, joined into a single phrase for use in
# the inclusion-criteria text rendered below.
readable_taxa = human_readable_list(setup.taxa)

In [None]:
# Bullet list of sample inclusion criteria; the taxon phrase is built from
# the atlas setup's taxa in the preceding cell.
Markdown(f"""
* Gender assigned as female via comparison of sequence coverage on autosomes and sex 
  chromosomes. 
* Taxon assigned as {readable_taxa} via principal components analysis of 
  genomic data from Chromosome 3 and comparison with reference samples 
  with known taxon assignments.
""")

In [None]:
# Cohorts table provided by the page helper. Its length is reported in the
# text below and it is rendered as Table 2. (The "gdf" prefix suggests a
# GeoDataFrame — TODO confirm.)
gdf_cohorts = page_utils.gdf_cohorts

After filtering according to these inclusion criteria, samples were grouped into cohorts by taxon, location of sampling and date of sampling. Samples were grouped spatially if their collection locations were within the same level 2 administrative unit, according to geoBoundaries version 5.0.0. Samples were grouped temporally if their collection dates were within the same quarter (3-month period) where possible, except in a small number of cases where the metadata recorded only the year of collection.

In [None]:
# Paragraph describing the cohort-level filters. The size thresholds are
# read from the atlas setup and the retained-cohort count is the length of
# the cohorts table, so the text tracks the configuration actually used.
Markdown(f"""
Cohorts were excluded from the analysis if the sample size was 
less than {setup.min_cohort_size}. Cohorts with more than 
{setup.max_cohort_size} samples were randomly downsampled for
computational efficiency. Cohorts were also excluded from the 
analysis if they failed H12 or G123 window size calibration
(see below). After applying these filters, a total of 
{len(gdf_cohorts)} cohorts were retained for analysis (Table 2).
""")

In [None]:
# Render Table 2 from the cohorts table using the page helper's styling
# routine. An empty url_prefix is passed here; presumably it is prepended
# to links generated in the table — TODO confirm against AtlasPageUtils.
page_utils.style_cohorts_table(
    gdf_cohorts,
    caption="Table 2. Cohorts selected for genome-wide selection scan analyses.",
    url_prefix="",
)

## H12 and G123 window size calibration

TODO: describe how window-size calibration was done.

TODO: explain that, after calibration, some cohorts are removed if a suitable window size cannot be found.

## H12 genome-wide selection scans

TODO

## G123 genome-wide selection scans

TODO

## IHS genome-wide selection scans

TODO

## Automated detection of selection signals

TODO

## Identification of selection alerts

TODO

## Web report generation

TODO