In [None]:
# Notebook parameters. Values here are for development only and
# will be overridden when running via snakemake and papermill.
#
# config_file: path to the atlas configuration file; it is handed to
# AtlasSetup when the atlas setup is initialised.

config_file = "../../../config/agam.yaml"

In [None]:
from bokeh.io import output_notebook
from IPython.display import Markdown
from selection_atlas.setup import AtlasSetup
from selection_atlas.page_utils import AtlasPageUtils

# Initialise the atlas setup from the configuration file, then build the
# page utilities from that same setup object.
setup = AtlasSetup(config_file)
page_utils = AtlasPageUtils(setup=setup)

# N.B., do not add the "remove-output" tag to this cell!!! If you do,
# the bokeh javascript libraries will not get loaded in the generated
# HTML page. The call to output_notebook() injects javascript in the
# cell output which triggers the bokeh javascript libraries to be loaded
# in the page.
output_notebook(hide_banner=True)

# Data sources

In [None]:
# Load the sample metadata table from the atlas setup. Presumably one row
# per sampled mosquito (the summary text reports len(df_samples) as the
# number of mosquitoes) - TODO confirm.
df_samples = setup.sample_metadata()
# Show which metadata columns are available.
df_samples.columns

In [None]:
# Distinct values of the "country" column; len(countries) is reported in
# the summary text.
# NOTE(review): Series.unique() keeps NaN as a distinct value, so a sample
# with a missing country would inflate the count by one - confirm that the
# metadata never has a missing country.
countries = df_samples["country"].unique()

In [None]:
# Summary paragraph, rendered as Markdown in the cell output. The doubled
# braces in {{term}} are f-string escapes that emit a literal {term}
# (presumably a MyST glossary role - TODO confirm); the single-brace fields
# interpolate the atlas ID, analysis version, sample count and country count.
Markdown(f"""
This report analyses data from the {{term}}`Malaria Vector Genome Observatory`. The 
current analysis version is {setup.atlas_id}/{setup.analysis_version} which includes
data for a total of {len(df_samples):,} mosquitoes sampled from {len(countries)} countries. Please 
see the table below for more information about the sample sets and releases from which the data 
were obtained.
""")

In [None]:
# Prefix prepended to a release number to form the release label used in
# link text and URLs (e.g. "Ag" + release). Only the atlas IDs listed here
# are supported; any other setup.atlas_id raises KeyError.
release_prefix = {
    "agam": "Ag",
    "afun": "Af",
}[setup.atlas_id]


def make_clickable_study(row):
    """Render a study ID as an HTML link to the study's page.

    Parameters
    ----------
    row : mapping
        Row providing the "study_url" and "study_id" fields.

    Returns
    -------
    str
        An anchor tag that opens the study URL in a new tab, or the study
        ID as plain text when the row has no usable URL.
    """
    study_id = row["study_id"]
    study_url = row["study_url"]
    # A missing URL arrives as None/NaN rather than a string; fall back to
    # plain text instead of failing on .split() or emitting an empty link.
    if not isinstance(study_url, str) or not study_url.strip():
        return str(study_id)
    # Some studies (e.g. campos-2021) list several URLs separated by ", ";
    # link to the first one only.
    first_url = study_url.split(", ", 1)[0]
    # rel="noopener noreferrer" matches make_clickable_release and prevents
    # the page opened in the new tab from reaching back via window.opener.
    return (
        f'<a href="{first_url}" rel="noopener noreferrer" '
        f'target="_blank">{study_id}</a>'
    )


def make_clickable_release(row):
    """Render a data release as an HTML link to its release notes page.

    Parameters
    ----------
    row : mapping
        Row providing a "release" field holding a dotted version string.

    Returns
    -------
    str
        An anchor tag, labelled with release_prefix + release, that opens
        the release page in a new tab.
    """
    version = row["release"]
    release = release_prefix + version
    # The URL directory is the prefix plus the major version. Take the text
    # before the first "." instead of a fixed three characters, so that a
    # major version with more than one digit does not get truncated.
    major = version.split(".", 1)[0]
    section = f"{release_prefix}{major}".lower()
    url = f"https://malariagen.github.io/vector-data/{section}/{release.lower()}.html"
    return f'<a href="{url}" rel="noopener noreferrer" target="_blank">{release}</a>'


# Collapse the sample metadata to one row per sample set, keeping the study,
# contributor(s) and release associated with that sample set.
df_sources = (
    df_samples[["sample_set", "study_id", "study_url", "contributor", "release"]]
    .groupby("sample_set")
    .agg(
        {
            "study_id": "first",
            "study_url": "first",
            # Some sample sets have mixed contributors. Join the distinct
            # names in sorted order so the text is identical from run to
            # run (iteration order of a set of strings is not stable across
            # interpreter sessions), and drop missing values, which
            # str.join cannot handle.
            "contributor": lambda x: ", ".join(sorted(x.dropna().unique())),
            "release": "first",
        }
    )
    .reset_index(drop=False)
)

# Turn each dotted release string into a tuple of ints, so that sorting on
# this column orders releases numerically instead of as text.
release_split = df_sources["release"].str.split(".").apply(
    lambda parts: tuple(map(int, parts))
)
df_sources["release_split"] = release_split

# Swap the plain study and release values for HTML links.
df_sources["study_id"] = df_sources.apply(make_clickable_study, axis=1)
df_sources["release"] = df_sources.apply(make_clickable_release, axis=1)

# Columns shown in the table, in display order, mapped to their headings.
display_columns = {
    "sample_set": "Sample Set",
    "study_id": "Study",
    "contributor": "Contributor",
    "release": "Data Release",
}

# Order by release, then sample set, and keep only the displayed columns.
df_sources_display = (
    df_sources.sort_values(["release_split", "sample_set"])
    .loc[:, list(display_columns)]
    .rename(columns=display_columns)
)

# Attach the caption and hide the index for presentation.
df_sources_style = df_sources_display.style.set_caption(
    "Table 1. Malaria Vector Genome Observatory sample sets used for the current analysis."
).hide(axis="index")
df_sources_style