In [1]:
# Notebook parameters. Values here are for development only and
# will be overridden when running via snakemake and papermill.

# Cohort to render. Alternative cohorts that are handy during development:
# cohort_id = "CD-NU_Gbadolite_gamb_2015_Q3"
# cohort_id = 'CI-LG_Agneby-Tiassa_colu_2012'
cohort_id = "ML-2_Kati_colu_2014_Q3"

analysis_version = "2025.02.13"
min_cohort_size = 15
max_cohort_size = 70
sample_sets = "3.0"
contigs = ["2RL", "3RL", "X"]
cohorts_analysis = "20240924"
# Contig used by the calibration plots in the diagnostics section. Without a
# development default here those cells fail with a NameError when the
# notebook is run directly rather than via papermill.
h12_calibration_contig = "3L"
dask_scheduler = "single-threaded"

In [2]:
# Parameters
# NOTE(review): this cell looks like the parameter block injected at run time
# (the development cell above says its values are overridden when running via
# snakemake and papermill) - confirm before editing by hand. Every name
# assigned here replaces the development default of the same name.
analysis_version = "2025.02.25"
min_cohort_size = 15
max_cohort_size = 100
# Sample sets passed to the sample metadata query and to every plotting call below.
sample_sets = [
    "3.0",
    "1237-VO-BJ-DJOGBENOU-VMF00050",
    "1237-VO-BJ-DJOGBENOU-VMF00067",
    "1244-VO-GH-YAWSON-VMF00051",
    "1245-VO-CI-CONSTANT-VMF00054",
    "1253-VO-TG-DJOGBENOU-VMF00052",
    "1178-VO-UG-LAWNICZAK-VMF00025",
    "1244-VO-GH-YAWSON-VMF00149",
    "barron-2019",
    "crawford-2016",
    "tennessen-2021",
    "bergey-2019",
    "campos-2021",
    "fontaine-2015-rebuild",
]
# Not read by the cells visible in this notebook, which use the per-cohort
# query stored in cohort["sample_query"] instead.
sample_query = "taxon in ['gambiae', 'coluzzii', 'arabiensis', 'bissau']"
# Contigs for which selection scan tracks are plotted.
contigs = ["2RL", "3RL", "X"]
cohorts_analysis = "20240924"
# Contig used by both the H12 and the G123 calibration plots.
h12_calibration_contig = "3L"
h12_signal_detection_min_delta_aic = 1000
h12_signal_detection_min_stat_max = 0.1
h12_signal_detection_gflanks = [6]
dask_scheduler = "single-threaded"
alerts = ["01", "02", "03", "04", "05", "06", "07", "08", "09"]
# Identifier of the cohort this page is rendered for.
cohort_id = "MZ-I_Morrumbene_gamb_2004_Q2"


In [3]:
from pyprojroot import here

# Resolve the project root once so the setup scripts can be addressed by
# absolute path.
root = here()
# `%run -i` executes each script inside this notebook's namespace, so the
# names they define stay available to later cells.
# NOTE(review): names used below without a local definition (e.g. ag3,
# gdf_cohorts, pd, np, bkmod, Map, Marker) presumably come from these two
# scripts - confirm.
%run -i {root}/workflow/common/scripts/setup.py
%run -i {root}/workflow/site/scripts/page-setup.py

# N.B., do not add the "remove-output" tag to this cell!!! If you do,
# the bokeh javascript libraries will not get loaded in the generated
# HTML page. The call to output_notebook() injects javascript in the
# cell output which triggers the bokeh javascript libraries to be loaded
# in the page.
output_notebook(hide_banner=True)

# Mozambique / Morrumbene / gambiae / 2004 / Q2

In [4]:
# Look up this cohort's record; it carries the sample query used to select
# the cohort's samples further down.
cohorts_by_id = gdf_cohorts.set_index("cohort_id")
cohort = cohorts_by_id.loc[cohort_id]
cohort

cohort_size                                                          22
country                                                      Mozambique
admin1_iso                                                         MZ-I
admin1_name                                                   Inhambane
admin2_name                                                  Morrumbene
taxon                                                           gambiae
year                                                               2004
quarter                                                               2
cohort_label              Mozambique / Morrumbene / gambiae / 2004 / Q2
sample_query          cohort_admin2_quarter == 'MZ-I_Morrumbene_gamb...
latitude                                                        -23.716
longitude                                                        35.299
h12_window_size                                                 10000.0
country_alpha2                                                  

In [5]:
# Fetch the metadata of the samples selected by this cohort's sample query.
cohort_sample_query = cohort["sample_query"]
df_samples = ag3.sample_metadata(
    sample_query=cohort_sample_query,
    sample_sets=sample_sets,
)
df_samples

Unnamed: 0,sample_id,partner_sample_id,contributor,country,location,year,month,latitude,longitude,sex_call,...,admin1_name,admin1_iso,admin2_name,taxon,cohort_admin1_year,cohort_admin1_month,cohort_admin1_quarter,cohort_admin2_year,cohort_admin2_month,cohort_admin2_quarter
0,BQ0092-C,12/1,Joao Pinto,Mozambique,Furvela,2004,4,-23.716,35.299,F,...,Inhambane,MZ-I,Morrumbene,gambiae,MZ-I_gamb_2004,MZ-I_gamb_2004_04,MZ-I_gamb_2004_Q2,MZ-I_Morrumbene_gamb_2004,MZ-I_Morrumbene_gamb_2004_04,MZ-I_Morrumbene_gamb_2004_Q2
1,BQ0093-C,12/2,Joao Pinto,Mozambique,Furvela,2004,4,-23.716,35.299,F,...,Inhambane,MZ-I,Morrumbene,gambiae,MZ-I_gamb_2004,MZ-I_gamb_2004_04,MZ-I_gamb_2004_Q2,MZ-I_Morrumbene_gamb_2004,MZ-I_Morrumbene_gamb_2004_04,MZ-I_Morrumbene_gamb_2004_Q2
2,BQ0094-C,12/4,Joao Pinto,Mozambique,Furvela,2004,4,-23.716,35.299,F,...,Inhambane,MZ-I,Morrumbene,gambiae,MZ-I_gamb_2004,MZ-I_gamb_2004_04,MZ-I_gamb_2004_Q2,MZ-I_Morrumbene_gamb_2004,MZ-I_Morrumbene_gamb_2004_04,MZ-I_Morrumbene_gamb_2004_Q2
3,BQ0095-C,12/5,Joao Pinto,Mozambique,Furvela,2004,4,-23.716,35.299,F,...,Inhambane,MZ-I,Morrumbene,gambiae,MZ-I_gamb_2004,MZ-I_gamb_2004_04,MZ-I_gamb_2004_Q2,MZ-I_Morrumbene_gamb_2004,MZ-I_Morrumbene_gamb_2004_04,MZ-I_Morrumbene_gamb_2004_Q2
4,BQ0096-C,12/7,Joao Pinto,Mozambique,Furvela,2004,4,-23.716,35.299,F,...,Inhambane,MZ-I,Morrumbene,gambiae,MZ-I_gamb_2004,MZ-I_gamb_2004_04,MZ-I_gamb_2004_Q2,MZ-I_Morrumbene_gamb_2004,MZ-I_Morrumbene_gamb_2004_04,MZ-I_Morrumbene_gamb_2004_Q2
5,BQ0097-C,12/8,Joao Pinto,Mozambique,Furvela,2004,4,-23.716,35.299,F,...,Inhambane,MZ-I,Morrumbene,gambiae,MZ-I_gamb_2004,MZ-I_gamb_2004_04,MZ-I_gamb_2004_Q2,MZ-I_Morrumbene_gamb_2004,MZ-I_Morrumbene_gamb_2004_04,MZ-I_Morrumbene_gamb_2004_Q2
6,BQ0098-C,12/9,Joao Pinto,Mozambique,Furvela,2004,4,-23.716,35.299,F,...,Inhambane,MZ-I,Morrumbene,gambiae,MZ-I_gamb_2004,MZ-I_gamb_2004_04,MZ-I_gamb_2004_Q2,MZ-I_Morrumbene_gamb_2004,MZ-I_Morrumbene_gamb_2004_04,MZ-I_Morrumbene_gamb_2004_Q2
7,BQ0099-C,13/1,Joao Pinto,Mozambique,Furvela,2004,4,-23.716,35.299,F,...,Inhambane,MZ-I,Morrumbene,gambiae,MZ-I_gamb_2004,MZ-I_gamb_2004_04,MZ-I_gamb_2004_Q2,MZ-I_Morrumbene_gamb_2004,MZ-I_Morrumbene_gamb_2004_04,MZ-I_Morrumbene_gamb_2004_Q2
8,BQ0100-C,13/2,Joao Pinto,Mozambique,Furvela,2004,4,-23.716,35.299,F,...,Inhambane,MZ-I,Morrumbene,gambiae,MZ-I_gamb_2004,MZ-I_gamb_2004_04,MZ-I_gamb_2004_Q2,MZ-I_Morrumbene_gamb_2004,MZ-I_Morrumbene_gamb_2004_04,MZ-I_Morrumbene_gamb_2004_Q2
9,BQ0102-C,13/4,Joao Pinto,Mozambique,Furvela,2004,4,-23.716,35.299,F,...,Inhambane,MZ-I,Morrumbene,gambiae,MZ-I_gamb_2004,MZ-I_gamb_2004_04,MZ-I_gamb_2004_Q2,MZ-I_Morrumbene_gamb_2004,MZ-I_Morrumbene_gamb_2004_04,MZ-I_Morrumbene_gamb_2004_Q2


In [6]:
# Count the samples collected in each (year, month) combination.
samples_per_month = df_samples.groupby(["year", "month"]).size()
df_collection_dates = samples_per_month.reset_index(name="count")
df_collection_dates

Unnamed: 0,year,month,count
0,2004,4,22


In [7]:
# Work out the names of the first and last collection months.


def _month_name(month_number):
    """Return the month name for a month number parsed with the "%m" format."""
    return pd.to_datetime(month_number, format="%m").month_name()


collection_months = df_collection_dates["month"]
min_month = collection_months.min()
max_month = collection_months.max()

# A negative minimum means month data are not available for this cohort, in
# which case no month names are reported.
start_month, end_month = (
    (None, None)
    if min_month < 0
    else (_month_name(min_month), _month_name(max_month))
)

start_month, end_month

('April', 'April')

In [8]:
# One row per distinct collection site (name plus coordinates).
location_columns = ["location", "longitude", "latitude"]
df_locations = df_samples.loc[:, location_columns].drop_duplicates()
df_locations

Unnamed: 0,location,longitude,latitude
0,Furvela,35.299,-23.716


In [9]:
# Build the provenance table: one row per distinct (release, sample set,
# contributor) combination, indexed by contributor, study and data release.

provenance_labels = {
    "contributor": "Contributor",
    "study": "Study",
    "release": "Data release",
    "sample_set": "Sample set",
}

df_contributors = (
    df_samples[["release", "sample_set", "contributor"]]
    .drop_duplicates()
    .assign(
        # Sample sets prefixed "AG1000G" belong to the Ag1000G study; any
        # other sample set is labelled with the "TODO" placeholder.
        study=lambda frame: frame["sample_set"].map(
            lambda sample_set: "Ag1000G" if sample_set.startswith("AG1000G") else "TODO"
        ),
        release=lambda frame: frame["release"].map(lambda release: f"Ag{release}"),
    )
    .rename(columns=provenance_labels)
    .set_index(["Contributor", "Study", "Data release"])
)
df_contributors

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Sample set
Contributor,Study,Data release,Unnamed: 3_level_1
Joao Pinto,Ag1000G,Ag3.0,AG1000G-MZ


In [10]:
# Construct a paragraph with summary information about the samples in this cohort.

n_locations = len(df_locations)
# Agree the noun with the count so that a single site reads "1 location"
# rather than "1 locations".
location_noun = "location" if n_locations == 1 else "locations"

cohort_size = cohort["cohort_size"]
taxon = cohort["taxon"]
admin1_name = cohort["admin1_name"]
admin2_name = cohort["admin2_name"]
country = cohort["country"]
year = cohort["year"]

summary_info = (
    f"This cohort comprises {cohort_size:,} samples from the \n"
    f"*{taxon}* taxon, collected from {n_locations} {location_noun} within the administrative \n"
    f"division of {admin2_name}, {admin1_name}, {country}."
)

# Describe the collection period as precisely as the month data allow.
if start_month and start_month == end_month:
    # All collections fall within a single month.
    summary_info += f" Collections were made in {start_month} {year}."
elif start_month:
    # Collections span several months of the same year.
    summary_info += f" Collections were made between {start_month} and {end_month} in {year}."
else:
    # No month names are available; report the year only.
    summary_info += f" Collections were made in {year}."

display(Markdown(summary_info))

This cohort comprises 22 samples from the 
*gambiae* taxon, collected from 1 locations within the administrative 
division of Morrumbene, Inhambane, Mozambique. Collections were made in April 2004.

## Selection scans

In [11]:
# Load the selection signals detected for this cohort on every contig; they
# are overlaid on the H12 plots below.

signals_per_contig = [
    load_cohort_signals(contig=contig_name, cohort_id=cohort_id)
    for contig_name in contigs
]

# The constant "bottom" and "top" columns give the vertical extent of the
# signal overlays drawn on the plots.
df_signals = pd.concat(signals_per_contig).assign(bottom=0, top=1)

df_signals

Unnamed: 0,bottom,top


In [12]:
# Read the calibrated H12 and G123 window sizes from this cohort's YAML files.
h12_calibration_path = here() / h12_calibration_dir / f"{cohort_id}.yaml"
with open(h12_calibration_path) as h12_calibration_file:
    h12_calibration_params = yaml.safe_load(h12_calibration_file)
h12_window_size = h12_calibration_params["h12_window_size"]

g123_calibration_path = here() / g123_calibration_dir / f"{cohort_id}.yaml"
with open(g123_calibration_path) as g123_calibration_file:
    g123_calibration_params = yaml.safe_load(g123_calibration_file)
g123_window_size = g123_calibration_params["g123_window_size"]

# The arabiensis taxon uses the "arab" analysis; every other taxon uses "gamb_colu".
phasing_analysis = "arab" if cohort.taxon == "arabiensis" else "gamb_colu"

# Fixed window size used for the iHS track.
ihs_window_size = 100


def _add_signal_overlay(fig, df_signals, contig):
    """Draw the selection signals located on `contig` onto the figure `fig`.

    For every signal a span rectangle, a focus rectangle and a vertical centre
    line are added, together with a hover tool attached to the span rectangles.
    Nothing is added when `df_signals` is empty.
    """
    if df_signals.empty:
        # An empty signals frame may not even have a "contig" column to
        # filter on, so bail out before querying it.
        return

    df = df_signals.query("contig == @contig")

    # One two-point vertical line per signal, running from y=0 to y=1 at the
    # signal centre.
    center_xs = [np.array([pcenter, pcenter]) for pcenter in df["pcenter"]]
    center_ys = [np.array([0, 1]) for _ in range(len(df))]

    source = bkmod.ColumnDataSource(
        data={
            "cohort": df["cohort_id"],
            "contig": df["contig"],
            "score": df["delta_i"].astype(int),
            "peak_start": df["span2_pstart"],
            "peak_stop": df["span2_pstop"],
            "focus_start": df["focus_pstart"],
            "focus_stop": df["focus_pstop"],
            "center_xs": center_xs,
            "center_ys": center_ys,
            "bottom": df["bottom"],
            "top": df["top"],
        }
    )

    # Full extent of each signal.
    quad = fig.quad(
        bottom="bottom",
        top="top",
        left="peak_start",
        right="peak_stop",
        source=source,
        color=signal_span_color,
        alpha=signal_span_alpha,
        line_width=1,
        level="underlay",
    )
    # Focus region of each signal.
    fig.quad(
        bottom="bottom",
        top="top",
        left="focus_start",
        right="focus_stop",
        source=source,
        color=signal_focus_color,
        alpha=signal_focus_alpha,
        level="underlay",
    )
    # Centre line of each signal.
    glyph = bkmod.MultiLine(
        xs="center_xs",
        ys="center_ys",
        line_color=signal_center_color,
        line_width=2,
        line_alpha=signal_center_alpha,
    )
    fig.add_glyph(source, glyph)

    # The hover tool is restricted to the span rectangles.
    hover = bkmod.HoverTool(
        tooltips=[
            ("Cohort", "@cohort"),
            ("Score", "@score"),
            ("Focus", "@focus_start{,} - @focus_stop{,}"),
        ],
        renderers=[quad],
    )
    fig.add_tools(hover)


def plot_gwss(
    contig,
    df_signals,
    sizing_mode="stretch_width",
    show=False,
    width=800,
    track_height=150,
    genes_height=90,
):
    """Build the stacked selection scan figure for one contig of this cohort.

    Four tracks are stacked in a single column: H12 (with the cohort's
    selection signals overlaid), G123, iHS and genes. The lower three tracks
    share the x range of the H12 track.

    Parameters
    ----------
    contig
        Contig to plot.
    df_signals
        Selection signals to overlay on the H12 track; may be empty.
    sizing_mode
        Sizing mode passed to every track and to the grid layout.
    show
        Passed to each track plotting call.
    width
        Width passed to every track.
    track_height
        Height of the H12, G123 and iHS tracks.
    genes_height
        Height of the genes track.

    Returns
    -------
    The grid layout holding the four tracks.
    """
    sample_query = cohort["sample_query"]

    # Arguments common to the three statistic tracks.
    cohort_kwargs = dict(
        contig=contig,
        sample_sets=sample_sets,
        sample_query=sample_query,
        min_cohort_size=min_cohort_size,
        max_cohort_size=max_cohort_size,
        sizing_mode=sizing_mode,
        show=show,
        width=width,
        height=track_height,
    )

    # h12_palette = list(bkpal.BuPu4[0:1])
    h12_palette = ["black"]

    fig1 = ag3.plot_h12_gwss_track(
        window_size=h12_window_size,
        analysis=phasing_analysis,
        contig_colors=h12_palette,
        **cohort_kwargs,
    )
    fig1.xaxis.visible = False

    _add_signal_overlay(fig1, df_signals, contig)

    fig2 = ag3.plot_g123_gwss_track(
        window_size=g123_window_size,
        sites=phasing_analysis,
        site_mask=phasing_analysis,
        title="",
        x_range=fig1.x_range,
        **cohort_kwargs,
    )
    fig2.xaxis.visible = False

    fig3 = ag3.plot_ihs_gwss_track(
        window_size=ihs_window_size,
        analysis=phasing_analysis,
        title="",
        x_range=fig1.x_range,
        **cohort_kwargs,
    )
    fig3.xaxis.visible = False

    # The genes track is the only one that keeps its x axis visible.
    fig4 = ag3.plot_genes(
        region=contig,
        show=show,
        sizing_mode=sizing_mode,
        width=width,
        height=genes_height,
        x_range=fig1.x_range,
    )

    fig = bklay.gridplot(
        [fig1, fig2, fig3, fig4],
        ncols=1,
        toolbar_location="above",
        merge_tools=True,
        sizing_mode=sizing_mode,
    )
    return fig

In [13]:
# Render one heading and one stacked selection scan figure per contig.
for contig_name in contigs:
    heading = HTML(f"<h3>Chromosome {contig_name}</h3>")
    display(heading)

    contig_fig = plot_gwss(contig=contig_name, df_signals=df_signals)
    bkplt.show(contig_fig)

HTML(value='<h3>Chromosome 2RL</h3>')

HTML(value='<h3>Chromosome 3RL</h3>')

HTML(value='<h3>Chromosome X</h3>')

## Sampling information

In [14]:
# Centre the map on the cohort's coordinates.
center = cohort[["latitude", "longitude"]].to_list()
m = Map(center=center, zoom=9, basemap=default_basemap)

# Count samples per (latitude, longitude, taxon) combination.
df = (
    df_samples[["latitude", "longitude", "taxon"]]
    .groupby(["latitude", "longitude", "taxon"])
    .size()
    .reset_index(name="count")
)

# Marker colour per taxon; any other taxon falls back to gray.
taxon_colors = {
    "gambiae": "red",
    "coluzzii": "cadetblue",
    "arabiensis": "lightgreen",
}

for _, site in df.iterrows():
    lat, long = site[["latitude", "longitude"]]
    color = taxon_colors.get(site["taxon"], "gray")

    # NOTE(review): confirm that this Marker class actually honours `color`.
    marker = Marker(location=(lat, long), draggable=False, opacity=0.7, color=color)
    m.add_layer(marker)

    # The popup shows the number of samples at this site.
    marker.popup = HTML(value=f"n = {site['count']}")

display(m)

Map(center=[np.float64(-23.716), np.float64(35.299)], controls=(ZoomControl(options=['position', 'zoom_in_text…

In [15]:
# Columns identifying a collection site, and the two-level header shown for them.
location_keys = ["location", "longitude", "latitude"]
location_header = [
    ("Location", "Name"),
    ("Location", "Longitude"),
    ("Location", "Latitude"),
]

if min_month >= 0:
    # For this cohort we have month data, so show a breakdown of sample
    # numbers by location and month.
    df_loc_dt = df_samples.pivot_table(
        index=location_keys,
        columns="month",
        values="sample_id",
        aggfunc="count",
        fill_value=0,
    ).reset_index()

    # Columns after the three location columns are month numbers; label
    # each with its month name.
    date_header = [
        ("Date", pd.to_datetime(month_number, format="%m").month_name())
        for month_number in df_loc_dt.columns[3:]
    ]

else:
    # For this cohort we do not have month data, so show a breakdown of sample
    # numbers by location only.
    df_loc_dt = (
        df_samples.groupby(location_keys).agg({"sample_id": "count"}).reset_index()
    )

    # A single count column, labelled with the cohort's year.
    date_header = [("Date", cohort.year)]

# Tidy up the columns using a multi index.
df_loc_dt.columns = pd.MultiIndex.from_tuples(location_header + date_header)

# Style the table.
df_loc_dt_styled = (
    df_loc_dt.style.format(precision=3)
    .set_caption("Number of samples collected.")
    .hide()
)
display(df_loc_dt_styled)

Location,Location,Location,Date
Name,Longitude,Latitude,April
Furvela,35.299,-23.716,22


## Diagnostics
### H12 calibration

In [16]:
# Candidate window sizes shown in the calibration plots; the G123 calibration
# cell below reuses this tuple.
window_sizes = (100, 200, 500, 1000, 2000, 5000, 10000, 20000)

display(Markdown(f"Selected window size: **{h12_window_size:,}**"))

h12_calibration_kwargs = dict(
    contig=h12_calibration_contig,
    analysis=phasing_analysis,
    sample_sets=sample_sets,
    sample_query=cohort["sample_query"],
    min_cohort_size=min_cohort_size,
    max_cohort_size=max_cohort_size,
    window_sizes=window_sizes,
)
# The trailing semicolon suppresses the repr of the returned value.
ag3.plot_h12_calibration(**h12_calibration_kwargs);

Selected window size: **10,000**

### G123 calibration

In [17]:
display(Markdown(f"Selected window size: **{g123_window_size:,}**"))

# The G123 calibration runs on the same contig as the H12 calibration and
# reuses the `window_sizes` tuple defined in the H12 calibration cell.
g123_calibration_kwargs = dict(
    contig=h12_calibration_contig,
    sites=phasing_analysis,
    site_mask=phasing_analysis,
    sample_sets=sample_sets,
    sample_query=cohort["sample_query"],
    min_cohort_size=min_cohort_size,
    max_cohort_size=max_cohort_size,
    window_sizes=window_sizes,
)
# The trailing semicolon suppresses the repr of the returned value.
ag3.plot_g123_calibration(**g123_calibration_kwargs);

Selected window size: **5,000**

## Data sources

In [18]:
# Present the contributors table with a caption describing its contents.
sources_caption = "MalariaGEN Vector Observatory partners, studies and sample sets contributing data for this cohort."
df_sources_style = df_contributors.style.set_caption(sources_caption)
df_sources_style

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Sample set
Contributor,Study,Data release,Unnamed: 3_level_1
Joao Pinto,Ag1000G,Ag3.0,AG1000G-MZ
