In [1]:
import pandas as pd
import seaborn as sns

# Path to the merged mosdepth BED file; uncomment exactly one dataset.
#file = "/master/abagwell/variant-analysis/results/rhesus/coverage/mosdepth/WGS_rhesus_1dup.merged.bed"
#file = "/master/abagwell/variant-analysis/results/rhesus/coverage/mosdepth/WGS_rhesus.merged.bed"
#file = "/master/abagwell/variant-analysis/results/rhesus/coverage/mosdepth/RPL.merged.bed"
file = "/master/abagwell/variant-analysis/results/rhesus/coverage/mosdepth/SNPRC_WES.merged.bed"
#file = "/master/abagwell/variant-analysis/results/Caribbean/coverage/mosdepth/CPRC.merged.bed"
#file = "/master/abagwell/variant-analysis/results/rhesus/coverage/mosdepth/SNPRC_WGS_updated.merged.bed"

# Width (bp) of the coverage windows that are kept for plotting.
WINDOW_SIZE = 5_000_000
# Chromosome names to keep: numbered chromosomes 1-24 plus X, Y and MT.
KEPT_CHROMS = [str(i) for i in range(1, 25)] + ["X", "Y", "MT"]

df = pd.read_table(
    file,
    names=["sample", "chrom", "start", "end", "avg_coverage"],
    # Read both identifiers as text. Left to type inference, all-numeric sample
    # IDs become integers and chromosome names can end up as a mix of ints and
    # strings, which the string-based `isin` filter below would silently drop.
    dtype={"sample": str, "chrom": str},
)
# Drop every row whose chromosome name is not listed in KEPT_CHROMS.
df = df[df["chrom"].isin(KEPT_CHROMS)]
# Keep only full-width windows; this removes the shorter final window of each chromosome.
# NOTE(review): any sequence shorter than WINDOW_SIZE (presumably MT) is removed
# entirely by this filter -- confirm that is intended.
df = df[df["end"] - df["start"] == WINDOW_SIZE]

# Untouched copy of the raw coverage; a later cell normalizes `df` in place.
not_normalized_df = df.copy()

# Mean coverage of each window across all samples, keyed by (chrom, start, end).
pos_coverage = df[["chrom", "start", "end", "avg_coverage"]].groupby(["chrom", "start", "end"]).mean()
pos_coverage_dict = pos_coverage.to_dict()["avg_coverage"]

In [None]:
# Option 1: Seaborn implementation
# The simplest of the options: one line-plot facet per chromosome, one hue per sample.
# Drawbacks: hard to read when there are many samples, it plots whatever is currently
# in `avg_coverage` (it applies no normalization of its own), and chromosomes are
# difficult to compare against each other.

sns.set_theme()
relplot_options = dict(
    hue="sample",
    row="chrom",
    x="start",
    y="avg_coverage",
    kind="line",
    height=5,
    aspect=2,
)
sns.relplot(data=df, **relplot_options)

In [None]:
# Option 2: Bokeh implementation
# More complex implementation but normalizes coverage, can be zoomed and expanded interactively, and displays sample labels when hovering.

import itertools

from bokeh.io import curdoc, output_file, output_notebook
from bokeh.models import ColumnDataSource, WheelZoomTool, HoverTool, PanTool, ResetTool, Span
#from bokeh.palettes import plasma as palette
from bokeh.palettes import Category20 as palette
palette = palette[20]
from bokeh.plotting import figure, show
import numpy as np

# Sample identifiers in order of first appearance (one plotted line per sample).
samples = df["sample"].unique()

# Infer sex from chrY: a sample whose mean chrY window coverage reaches the
# threshold is called male, anything below it female. The raw copy is used so the
# call does not depend on whether `df` has already been normalized in place.
# Samples with no chrY windows end up in neither list.
Y_COVERAGE_THRESHOLD = 5  # un-normalized coverage
mean_y_coverage = (
    not_normalized_df.loc[not_normalized_df["chrom"] == "Y"]
    .groupby("sample")["avg_coverage"]
    .mean()
)
males = list(mean_y_coverage.index[mean_y_coverage >= Y_COVERAGE_THRESHOLD])
females = list(mean_y_coverage.index[mean_y_coverage < Y_COVERAGE_THRESHOLD])

# Plotting order of the chromosomes along the concatenated genome axis.
chromosomes = [str(i) for i in range(1, 21)] + ["X", "Y", "MT"]


# Normalize samples
# Divide every window by the mean coverage of its own sample, so samples sequenced
# to different depths share one scale. chrX and chrY are left out of the
# denominator (presumably because their depth depends on the animal's sex), but
# their windows are still rescaled.
# `df` is updated in place because the following cells read `df["avg_coverage"]`.
# Re-running is harmless: once normalized, every sample's reference mean is 1.
is_reference_window = ~df["chrom"].isin(["X", "Y"])
reference_mean = df.loc[is_reference_window].groupby("sample")["avg_coverage"].mean()
df["avg_coverage"] = df["avg_coverage"] / df["sample"].map(reference_mean)

# Alternatives that were tried and left disabled: min-max scaling of each sample
# to the 0-1 range, and a log2 transform of the normalized coverage.

# Normalize by position (disabled; the commented-out loop below is kept for reference)

# # Normalize by position
# for index, row in df.iterrows():
#     pos_cov = pos_coverage_dict[(row["chrom"], row["start"], row["end"])]
#     row["avg_coverage"] = pos_cov /row["avg_coverage"]



# Concat chromosomes
# Lay the chromosomes end to end on one shared x axis: each window's `start` becomes
# its chromosome-local start plus the summed length of all preceding chromosomes.
# `end` is never shifted, and the loading cell keeps only windows that are exactly
# `window_size` wide, so `end - window_size` recovers the chromosome-local start.
# Deriving `start` from `end` this way makes the cell safe to re-run.
window_size = 5_000_000
chrom_spans = df.groupby("chrom")["end"].max()  # covered length of each chromosome

chrom_offsets = {}
chromosome_endpoints = []
end = 0  # running offset: where the next chromosome begins on the shared axis
for chrom in chromosomes:
    if chrom not in chrom_spans.index:
        # Chromosome has no windows in this dataset; skipping it keeps the
        # running offset from turning into NaN.
        continue
    chrom_offsets[chrom] = end
    # Advance by the full span. Advancing only to the last window's start would
    # place the next chromosome's first window on top of this one's last window.
    end += int(chrom_spans[chrom])
    chromosome_endpoints.append(end)
chromosome_endpoints = chromosome_endpoints[:-1]  # No separator after the final chromosome

# Chromosomes that are not listed in `chromosomes` keep their local coordinates (offset 0).
df["start"] = df["end"] - window_size + df["chrom"].map(chrom_offsets).fillna(0).astype("int64")

#df.loc[df["sample"] == sample].loc[df["chrom"] != "X"].loc[df["chrom"] != "Y"]


In [None]:
cov_descriptor = "Normalized Coverage"

# Hover fields. `start` holds the concatenated genome position while `end` is still
# chromosome-local, so the window is reported from a chromosome-local start
# (`original_start`) that is added to every data source below.
tooltips = [
    ("sample", "@sample"),
    ("Window", "@original_start - @end"),
    ("Chrom", "@chrom"),
    (cov_descriptor, "@avg_coverage")
]

# Create figure
# NOTE(review): the dataset name in the title is hard-coded and does not follow `file`.
p = figure(title=f"{cov_descriptor} of RPL Rhesus Macaques", x_axis_label='Genome Position', y_axis_label=cov_descriptor,
    tools=[WheelZoomTool(), PanTool(), HoverTool(line_policy="nearest"), ResetTool()], tooltips=tooltips)

# Add lines: one per sample, cycling through the palette.
# sort=False keeps the samples in order of first appearance.
colors = itertools.cycle(palette)
for sample, subset in df.groupby("sample", sort=False):
    color = next(colors)
    # To colour by inferred sex instead, use:
    #     color = "teal" if sample in males else "orange"
    source = ColumnDataSource(subset.assign(original_start=subset["end"] - 5_000_000))
    # legend_label must be a string, so non-string sample IDs are converted.
    p.line(x="start", y="avg_coverage", legend_label=str(sample), color=color, alpha=0.3, hover_line_color="black", hover_alpha=1, line_width=1, source=source)

# Modify properties
p.legend.label_text_font_size = "8px"
p.legend.title = "Samples"
p.legend.visible = False  # Hides legend
p.sizing_mode = "stretch_width"
p.height = 700

# Add spans to separate chromosomes
for endpoint in chromosome_endpoints:
    p.add_layout(Span(location=endpoint, dimension="height", line_width=3))

In [None]:
# Output figure (outliers included)
# Configure both notebook output and a standalone HTML file, then show the plot.
included_html_path = f"/master/abagwell/figures/coverage/{cov_descriptor}_of_RPL_rhesus_macaques_color_outliers_included.html"
#included_html_path = "WGS_coverage_of_rhesus_macaques_color_outliers_included.html"

curdoc().theme = "light_minimal"
output_notebook()
output_file(included_html_path)
show(p)

In [None]:
# Output figure (outliers removed)
# Only file output is configured in this cell; the notebook output call is left disabled.
removed_html_path = "WGS_coverage_of_rhesus_macaques_outliers_removed.html"

curdoc().theme = "light_minimal"
#output_notebook()
output_file(removed_html_path)
show(p)

In [None]:
## Option 3: Altair implementation (best)
# Top panel: per-sample coverage along the concatenated genome, with vertical rules
# at the chromosome boundaries. Bottom panel: mean raw coverage per sample; dragging
# an interval over its bars selects which samples are drawn in the top panel.
# Requires `df` (with concatenated `start`) and `chromosome_endpoints` from the
# cell that concatenates the chromosomes.
import altair as alt
import polars as pl

# `start` holds the concatenated genome position, so keep a chromosome-local start
# for the tooltips. Plain pandas is enough to add one column.
# (The name `pl_df` is kept because the following cell reads it.)
pl_df = df.assign(original_start=df["end"] - 5_000_000)

alt.data_transformers.disable_max_rows()
base = alt.Chart(pl_df)
cov_descriptor = "Normalized Coverage"

#selector = alt.selection_point(fields=["sample"])
selector = alt.selection_interval()

# Largest window end per chromosome, in order of first appearance.
# (`.max()` takes no column name; the column is chosen by the selection before it.)
chr_lengths = df[["chrom", "end"]].groupby("chrom", sort=False).max()

# x positions of the chromosome separators.
offsets = list(chromosome_endpoints)

lineplot = base.mark_line().encode(
    alt.X("start", title="Genome Position", axis=alt.Axis(grid=False)),
    alt.Y('mean(avg_coverage)', title=cov_descriptor),
    # Samples outside the selection get opacity 0, i.e. they are hidden entirely.
    color=alt.condition(selector, alt.Color("sample:N").legend(None).scale(scheme="set2"), alt.value("lightgray")),
    opacity = alt.condition(selector, alt.value(1.0), alt.value(0.0)),
    tooltip=[
        alt.Tooltip("sample", title="Sample"),
        alt.Tooltip("chrom", title="Chr"),
        alt.Tooltip("original_start", title="Start"),
        alt.Tooltip("end", title="End"),
        alt.Tooltip("avg_coverage", title=cov_descriptor)
    ]
).properties(
    #width = 1200, # Original
    width = 4000,
    # NOTE(review): the title says WGS while the active input file and the save path
    # below are SNPRC_WES -- confirm which label is intended.
    #title = f"{cov_descriptor} of SNPRC Rhesus Macaques",
    title = f"{cov_descriptor} of WGS Rhesus Macaques",
).interactive()

chromosome_boundaries = alt.Chart(pd.DataFrame({'x': offsets})).mark_rule(color="gray", strokeWidth=1).encode(x='x')

# Bars use the raw (un-normalized) coverage so sequencing depth stays visible.
selectable_samples = alt.Chart(not_normalized_df).mark_bar(width=7).encode(
    alt.X("sample", title='Samples'),#.sort("-y"),
    alt.Y("mean(avg_coverage)", title='Coverage'),
    color=alt.value("#93C572"),
).properties(
    height = 50,  # WGS_rhesus
    #width = 700,  # RPL
    #width = 1000,  # CPRC
    width = 4000, # WES
    #width = 1400, # mew WGS
    title = "Samples by Coverage (selectable)",
).add_params(
    selector
)

# Build the composed chart once, save it, and leave it as the cell's last expression
# so it is also rendered in the notebook.
coverage_dashboard = (chromosome_boundaries + lineplot) & selectable_samples
coverage_dashboard.save("/master/abagwell/figures/normalized_coverage_SNPRC_WES_altair2.html")
#coverage_dashboard.save("/master/abagwell/figures/normalized_coverage_SNPRC_WGS_altair.html")
#coverage_dashboard.save("/master/abagwell/figures/normalized_coverage_CPRC_altair.html")
#coverage_dashboard.save("/master/abagwell/figures/normalized_coverage_RPL_altair.html")
coverage_dashboard

In [None]:
# Altair plot with chromosomes split
# Under development
# Builds one line plot per chromosome (stacked vertically) above the selectable
# sample bar chart. Expects `pl_df` to be the pandas frame with `original_start`
# created in the Option 3 cell.

chroms_present = set(pl_df["chrom"])

plot_list = []
for chrom in chromosomes:
    if chrom not in chroms_present:
        continue  # No windows for this chromosome in the current dataset
    lineplot = alt.Chart(pl_df).transform_filter(
        # Restrict this panel to a single chromosome.
        alt.datum.chrom == chrom
    ).mark_line().encode(
        # Chromosome-local coordinate, so every panel starts at position 0.
        alt.X("original_start", title=f"Chromosome {chrom} Position", axis=alt.Axis(grid=False)),
        alt.Y('mean(avg_coverage)', title=cov_descriptor),
        color=alt.condition(selector, alt.Color("sample:N").legend(None).scale(scheme="set2"), alt.value("lightgray")),
        opacity = alt.condition(selector, alt.value(1.0), alt.value(0.0)),
        tooltip=[
            alt.Tooltip("sample", title="Sample"),
            alt.Tooltip("chrom", title="Chr"),
            alt.Tooltip("original_start", title="Start"),
            alt.Tooltip("end", title="End"),
            alt.Tooltip("avg_coverage", title=cov_descriptor)
        ]
    ).properties(
        width = 1200,
        #title = f"{cov_descriptor} of SNPRC Rhesus Macaques",
        title = f"{cov_descriptor} of WES Rhesus Macaques",
    ).interactive()
    plot_list.append(lineplot)

selectable_samples = alt.Chart(not_normalized_df).mark_bar(width=7).encode(
    alt.X("sample", title='Samples').sort("-y"),
    alt.Y("mean(avg_coverage)", title='Coverage'),
    color=alt.value("#93C572"),
).properties(
    height = 50,  # WGS_rhesus
    width = 700,  # RPL
    #width = 1000,  # CPRC
    title = "Samples by Coverage (selectable)"
).add_params(
    selector
)

# Stack the per-chromosome panels above the sample selector and display the result.
split_chromosome_chart = alt.vconcat(*plot_list) & selectable_samples
split_chromosome_chart


In [None]:
# Boxplot of chrX
# Also removing specific samples that had different pattern of coverage as well as chrY
# Collapses `df` to one mean coverage value per (sample, chromosome) pair.
# Note: this rebinds `pl_df`, which the Altair line-plot cells use as a pandas
# frame, to a polars frame; rebuild it before re-running those cells.

# Place sample to remove here
deviant_samples = []

coverage_frame = pl.from_pandas(df)
# polars renamed `DataFrame.groupby` to `group_by` and later removed the old
# spelling; use whichever one the installed version provides.
group_by = getattr(coverage_frame, "group_by", None) or coverage_frame.groupby

pl_df = group_by(
    pl.col("sample"),
    pl.col("chrom"),
).agg(
    pl.col("avg_coverage").mean()
).sort("chrom").filter(
    ~pl.col("sample").is_in(deviant_samples)
)
# .filter(
#     pl.col("chrom") != "Y"
# )

In [None]:
# One box per chromosome, built from the per-sample mean coverage values in `pl_df`.
# Chromosomes are drawn in karyotype order rather than alphabetically.
chrom_order = [str(number) for number in range(1, 21)] + ["X", "Y"]
boxplot_data = pl_df.to_arrow().to_pandas()

alt.Chart(boxplot_data).mark_boxplot().encode(
    x=alt.X("chrom", title="Chromosome", sort=chrom_order),
    y=alt.Y("avg_coverage", title="Normalized Coverage").scale(zero=False),
    color=alt.Color("chrom", sort=chrom_order, legend=None).scale(scheme="dark2"),  # alternative: .scale(range=['#1f77b4', 'orange'])
    tooltip=[alt.Tooltip("sample", title="Sample")],
).properties(
    #width=40,
    #title=["chrX"],
    title="Normalized Coverage of Rhesus RPL",
)
# To export, append one of these to the chart expression above:
#.save("/master/abagwell/figures/WES_boxplot.svg")
#.save("/master/abagwell/figures/WES_boxplot_excluding5_and_chrY.html")