In [1]:
import os
import sys

# SRC_DIR environment variable should be the absolute path to the 'multicopy-STR-genotyping' directory
src_dir = os.environ.get("SRC_DIR")
if src_dir is None:
    # Same exception type as a plain os.environ["SRC_DIR"] lookup, but with an actionable message.
    raise KeyError(
        "SRC_DIR is not set; export it as the absolute path to the 'multicopy-STR-genotyping' directory"
    )
# Re-running this cell should not keep appending the same entry to sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from multicopy_STR_genotyping import file_io

sns.set_context("poster")
%matplotlib inline

In [11]:
def load_str_calls(str_path: str, segdup_ids) -> pd.DataFrame:
    """Read a CSV of STR calls and discard the loci listed in ``segdup_ids``.

    Parameters
    ----------
    str_path : str
        Location of the CSV file; it is handed to ``pd.read_csv`` unchanged and
        must contain a ``str_id`` column.
    segdup_ids
        Collection of ``str_id`` values to exclude (anything accepted by
        ``Series.isin``, e.g. a list or a pandas Series).

    Returns
    -------
    pd.DataFrame
        The rows whose ``str_id`` is not in ``segdup_ids``, re-indexed from 0.
    """
    calls = pd.read_csv(str_path)
    is_excluded = calls["str_id"].isin(segdup_ids)
    kept = calls.loc[~is_excluded]
    return kept.reset_index(drop=True)

In [12]:
# Loci listed in the *_in_segdup BED file; their IDs are passed to load_str_calls, which drops them
# from every caller's call set.
df_in_segdup = pd.read_csv(
    "../../data/hg38_ver13_0boe_mononucleotides_in_segdup.bed",
    sep="\t",
    names=["chr", "start", "end", "period", "unit"],
)
# IDs have the form "<chr>_<start + 1>".
# NOTE(review): the +1 presumably converts the 0-based BED start to the 1-based coordinate used in
# the call files — confirm.
df_in_segdup = df_in_segdup.assign(
    str_id=[
        f"{chrom}_{start + 1}"
        for chrom, start in zip(df_in_segdup["chr"], df_in_segdup["start"])
    ]
)
segdup_ids = df_in_segdup["str_id"]

# One table per genotyper. Each tool's generic "match" column gets a tool-specific name, and the
# columns listed in the drop() calls are removed from the GangSTR and HipSTR tables.
df_constrain = (
    load_str_calls("../../results/HG002/2024-08-21_hg002_ConSTRain0.8.1.csv", segdup_ids)
    .rename(columns={"match": "constrain_match"})
)
df_gangstr = (
    load_str_calls("../../results/HG002/2024-03-19_hg002_str_length_calls_gangstr.csv", segdup_ids)
    .drop(columns=["haplo_region_len", "softmatch"])
    .rename(columns={"match": "gangstr_match"})
)
df_hipstr = (
    load_str_calls("../../results/HG002/2024-07-09_hg002_str_length_calls_hipstr.csv", segdup_ids)
    .drop(columns=["haplo_region_len", "depth", "copy_number", "period"])
    .rename(columns={"match": "hipstr_match"})
)

print(df_constrain.shape, df_gangstr.shape, df_hipstr.shape)

(1603475, 7) (1588412, 3) (1176694, 3)


In [13]:
# Inner joins on str_id: only loci present in all three call sets survive.
df_str_calls = pd.merge(
    pd.merge(df_constrain, df_gangstr, on="str_id", how="inner"),
    df_hipstr,
    on="str_id",
    how="inner",
)
df_str_calls

Unnamed: 0,str_id,haplo_region_len,illumina_region_len,depth,copy_number,period,constrain_match,gangstr_region_len,gangstr_match,hipstr_region_len,hipstr_match
0,chr1_902774,"[14, 14]","[14, 14]",59,2,1,True,"[14, 14]",True,"[14, 14]",True
1,chr1_904440,"[9, 9]","[9, 9]",55,2,3,True,"[9, 9]",True,"[9, 9]",True
2,chr1_904654,"[12, 12]","[12, 12]",48,2,3,True,"[12, 12]",True,"[12, 12]",True
3,chr1_907237,"[14, 14]","[14, 14]",49,2,1,True,"[14, 14]",True,"[14, 14]",True
4,chr1_919012,"[12, 12]","[12, 12]",51,2,4,True,"[12, 12]",True,"[12, 12]",True
...,...,...,...,...,...,...,...,...,...,...,...
1172901,chrY_26487819,[10],[10],31,1,1,True,[10],True,[10],True
1172902,chrY_26487987,[10],[10],38,1,1,True,[10],True,[10],True
1172903,chrY_26489202,[12],[12],33,1,4,True,[12],True,[12],True
1172904,chrY_26489935,[12],[12],23,1,3,True,[12],True,[12],True


In [14]:
# Long format: one row per (locus, method) holding that method's match flag.
df_matches_long = df_str_calls[["constrain_match", "gangstr_match", "hipstr_match"]].melt(
    var_name="method",
    value_name="accuracy",
)

# Accuracy per method = number of matching calls divided by the number of rows for that method.
df_matches_long.groupby("method").agg(
    accuracy=("accuracy", lambda matches: matches.sum() / len(matches))
)

Unnamed: 0_level_0,accuracy
method,Unnamed: 1_level_1
constrain_match,0.988643
gangstr_match,0.987878
hipstr_match,0.977707


In [None]:
# Per-period accuracy of each tool (matching calls / all calls), reshaped to long format for seaborn.
tool_by_match_column = {
    "constrain_match": "ConSTRain",
    "gangstr_match": "GangSTR",
    "hipstr_match": "HipSTR",
}
df_overview = (
    df_str_calls[["period", *tool_by_match_column]]
    .groupby("period", as_index=False)
    .agg(**{
        tool: (match_column, lambda matches: matches.sum() / len(matches))
        for match_column, tool in tool_by_match_column.items()
    })
    .melt(id_vars="period", var_name="approach", value_name="accuracy")
)

fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(
    data=df_overview,
    x="period",
    y="accuracy",
    hue="approach",
    palette="colorblind",
    ax=ax,
)
ax.set(xlabel="Period", ylabel="Accuracy")

# Anchor the legend at the axes' (1, 1) corner and drop its title.
ax.get_legend().set(bbox_to_anchor=(1, 1), title="")

plt.show()

In [None]:
# Accuracy (in percent) of each tool per STR period and per bin of depth / copy number.
#
# The grid crosses periods 1-6 with seven half-open bins [depth_lower, depth_upper); the last bin
# uses 1e6 as its upper edge and is labelled "30-". Loci with depth / CN below 1 fall in no bin.
depth_edges = np.array([1, 5, 10, 15, 20, 25, 30, 1e6])
depth_labels = ["1-5", "5-10", "10-15", "15-20", "20-25", "25-30", "30-"]
periods = np.arange(1, 7)

df_plot = pd.DataFrame({
    # float dtype kept on purpose: it matches what the previous array-based construction produced
    "period": np.repeat(periods, len(depth_labels)).astype(float),
    "depth_lower": np.tile(depth_edges[:-1], len(periods)),
    "depth_upper": np.tile(depth_edges[1:], len(periods)),
})

# Loop-invariant work: normalise depth by copy number once instead of once per grid row.
depth_per_copy = df_str_calls["depth"] / df_str_calls["copy_number"]
match_column_by_output = {
    "constrain_accuracy": "constrain_match",
    "gangstr_accuracy": "gangstr_match",
    "hipstr_accuracy": "hipstr_match",
}

accuracy_records = []
for row in df_plot.itertuples(index=False):
    # Select rows directly with a boolean mask. Going through str_id.isin(...) would also pull in
    # out-of-bin rows if any str_id were duplicated.
    in_bin = (
        (depth_per_copy >= row.depth_lower)
        & (depth_per_copy < row.depth_upper)
        & (df_str_calls["period"] == row.period)
    )
    n_loci = int(in_bin.sum())

    record = {"n": n_loci}
    for output_column, match_column in match_column_by_output.items():
        if n_loci == 0:
            # An empty bin has no defined accuracy; report NaN rather than dividing by zero.
            record[output_column] = np.nan
        else:
            n_mismatch = int((~df_str_calls.loc[in_bin, match_column]).sum())
            record[output_column] = (1 - n_mismatch / n_loci) * 100
    accuracy_records.append(record)

# Column order: period, depth_lower, depth_upper, n, <tool>_accuracy..., depth
df_plot = pd.concat([df_plot, pd.DataFrame(accuracy_records, index=df_plot.index)], axis=1)
df_plot["depth"] = np.tile(depth_labels, len(periods))

df_plot

In [None]:
# One panel per STR period (1-6): accuracy of each tool within each depth / CN bin.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(30, 20))

tool_by_accuracy_column = {
    "constrain_accuracy": "ConSTRain",
    "gangstr_accuracy": "GangSTR",
    "hipstr_accuracy": "HipSTR",
}

for period, ax in enumerate(axes.flat, start=1):
    # Only the second panel carries the (shared) legend.
    show_legend = period == 2

    df_renamed = df_plot.rename(columns=tool_by_accuracy_column)
    df_period = df_renamed.query(f"period == {period}")
    df_subplot = df_period[["ConSTRain", "GangSTR", "HipSTR", "depth"]].melt(
        id_vars="depth",
        var_name="approach",
        value_name="accuracy",
    )

    sns.barplot(
        data=df_subplot,
        x="depth",
        y="accuracy",
        hue="approach",
        palette="colorblind",
        ax=ax,
        legend=show_legend,
    )
    if show_legend:
        ax.get_legend().set(title="")

    ax.set(title=f"Period: {period}", xlabel="Depth / CN", ylabel="Accuracy")

plt.tight_layout()

In [None]:
# Per-locus table for the depth curves: one match flag per tool, the STR period, and the
# sequencing depth divided by copy number. Only periods 1 through 6 are kept.
tool_by_match_column = {
    "constrain_match": "ConSTRain",
    "gangstr_match": "GangSTR",
    "hipstr_match": "HipSTR",
}
df_plot_density = (
    df_str_calls
    .assign(depth=lambda calls: calls["depth"] / calls["copy_number"])
    .query("period >= 1 and period <= 6")
    .rename(columns=tool_by_match_column)
    [["ConSTRain", "GangSTR", "HipSTR", "period", "depth"]]
)

df_plot_density

In [None]:
from scipy import ndimage

# Raw accuracy of each approach at every observed depth / CN value, pooled over all periods,
# together with the number of loci behind each point.
df_matches_by_depth = df_plot_density.drop(columns="period").melt(
    id_vars="depth",
    var_name="approach",
    value_name="match",
)
df_plot_smooth = df_matches_by_depth.groupby(
    ["approach", "depth"], observed=True, as_index=False
).agg(
    accuracy=("match", lambda matches: matches.sum() / len(matches)),
    n_obs=("match", "count"),
)

# Replace the raw accuracy by a Gaussian-smoothed curve (sigma = 3 rows, reflected at the ends),
# computed separately for each approach. The table above is sorted by approach and then depth, so
# concatenating the per-approach results in group order lines up with the rows.
smooth = [
    ndimage.gaussian_filter1d(approach_rows["accuracy"], 3, mode="reflect")
    for _, approach_rows in df_plot_smooth.groupby("approach")
]
df_plot_smooth["accuracy"] = np.concatenate(smooth)
df_plot_smooth

In [None]:
# Smoothed accuracy versus depth / CN (left axis), drawn together with the distribution of
# loci over depth / CN (right axis).
fig, ax = plt.subplots(figsize=(12, 12))

sns.lineplot(
    data=df_plot_smooth,
    x="depth",
    y="accuracy",
    hue="approach",
    palette="colorblind",
    ax=ax,
)
ax.set(
    xlabel="Sequencing depth / CN",
    ylabel="Accuracy",
    ylim=(0, 1.05),
    xlim=(0, 60),
)

# Secondary y-axis sharing the same x-axis: proportion of loci at each depth / CN value.
ax2 = ax.twinx()
sns.histplot(
    data=df_plot_density,
    x="depth",
    discrete=True,
    stat="proportion",
    color="grey",
    ax=ax2,
)
ax2.set(ylabel="Proportion of STR loci", ylim=(0, 0.07))

ax.get_legend().set(loc="center right", title="")

plt.show()

In [None]:
# Display the per-locus table again (match flags, period, depth / CN); the per-period aggregation
# in the following cell is built from it.
df_plot_density

In [None]:
from scipy import ndimage

# Raw accuracy of each approach at every observed (period, depth / CN) combination, together with
# the number of loci behind each point.
df_plot_smooth_period = (
    df_plot_density
        .melt(id_vars=["period", "depth"], var_name="approach", value_name="match")
        .groupby(["approach", "period", "depth"], observed=True, as_index=False).agg(
            accuracy=("match", lambda x: x.sum() / len(x)),
            n_obs=("match", "count"))
)

# Smooth every (approach, period) curve on its own. Grouping by approach alone would run the
# Gaussian kernel over the six period curves laid end to end, so the values near the low- and
# high-depth ends of one period would be blended with a different period's accuracies.
# transform() writes the result back aligned on the row index, so no assumption about row order
# is needed beyond depth being sorted within each group (guaranteed by the groupby above).
df_plot_smooth_period["accuracy"] = (
    df_plot_smooth_period
        .groupby(["approach", "period"], sort=False)["accuracy"]
        .transform(lambda acc: ndimage.gaussian_filter1d(acc.to_numpy(dtype=float), 3, mode="reflect"))
)
df_plot_smooth_period

In [None]:
# One panel per STR period (1-6): smoothed accuracy versus depth / CN on the left axis and the
# distribution of that period's loci over depth / CN on the right axis.
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(20, 20))

for period, ax in enumerate(axes.flat, start=1):
    period_filter = f"period == {period}"
    df_subplot = df_plot_smooth_period.query(period_filter)

    # Only the period-4 panel carries the legend.
    add_legend = period == 4
    sns.lineplot(
        data=df_subplot,
        x="depth",
        y="accuracy",
        hue="approach",
        palette="colorblind",
        linewidth=5,
        ax=ax,
        legend=add_legend,
    )
    if add_legend:
        ax.get_legend().set(loc="center left", title=None)

    ax.set(
        title=f"Period {period}",
        xlabel="Sequencing depth / CN",
        ylabel="Accuracy",
        ylim=(0, 1.05),
        xlim=(0, 55),
    )

    # Secondary y-axis: proportion of this period's loci at each depth / CN value.
    # The same fixed y-range is used in every panel.
    ax2 = ax.twinx()
    sns.histplot(
        data=df_plot_density.query(period_filter),
        x="depth",
        discrete=True,
        stat="proportion",
        color="grey",
        ax=ax2,
    )
    ax2.set(ylabel="Proportion of STR loci", ylim=(0, 0.07))

plt.tight_layout()