In [None]:
import os
import sys
# SRC_DIR environment variable should be the absolute path to the 'ConSTRain-analyses' directory
sys.path.append(os.environ["SRC_DIR"])

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from multicopy_STR_genotyping import file_io

sns.set_context("poster")
%matplotlib inline

In [3]:
# Notebook parameters: input/output paths and the switches used by the filtering cells.
vcf = "../../../data/HG002/variants/ConSTRain/v0.9.1/HG002.GRCh38.2x250_depth10x_with_mononuc.vcf" # str: path to input vcf file
vcf_format = "ConSTRain" # str: which tool was used to genotype
sample = "HG002.GRCh38.2x250_depth10x" # str: name of sample in vcf to analyse
set_cn = False # bool: set CN of loci manually (based on chromosome name)
gt_in_bp = False # bool: does the tool report genotypes in bp (True) or in number of repeat units (False)
strs_covered_by_haplotypes = "../../../data/HG002/regions/hg38_ver13_0boe_mononucleotides_union.bed" # str: path to bed with loci that are covered by the haplotypes provided by Q100
haplotype_str_length = "../../../data/HG002/variants/HG002_GRCh38_STR_lengths.csv" # str: path to csv with STR allele lengths based on Q100 haplotypes
comparison_file_out = None # None or str: path to write csv with comparison of STR allele lengths from vcf to Q100 STR allele lengths

# depth_filter = 0.05 # None or float. Alpha to use for filtering loci with extreme depth values
# None or float: inclusive bounds on depth_norm (depth / copy_number); loci outside
# [lower, upper] are dropped by the depth filter. Set either to None to skip that filter.
lower, upper = 1.5, 6.5
# bool: remove loci whose str_id appears in the '*_in_segdup.bed' file read by the segdup filter cell
filt_segdup = True

In [4]:
def set_cn_manually(df):
    """Return a copy of ``df`` whose ``copy_number`` is derived from ``chr``.

    Rows on chr1..chr22 get copy number 2; every other chromosome name
    (chrX, chrY, anything else) gets copy number 1. The input frame is not
    modified.
    """
    diploid_chroms = [f"chr{number}" for number in range(1, 23)]

    def _copy_numbers(frame):
        # One entry per row, in row order.
        numbers = []
        for chrom in frame["chr"]:
            if chrom in diploid_chroms:
                numbers.append(2)
            else:
                numbers.append(1)
        return numbers

    return df.assign(copy_number=_copy_numbers)

In [5]:
# Parse the VCF into two frames: df_repeats and the calls for `sample` (df_hg002).
df_repeats, df_hg002 = file_io.dfs_from_vcf(filename=vcf, samples=[sample], vcf_format=vcf_format)

# Drop rows with missing values, then derive the depth (sum of the allele
# frequency counts) and the depth per copy (depth / copy_number).
df_hg002 = df_hg002.dropna()
df_hg002 = df_hg002.assign(depth=[sum(counts.values()) for counts in df_hg002["frequencies"]])
df_hg002 = df_hg002.assign(depth_norm=df_hg002["depth"] / df_hg002["copy_number"])
df_hg002 = df_hg002.reset_index(drop=True)

# Attach the repeat unit length (period) of every locus.
locus_periods = df_repeats[["str_id", "period"]]
df_hg002 = df_hg002.merge(locus_periods, on="str_id", how="left")
print(df_repeats.shape, df_hg002.shape)


(1733646, 7) (1453440, 8)


In [6]:
# Keep only loci whose depth per copy (depth_norm) lies in [lower, upper].
# The bounds are compared against None explicitly: 0 is a legitimate bound and
# must not switch the filter off the way a truthiness check would.
if lower is not None and upper is not None:
    print(df_hg002.shape)
    # Series.between is inclusive on both ends, i.e. lower <= depth_norm <= upper.
    within_depth_bounds = df_hg002["depth_norm"].between(lower, upper)
    df_hg002 = df_hg002[within_depth_bounds].reset_index(drop=True)
    print(df_hg002.shape)
    

(1453440, 8)
(1266055, 8)


In [7]:
# Loci covered by the Q100 haplotypes, read from a headerless BED-like table.
bed_columns = ["chr", "start", "end", "period", "unit"]
df_covered_repeats = pd.read_csv(strs_covered_by_haplotypes, sep="\t", names=bed_columns)

# Locus identifiers are "<chr>_<start + 1>".
# NOTE(review): presumably the +1 converts 0-based BED starts to the 1-based
# positions used in df_repeats["str_id"] - confirm against file_io.
covered_ids = np.array([
    f"{chrom}_{start + 1}"
    for chrom, start in zip(df_covered_repeats.chr, df_covered_repeats.start)
])
df_covered_repeats = df_covered_repeats.assign(str_id=covered_ids)

# From here on df_covered_repeats holds the df_repeats rows of the covered loci.
is_covered = df_repeats["str_id"].isin(df_covered_repeats["str_id"])
df_covered_repeats = df_repeats[is_covered]

# Keep only calls at covered loci and attach the locus annotation.
locus_columns = ["str_id", "chr", "start", "end", "unit", "ref"]
df_hg002 = df_hg002.merge(df_covered_repeats[locus_columns], on="str_id", how="inner")

if set_cn:
    df_hg002 = set_cn_manually(df_hg002)

df_hg002 = df_hg002.dropna().reset_index(drop=True)

print(df_covered_repeats.shape, df_hg002.shape)

df_hg002


(1695865, 7) (1249527, 13)


Unnamed: 0,sample,str_id,copy_number,frequencies,genotype,depth,depth_norm,period,chr,start,end,unit,ref
0,HG002.GRCh38.2x250_depth10x,chr1_590659,2,{3: 5},"[3, 3]",5,2.5,4,chr1,590659,590670,AAAT,3
1,HG002.GRCh38.2x250_depth10x,chr1_590969,2,{4: 5},"[4, 4]",5,2.5,4,chr1,590969,590984,AAAC,4
2,HG002.GRCh38.2x250_depth10x,chr1_594083,2,{4: 3},"[4, 4]",3,1.5,3,chr1,594083,594094,TCC,4
3,HG002.GRCh38.2x250_depth10x,chr1_597686,2,{14: 4},"[14, 14]",4,2.0,1,chr1,597686,597699,A,14
4,HG002.GRCh38.2x250_depth10x,chr1_599683,2,{5: 10},"[5, 5]",10,5.0,5,chr1,599683,599707,TTTTG,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1249522,HG002.GRCh38.2x250_depth10x,chrY_26619402,1,{12: 2},[12],2,2.0,1,chrY,26619402,26619413,A,12
1249523,HG002.GRCh38.2x250_depth10x,chrY_26619856,1,{10: 2},[10],2,2.0,1,chrY,26619856,26619865,T,10
1249524,HG002.GRCh38.2x250_depth10x,chrY_26622472,1,{2: 4},[2],4,4.0,5,chrY,26622472,26622481,AAATA,2
1249525,HG002.GRCh38.2x250_depth10x,chrY_26625608,1,{2: 2},[2],2,2.0,7,chrY,26625608,26625621,TAAGTAT,2


In [8]:
# Optionally discard every locus listed in the "in_segdup" BED file.
if filt_segdup:
    df_in_segdup = pd.read_csv(
        "../../../data/hg38_ver13_0boe_mononucleotides_in_segdup.bed",
        sep="\t",
        names=["chr", "start", "end", "period", "unit"],
    )
    # Same "<chr>_<start + 1>" identifier scheme as used for the covered loci.
    segdup_ids = [
        f"{chrom}_{start + 1}"
        for chrom, start in zip(df_in_segdup["chr"], df_in_segdup["start"])
    ]
    df_in_segdup = df_in_segdup.assign(str_id=segdup_ids)

    outside_segdup = ~df_hg002["str_id"].isin(df_in_segdup["str_id"])
    df_hg002 = df_hg002[outside_segdup].reset_index(drop=True)
    print(df_hg002.shape)

(1202944, 13)


In [9]:
print(df_hg002.shape)
# Percentage of the haplotype-covered loci (rows of df_covered_repeats) that
# still have a call in df_hg002 after the filters applied so far.
df_hg002.shape[0] / df_covered_repeats.shape[0] * 100

(1202944, 13)


70.93394816214735

In [10]:
# Depth per locus: total of the allele frequency counts (same for both branches).
locus_depth = [sum(counts.values()) for counts in df_hg002["frequencies"]]

# Express every genotype as a sorted list of allele lengths in bp.
if gt_in_bp:
    # Genotypes are already in bp; only sort them.
    allele_lengths = [sorted(alleles) for alleles in df_hg002["genotype"]]
else:
    # Genotypes are in repeat units; multiply by the period to get bp.
    allele_lengths = [
        sorted([allele * period for allele in alleles])
        for alleles, period in zip(df_hg002["genotype"], df_hg002["period"])
    ]

df_hg002 = df_hg002.assign(illumina_region_len=allele_lengths, depth=locus_depth)

df_hg002

Unnamed: 0,sample,str_id,copy_number,frequencies,genotype,depth,depth_norm,period,chr,start,end,unit,ref,illumina_region_len
0,HG002.GRCh38.2x250_depth10x,chr1_904440,2,{3: 8},"[3, 3]",8,4.0,3,chr1,904440,904448,CTC,3,"[9, 9]"
1,HG002.GRCh38.2x250_depth10x,chr1_904654,2,{4: 7},"[4, 4]",7,3.5,3,chr1,904654,904665,CCT,4,"[12, 12]"
2,HG002.GRCh38.2x250_depth10x,chr1_914949,2,{16: 8},"[16, 16]",8,4.0,1,chr1,914949,914964,A,16,"[16, 16]"
3,HG002.GRCh38.2x250_depth10x,chr1_919012,2,{3: 8},"[3, 3]",8,4.0,4,chr1,919012,919023,GCAG,3,"[12, 12]"
4,HG002.GRCh38.2x250_depth10x,chr1_919291,2,{2: 8},"[2, 2]",8,4.0,5,chr1,919291,919300,CACCC,2,"[10, 10]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1202939,HG002.GRCh38.2x250_depth10x,chrY_26474096,1,{3: 3},[3],3,3.0,4,chrY,26474096,26474107,CAAA,3,[12]
1202940,HG002.GRCh38.2x250_depth10x,chrY_26476426,1,{14: 2},[14],2,2.0,1,chrY,26476426,26476439,T,14,[14]
1202941,HG002.GRCh38.2x250_depth10x,chrY_26487819,1,{10: 2},[10],2,2.0,1,chrY,26487819,26487828,A,10,[10]
1202942,HG002.GRCh38.2x250_depth10x,chrY_26487987,1,{10: 5},[10],5,5.0,1,chrY,26487987,26487996,A,10,[10]


In [11]:
# STR lengths measured on the haplotypes, one row per locus.
df_strs_haplotypes = pd.read_csv(haplotype_str_length)

# The chromosome is taken as the text before the first "_" of str_id.
# NOTE(review): a contig name that itself contains "_" would be cut at its
# first underscore - confirm that only plain chrN/chrX/chrY names occur here.
id_parts = df_strs_haplotypes["str_id"].str.split("_", n=1)
df_strs_haplotypes = df_strs_haplotypes.assign(chr=[parts[0] for parts in id_parts])
df_strs_haplotypes

Unnamed: 0,str_id,region_len_ref,region_len_mat,region_len_pat,chr
0,chr1_588068,12,12.0,12.0,chr1
1,chr1_589245,12,12.0,12.0,chr1
2,chr1_590659,12,12.0,12.0,chr1
3,chr1_590969,16,16.0,16.0,chr1
4,chr1_591734,18,20.0,20.0,chr1
...,...,...,...,...,...
1695860,chrY_57188868,11,,11.0,chrY
1695861,chrY_57200838,14,,15.0,chrY
1695862,chrY_57201000,20,,20.0,chrY
1695863,chrY_57202381,23,,23.0,chrY


In [12]:
# Query strings selecting the three chromosome classes. The column below is now
# built row by row and no longer needs them; they are kept because they are
# notebook-level names that other cells may reference.
autosomes = "chr != 'chrX' and chr != 'chrY'"
X = "chr == 'chrX'"
Y = "chr == 'chrY'"


def _haplotype_region_lengths(chrom, len_mat, len_pat):
    """Return the expected allele lengths of one locus as a list of np.int64.

    chrX loci use only the maternal length, chrY loci only the paternal
    length, and every other chromosome uses both lengths in ascending order.
    As before, a missing (NaN) length that is actually needed raises when it
    is converted to an integer.
    """
    if chrom == "chrX":
        return [np.int64(len_mat)]
    if chrom == "chrY":
        return [np.int64(len_pat)]
    return sorted([np.int64(len_mat), np.int64(len_pat)])


# Build the whole column in one pass and attach it with the frame's own index.
# The previous implementation assigned the autosomal values through a Series
# with a fresh 0..n-1 index, which only lined up with the target rows when the
# autosomal loci happened to be the first rows of the frame.
df_strs_haplotypes["haplo_region_len"] = pd.Series(
    [
        _haplotype_region_lengths(chrom, len_mat, len_pat)
        for chrom, len_mat, len_pat in zip(
            df_strs_haplotypes["chr"],
            df_strs_haplotypes["region_len_mat"],
            df_strs_haplotypes["region_len_pat"],
        )
    ],
    index=df_strs_haplotypes.index,
    dtype=object,
)

df_strs_haplotypes

Unnamed: 0,str_id,region_len_ref,region_len_mat,region_len_pat,chr,haplo_region_len
0,chr1_588068,12,12.0,12.0,chr1,"[12, 12]"
1,chr1_589245,12,12.0,12.0,chr1,"[12, 12]"
2,chr1_590659,12,12.0,12.0,chr1,"[12, 12]"
3,chr1_590969,16,16.0,16.0,chr1,"[16, 16]"
4,chr1_591734,18,20.0,20.0,chr1,"[20, 20]"
...,...,...,...,...,...,...
1695860,chrY_57188868,11,,11.0,chrY,[11]
1695861,chrY_57200838,14,,15.0,chrY,[15]
1695862,chrY_57201000,20,,20.0,chrY,[20]
1695863,chrY_57202381,23,,23.0,chrY,[23]


In [13]:
# Pair the haplotype-derived allele lengths with the genotyped allele lengths
# per locus; only loci present in both tables are kept (inner join on str_id).
haplotype_lengths = df_strs_haplotypes[["str_id", "haplo_region_len"]]
genotyped_lengths = df_hg002[["str_id", "illumina_region_len", "depth", "copy_number", "period"]]

df_comparison = haplotype_lengths.merge(genotyped_lengths, on="str_id", how="inner")

# A locus matches when both sorted allele-length lists are equal.
df_comparison = df_comparison.assign(
    match=df_comparison["haplo_region_len"] == df_comparison["illumina_region_len"]
)

df_comparison

Unnamed: 0,str_id,haplo_region_len,illumina_region_len,depth,copy_number,period,match
0,chr1_904440,"[9, 9]","[9, 9]",8,2,3,True
1,chr1_904654,"[12, 12]","[12, 12]",7,2,3,True
2,chr1_914949,"[16, 16]","[16, 16]",8,2,1,True
3,chr1_919012,"[12, 12]","[12, 12]",8,2,4,True
4,chr1_919291,"[10, 10]","[10, 10]",8,2,5,True
...,...,...,...,...,...,...,...
1202939,chrY_26474096,[12],[12],3,1,4,True
1202940,chrY_26476426,[14],[14],2,1,1,True
1202941,chrY_26487819,[10],[10],2,1,1,True
1202942,chrY_26487987,[10],[10],5,1,1,True


In [14]:
# Optionally persist the per-locus comparison; skipped when comparison_file_out
# is None (or an empty string).
if comparison_file_out:
    df_comparison.to_csv(comparison_file_out, index=False)

In [16]:
# NOTE(review): leftover scratch arithmetic. The operands are literals that are
# not derived from any variable in this notebook - confirm what they count and
# either compute them from the data or delete this cell.
1332322 + 99232

1431554

In [17]:
# Overall concordance between genotyped and haplotype-derived allele lengths.
is_mismatch = ~df_comparison["match"]

n_wrong = df_comparison.loc[is_mismatch].shape[0]
n_total = df_comparison.shape[0]
n_correct = n_total - n_wrong

print(f"n correct: {n_correct}, n wrong: {n_wrong}, n total: {n_total}")
print("Overall accuracy:", (1 - (n_wrong / n_total)) * 100)

# Show the discordant loci.
df_comparison.loc[is_mismatch]

n correct: 1139731, n wrong: 63213, n total: 1202944
Overall accuracy: 94.74514191849329


Unnamed: 0,str_id,haplo_region_len,illumina_region_len,depth,copy_number,period,match
54,chr1_1020848,"[13, 14]","[13, 13]",3,2,1,False
72,chr1_1081643,"[11, 12]","[10, 10]",3,2,5,False
153,chr1_1293732,"[60, 80]","[60, 60]",5,2,20,False
189,chr1_1363439,"[13, 13]","[15, 15]",7,2,1,False
192,chr1_1369036,"[16, 16]","[16, 17]",7,2,1,False
...,...,...,...,...,...,...,...
1200369,chrY_7263346,[15],[14],3,1,1,False
1201340,chrY_14296235,[38],[36],2,1,2,False
1201368,chrY_14409113,[15],[16],6,1,1,False
1201513,chrY_14892739,[13],[14],2,1,1,False


In [None]:
# Accuracy per (period, depth-per-copy bin).
#
# Each bin is [depth_lower, depth_upper) together with the label shown in the
# plots, so edges and labels cannot drift apart. The last bin is open-ended;
# 1e6 stands in for "no upper limit".
depth_bins = [
    (1, 5, "1-5"),
    (5, 10, "5-10"),
    (10, 15, "10-15"),
    (15, 20, "15-20"),
    (20, 25, "20-25"),
    (25, 30, "25-30"),
    (30, 1e6, "30-"),
]
periods = [1, 2, 3, 4, 5, 6]

# One row per period/bin combination, periods varying slowest. The columns are
# stored as floats, as they were when the table was assembled from numpy arrays.
df_plot = pd.DataFrame(
    [
        (period, depth_lower, depth_upper)
        for period in periods
        for depth_lower, depth_upper, _label in depth_bins
    ],
    columns=["period", "depth_lower", "depth_upper"],
    dtype=float,
)

# Depth per copy does not depend on the bin, so compute it once instead of on
# every iteration.
depth_per_copy = df_comparison["depth"] / df_comparison["copy_number"]
is_mismatch = ~df_comparison["match"]

accuracy = []
n = []
for row in df_plot.to_dict(orient="records"):
    # Select the rows of the bin directly. Going through str_id (as was done
    # before) would also pull in rows that merely share an id with a selected row.
    in_bin = (
        (depth_per_copy >= row["depth_lower"])
        & (depth_per_copy < row["depth_upper"])
        & (df_comparison["period"] == row["period"])
    )
    current_n = int(in_bin.sum())
    if current_n == 0:
        # Empty bin: no accuracy can be computed.
        n.append(np.nan)
        accuracy.append(np.nan)
    else:
        wrong_in_bin = int((in_bin & is_mismatch).sum())
        n.append(current_n)
        accuracy.append((1 - (wrong_in_bin / current_n)) * 100)

df_plot["n"] = n
df_plot["accuracy"] = accuracy
df_plot["depth"] = [label for _period in periods for _low, _high, label in depth_bins]

In [None]:
def _print_weighted_accuracy(df, by):
    """Print, for every group of ``by``, the n-weighted mean accuracy and the total n.

    Bins without loci carry NaN in both ``n`` and ``accuracy``. They are left
    out of the average, because a single NaN would otherwise turn the result
    for the whole group into NaN. A group without any populated bin prints NaN.
    """
    for name, data in df.groupby(by):
        populated = data.dropna(subset=["n", "accuracy"])
        if populated["n"].sum() > 0:
            weighted_accuracy = np.average(populated["accuracy"], weights=populated["n"])
        else:
            weighted_accuracy = np.nan
        print(name, weighted_accuracy, data["n"].sum())


# Accuracy per depth bin (over all periods), then per period (over all bins).
_print_weighted_accuracy(df_plot, "depth")
print()
_print_weighted_accuracy(df_plot, "period")



In [None]:
# Grouped bar chart: accuracy per period, one bar per depth-per-copy bin.
fig, ax = plt.subplots(figsize=(10, 8))

sns.barplot(df_plot, x="period", y="accuracy", hue="depth", palette="colorblind", ax=ax)
ax.set(xlabel="Period", ylabel="Accuracy")

# Retitle the legend and anchor it at the top-right corner of the axes.
bar_legend = ax.get_legend()
_ = bar_legend.set(title="Depth / CN", bbox_to_anchor=(1, 1))

In [None]:
from scipy import ndimage

# Per-locus match flag with depth expressed per copy, restricted to periods 1-6.
df_plot_density = (
    df_comparison
        .query("period >= 1 and period <= 6")
        .assign(depth = lambda x: x["depth"] / x["copy_number"])[["match", "period", "depth"]]
)


def _smooth_accuracy(values):
    """Gaussian-smooth a sequence of accuracies (sigma = 3 rows, reflected edges).

    The filter runs over consecutive rows, so the caller must pass the values
    ordered by depth. NOTE(review): rows are treated as equally spaced and are
    not weighted by their number of observations - confirm this is intended.
    """
    return ndimage.gaussian_filter1d(np.asarray(values, dtype=float), 3, mode="reflect")


# Accuracy (fraction of matching loci) and locus count per depth value.
# groupby sorts by depth, which is the order the smoothing relies on.
df_plot_smooth = (
    df_plot_density
        .groupby("depth", observed=True, as_index=False)
        .agg(accuracy=("match", "mean"), n_obs=("match", "count"))
)
df_plot_smooth = df_plot_smooth.assign(smooth=_smooth_accuracy(df_plot_smooth["accuracy"]))

# The same summary, split by period.
df_plot_smooth_period = (
    df_plot_density
        .groupby(["period", "depth"], observed=True, as_index=False)
        .agg(accuracy=("match", "mean"), n_obs=("match", "count"))
)

# Smooth within each period. transform() returns the values aligned to the
# original rows, so the result does not depend on the order in which the
# groups are visited (unlike concatenating per-group arrays).
df_plot_smooth_period = df_plot_smooth_period.assign(
    smooth=df_plot_smooth_period.groupby("period")["accuracy"].transform(_smooth_accuracy)
)

df_plot_smooth_period

In [None]:
# Accuracy versus depth per copy over all periods: smoothed line plus the raw
# per-depth accuracies (left axis), with the distribution of loci over depth
# drawn as a histogram on a second y-axis (right axis).
fig = plt.figure(dpi=125)

ax = sns.lineplot(
    df_plot_smooth.assign(tmp="ConSTRain"), # constant hue column so the line is coloured from the palette; with legend=False no legend is drawn
    x = "depth",
    y = "smooth",
    hue="tmp",
    palette="colorblind",
    legend=False
)

# Raw (unsmoothed) accuracy per depth value, in the first palette colour.
ax = sns.scatterplot(
    df_plot_smooth,
    x = "depth",
    y = "accuracy",
    color=sns.color_palette("colorblind")[0],
    s=25,
    ax = ax,
)

ax.set(
    xlabel = "Sequencing depth / CN",
    ylabel = "Accuracy",
    ylim = (0, 1.05),
    xlim = (0, 20)
)
# Colour the axis label like the accuracy series it belongs to.
ax.yaxis.label.set_color(color=sns.color_palette("colorblind")[0])

# Second y-axis sharing the x-axis: proportion of loci at each depth value.
ax2 = ax.twinx()
ax2 = sns.histplot(
    df_plot_density,
    x = "depth",
    discrete=True,
    stat="proportion",
    color="grey",
    ax=ax2,
)
ax2.set(
    ylabel="Proportion of STR loci",
    # ylim=(0, 0.07),
)
ax2.yaxis.label.set_color(color="grey")

plt.show()

In [None]:
# One panel per period (1-6): smoothed and raw accuracy versus depth per copy
# on the left axis, distribution of loci over depth on the right axis.
fig, axes = plt.subplots(3, 2, figsize=(20, 20))
accuracy_colour = sns.color_palette("colorblind")[0]

for period, ax in zip(range(1, 7), axes.ravel()):
    period_filter = f"period == {period}"
    df_subplot = df_plot_smooth_period.query(period_filter)

    # Smoothed trend and the raw per-depth accuracies.
    sns.lineplot(df_subplot, x="depth", y="smooth", color=accuracy_colour, linewidth=5, ax=ax)
    sns.scatterplot(df_subplot, x="depth", y="accuracy", color=accuracy_colour, s=25, ax=ax)

    ax.set(
        title=f"Period {period}",
        xlabel="Sequencing depth / CN",
        ylabel="Accuracy",
        ylim=(0, 1.05),
        xlim=(0, 60),
    )
    ax.yaxis.label.set_color(color=accuracy_colour)

    # Second y-axis: proportion of this period's loci at each depth value.
    ax2 = ax.twinx()
    sns.histplot(
        df_plot_density.query(period_filter),
        x="depth",
        discrete=True,
        stat="proportion",
        color="grey",
        ax=ax2,
    )
    ax2.set(ylabel="Proportion of STR loci", ylim=(0, 0.08))
    ax2.yaxis.label.set_color(color="grey")

plt.tight_layout()