In [1]:
import os
import sys
# SRC_DIR environment variable should be the absolute path to the 'multicopy-STR-genotyping' directory
src_dir = os.environ.get("SRC_DIR")
if src_dir is None:
    # Same exception type as a plain os.environ lookup, but with a message that
    # tells the reader what to do about it.
    raise KeyError(
        "SRC_DIR is not set; export it as the absolute path to the "
        "'multicopy-STR-genotyping' directory before starting the kernel"
    )
# Guard the append so that re-running this cell does not keep stacking
# duplicate entries onto sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from multicopy_STR_genotyping import file_io

# Notebook-wide plotting setup.
# Select seaborn's "poster" plotting context for figures drawn in this notebook.
sns.set_context("poster")
# IPython magic (not plain Python): show matplotlib figures inline in the cell output.
%matplotlib inline

In [55]:
# Parse the GangSTR VCF for the simulated trisomy 21 sample. The loader returns
# two frames: a per-locus one (df_repeats_gangstr) and a per-sample call table.
df_repeats_gangstr, df_gangstr = file_io.dfs_from_vcf(
    filename="../../../data/simulated_reads/variants/GangSTR_simulated_trisomy_21.vcf",
    samples=["simulated_trisomy_21"],
    vcf_format="GangSTR",
)

# Pin copy_number to 2 and tag the call columns with the caller's name so they
# stay distinguishable once merged with the HipSTR calls.
# NOTE(review): presumably 2 reflects the caller's diploid model - confirm.
df_gangstr = (
    df_gangstr
    .assign(copy_number=2)
    .rename(columns={"frequencies": "frequencies_gangstr", "genotype": "genotype_gangstr"})
)
df_gangstr

Unnamed: 0,sample,str_id,copy_number,frequencies_gangstr,genotype_gangstr
0,simulated_trisomy_21,chr21_5011212,2,{3: 56},"[3, 3]"
1,simulated_trisomy_21,chr21_5013077,2,{6: 50},"[6, 6]"
2,simulated_trisomy_21,chr21_5014892,2,{6: 44},"[6, 6]"
3,simulated_trisomy_21,chr21_5016248,2,{6: 41},"[6, 6]"
4,simulated_trisomy_21,chr21_5016789,2,"{9: 11, 10: 26}","[9, 10]"
...,...,...,...,...,...
21493,simulated_trisomy_21,chr21_46688832,2,"{12: 16, 16: 30}","[12, 16]"
21494,simulated_trisomy_21,chr21_46689712,2,{3: 46},"[3, 3]"
21495,simulated_trisomy_21,chr21_46696282,2,{11: 26},"[11, 11]"
21496,simulated_trisomy_21,chr21_46697535,2,{20: 29},"[20, 20]"


In [28]:
# Parse the HipSTR VCF (the sample is named "tri21" in this file).
df_repeats_hipstr, df_hipstr = file_io.dfs_from_vcf(
    filename="../../../data/simulated_reads/variants/HipSTR_simulated_trisomy_21.vcf",
    samples=["tri21"],
    vcf_format="HipSTR",
)

# Same post-processing as for the other caller: copy_number set to 2 and the
# call columns tagged with the caller's name. Each genotype is additionally
# turned into a sorted list.
df_hipstr = (
    df_hipstr
    .assign(copy_number=2)
    .rename(columns={"frequencies": "frequencies_hipstr", "genotype": "genotype_hipstr"})
    .assign(genotype_hipstr=lambda calls: calls["genotype_hipstr"].map(sorted))
)
df_hipstr

Unnamed: 0,sample,str_id,copy_number,frequencies_hipstr,genotype_hipstr
0,tri21,chr21_5011212,2,{12: 58},"[12, 12]"
1,tri21,chr21_5013077,2,{12: 54},"[12, 12]"
2,tri21,chr21_5014892,2,{12: 46},"[12, 12]"
3,tri21,chr21_5016248,2,{18: 41},"[18, 18]"
4,tri21,chr21_5016789,2,"{9: 12, 10: 33}","[9, 10]"
...,...,...,...,...,...
20888,tri21,chr21_46688832,2,"{24: 17, 32: 33}","[24, 32]"
20889,tri21,chr21_46689712,2,{12: 47},"[12, 12]"
20890,tri21,chr21_46696282,2,{11: 38},"[11, 11]"
20891,tri21,chr21_46697535,2,{20: 41},"[20, 20]"


In [29]:
# Put both callers' results side by side, keeping only the loci that appear in
# both call sets (inner join on str_id).
df_str_calls = pd.merge(
    df_hipstr[["str_id", "frequencies_hipstr", "genotype_hipstr"]],
    df_gangstr[["str_id", "frequencies_gangstr", "genotype_gangstr"]],
    how="inner",
    on="str_id",
)
df_str_calls

Unnamed: 0,str_id,frequencies_hipstr,genotype_hipstr,frequencies_gangstr,genotype_gangstr
0,chr21_5011212,{12: 58},"[12, 12]",{3: 56},"[3, 3]"
1,chr21_5013077,{12: 54},"[12, 12]",{6: 50},"[6, 6]"
2,chr21_5014892,{12: 46},"[12, 12]",{6: 44},"[6, 6]"
3,chr21_5016248,{18: 41},"[18, 18]",{6: 41},"[6, 6]"
4,chr21_5016789,"{9: 12, 10: 33}","[9, 10]","{9: 11, 10: 26}","[9, 10]"
...,...,...,...,...,...
20872,chr21_46688832,"{24: 17, 32: 33}","[24, 32]","{12: 16, 16: 30}","[12, 16]"
20873,chr21_46689712,{12: 47},"[12, 12]",{3: 46},"[3, 3]"
20874,chr21_46696282,{11: 38},"[11, 11]",{11: 26},"[11, 11]"
20875,chr21_46697535,{20: 41},"[20, 20]",{20: 29},"[20, 20]"


In [30]:
# HG002 STR lengths, limited to the loci present in the combined call table and
# re-indexed from zero.
df_repeats_hg002 = (
    pd.read_csv("../../../data/HG002/variants/HG002_GRCh38_STR_lengths.csv")
    .loc[lambda lengths: lengths["str_id"].isin(df_str_calls["str_id"])]
    .reset_index(drop=True)
)
df_repeats_hg002

Unnamed: 0,str_id,region_len_ref,region_len_mat,region_len_pat
0,chr21_5354757,12,12.0,12.0
1,chr21_7205148,18,18.0,18.0
2,chr21_8987815,16,16.0,16.0
3,chr21_8987982,10,10.0,10.0
4,chr21_8988058,20,12.0,20.0
...,...,...,...,...
18917,chr21_46688832,32,24.0,32.0
18918,chr21_46689712,12,12.0,12.0
18919,chr21_46696282,11,11.0,11.0
18920,chr21_46697535,20,20.0,20.0


In [31]:
# The BED file is read without a header row, so the column names are supplied here.
names = ["chr", "start", "end", "period", "unit"]
df_strs_in_segdup = (
    pd.read_csv(
        "../../../data/hg38_ver13_0boe_mononucleotides_in_segdup.bed",
        sep="\t",
        names=names,
    )
    # Only chromosome 21 is analysed in this notebook.
    .loc[lambda bed: bed["chr"] == "chr21"]
    # str_id is "<chr>_<start + 1>".
    # NOTE(review): the +1 presumably converts the 0-based BED start into the
    # 1-based position used by the str_id values of the call tables - confirm.
    .assign(
        str_id=lambda bed: [
            f"{chrom}_{pos}" for chrom, pos in zip(bed["chr"], bed["start"] + 1)
        ]
    )
)
df_strs_in_segdup

Unnamed: 0,chr,start,end,period,unit,str_id
50287,chr21,5011211,5011223,4,AATC,chr21_5011212
50288,chr21,5013076,5013088,2,CA,chr21_5013077
50289,chr21,5014891,5014903,2,CA,chr21_5014892
50290,chr21,5016247,5016265,3,GAG,chr21_5016248
50291,chr21,5016788,5016798,1,G,chr21_5016789
...,...,...,...,...,...,...
53380,chr21,46688831,46688863,2,AC,chr21_46688832
53381,chr21,46689711,46689723,4,ATTT,chr21_46689712
53382,chr21,46696281,46696292,1,T,chr21_46696282
53383,chr21,46697534,46697554,1,A,chr21_46697535


In [64]:
# Build the per-locus comparison table: HG002 haplotype lengths joined with both
# callers' genotypes, restricted to loci that are NOT listed in df_strs_in_segdup.
df_comparison = (
    df_repeats_hg002
    .merge(df_str_calls, on="str_id", how="inner")
    # Name the join key explicitly rather than relying on whichever column
    # names happen to overlap between the two frames.
    .merge(df_repeats_gangstr[["str_id", "period", "ref"]], on="str_id", how="inner")
    .assign(
        # convert_dtypes() does not take a target dtype: its first positional
        # parameter is the infer_objects flag, so passing `int` only worked
        # because it is truthy. The bare call does the same conversion
        # (whole-number float columns become a nullable integer dtype).
        region_len_mat=lambda x: x["region_len_mat"].convert_dtypes(),
        region_len_pat=lambda x: x["region_len_pat"].convert_dtypes(),
        # Scale the GangSTR genotypes by the repeat period so they are on the
        # same scale as the other length columns.
        # NOTE(review): relies on every genotype supporting elementwise `*`
        # (e.g. a numpy array); a plain Python list would be repeated instead -
        # confirm against file_io.dfs_from_vcf.
        genotype_gangstr=lambda x: x["genotype_gangstr"] * x["period"],
        # Simulated three-copy genotype: maternal, paternal and reference
        # lengths of the locus, sorted ascending.
        genotype_sim=lambda x: [
            sorted(lengths)
            for lengths in zip(x["region_len_mat"], x["region_len_pat"], x["region_len_ref"])
        ],
        n_distinct_alleles=lambda x: [len(set(alleles)) for alleles in x["genotype_sim"]],
    )
    # "unit" used to be listed here as well, but none of the merged frames
    # supplies such a column, so filter() was silently ignoring it.
    .filter(
        [
            "str_id",
            "period",
            "ref",
            "genotype_gangstr",
            "frequencies_gangstr",
            "genotype_hipstr",
            "frequencies_hipstr",
            "genotype_sim",
            "n_distinct_alleles",
        ],
        axis=1,
    )
    # Drop every locus whose str_id appears in the segdup BED table.
    .loc[lambda x: ~x["str_id"].isin(df_strs_in_segdup["str_id"])]
    .reset_index(drop=True)
)
df_comparison

Unnamed: 0,str_id,period,ref,genotype_gangstr,frequencies_gangstr,genotype_hipstr,frequencies_hipstr,genotype_sim,n_distinct_alleles
0,chr21_10324665,5,3,"[15, 15]",{3: 17},"[15, 15]",{15: 23},"[15, 15, 15]",1
1,chr21_12974501,1,10,"[10, 10]",{10: 35},"[10, 10]",{10: 53},"[10, 10, 10]",1
2,chr21_13987256,6,2,"[12, 12]",{2: 37},"[12, 12]",{12: 48},"[12, 12, 12]",1
3,chr21_13987386,5,2,"[10, 10]",{2: 37},"[10, 10]",{10: 54},"[10, 10, 10]",1
4,chr21_13998484,1,12,"[12, 13]","{12: 18, 13: 14, 14: 9}","[12, 13]","{-8: 1, 12: 24, 13: 22, 14: 15}","[12, 13, 14]",3
...,...,...,...,...,...,...,...,...,...
17743,chr21_46657138,1,10,"[10, 10]",{10: 45},"[10, 10]",{10: 53},"[10, 10, 10]",1
17744,chr21_46657377,1,14,"[14, 14]",{14: 19},"[14, 14]",{14: 29},"[14, 14, 14]",1
17745,chr21_46666129,1,11,"[11, 11]",{11: 28},"[11, 11]",{11: 37},"[11, 11, 11]",1
17746,chr21_46668752,1,14,"[12, 14]","{12: 20, 14: 22}","[12, 14]","{12: 24, 14: 29}","[12, 14, 14]",2


In [65]:
# Tally loci by the number of distinct alleles in the simulated genotype:
# the absolute counts are printed, the relative frequencies are the cell's value.
n_alleles_per_locus = df_comparison["n_distinct_alleles"]
print(n_alleles_per_locus.value_counts())
n_alleles_per_locus.value_counts(normalize=True)

n_distinct_alleles
1    13113
2     3825
3      810
Name: count, dtype: int64


n_distinct_alleles
1    0.738844
2    0.215517
3    0.045639
Name: proportion, dtype: float64

In [66]:
# First few loci whose simulated genotype contains exactly two distinct alleles.
df_comparison.loc[lambda loci: loci["n_distinct_alleles"] == 2].head()

Unnamed: 0,str_id,period,ref,genotype_gangstr,frequencies_gangstr,genotype_hipstr,frequencies_hipstr,genotype_sim,n_distinct_alleles
21,chr21_14094421,1,10,"[9, 10]","{9: 32, 10: 9}","[9, 10]","{9: 43, 10: 16}","[9, 9, 10]",2
27,chr21_14114178,1,16,"[16, 16]","{15: 7, 16: 27}","[15, 16]","{15: 10, 16: 34}","[15, 16, 16]",2
28,chr21_14124995,1,11,"[10, 11]","{10: 18, 11: 32}","[10, 11]","{10: 22, 11: 42}","[10, 11, 11]",2
37,chr21_14141387,1,13,"[11, 13]","{11: 26, 13: 10}","[11, 13]","{11: 35, 13: 16}","[11, 11, 13]",2
45,chr21_14163093,2,8,"[14, 16]","{7: 33, 8: 15}","[14, 16]","{14: 38, 16: 20}","[14, 14, 16]",2


In [99]:
# For each number of distinct simulated alleles, the share of loci at which
# GangSTR reported a given number of distinct alleles. Rows without GangSTR
# frequencies are left out.
# (The former `missing_gangstr` column was computed and then dropped by the
# column selection right after it, so it is no longer built.)
(
    df_comparison
    .dropna(subset="frequencies_gangstr")
    .assign(distinct_gangstr=lambda x: [len(set(gt)) for gt in x["genotype_gangstr"]])
    .filter(["n_distinct_alleles", "distinct_gangstr"])
    .groupby("n_distinct_alleles", as_index=False)
    .value_counts(normalize=True)
)

Unnamed: 0,n_distinct_alleles,distinct_gangstr,proportion
0,1,1,0.999542
1,1,2,0.000458
2,2,2,0.924706
3,2,1,0.075294
4,3,2,0.983951
5,3,1,0.016049


In [67]:
# Loci with two distinct simulated alleles where GangSTR reported a single
# distinct allele (first 10 shown).
# Rows without GangSTR frequencies are removed first, matching the GangSTR
# summary cell and the equivalent HipSTR cells, so that loci lacking a call
# cannot be counted here (or break the len(set(...)) step).
(
    df_comparison
    .dropna(subset="frequencies_gangstr")
    .query("n_distinct_alleles == 2")
    .assign(distinct_gangstr=lambda x: [len(set(gt)) for gt in x["genotype_gangstr"]])
    .query("distinct_gangstr == 1")
    .head(10)
)

Unnamed: 0,str_id,period,ref,genotype_gangstr,frequencies_gangstr,genotype_hipstr,frequencies_hipstr,genotype_sim,n_distinct_alleles,distinct_gangstr
27,chr21_14114178,1,16,"[16, 16]","{15: 7, 16: 27}","[15, 16]","{15: 10, 16: 34}","[15, 16, 16]",2,1
55,chr21_14191377,1,17,"[17, 17]","{17: 29, 18: 7}","[17, 18]","{17: 37, 18: 13}","[17, 17, 18]",2,1
75,chr21_14235352,2,8,"[16, 16]",{8: 56},"[16, 16]",{16: 57},"[16, 16, 17]",2,1
78,chr21_14243744,4,2,"[8, 8]",{2: 47},"[8, 8]",{8: 58},"[6, 8, 8]",2,1
88,chr21_14268035,2,12,"[24, 24]",{12: 25},"[24, 24]",{24: 43},"[21, 24, 24]",2,1
107,chr21_14303570,5,8,"[40, 40]",{8: 46},"[40, 40]",{40: 46},"[40, 40, 42]",2,1
170,chr21_14433765,4,4,"[16, 16]",{4: 42},"[16, 16]",{16: 43},"[16, 17, 17]",2,1
241,chr21_14554699,1,18,"[19, 19]","{18: 7, 19: 28}","[18, 19]","{1: 1, 13: 1, 18: 13, 19: 37}","[18, 19, 19]",2,1
341,chr21_14760579,4,3,"[12, 12]",{3: 48},"[12, 12]",{12: 50},"[11, 11, 12]",2,1
405,chr21_14903033,3,4,"[12, 12]",{4: 44},"[12, 12]",{12: 44},"[4, 4, 12]",2,1


In [101]:
# chr21_14213292
# NOTE(review): the locus ID above was left by the author; its relevance is not
# evident from this cell.
#
# Loci with three distinct simulated alleles where GangSTR reported a single
# distinct allele.
# Rows without GangSTR frequencies are removed first, matching the GangSTR
# summary cell and the equivalent HipSTR cells, so that loci lacking a call
# cannot be counted here (or break the len(set(...)) step).
(
    df_comparison
    .dropna(subset="frequencies_gangstr")
    .query("n_distinct_alleles == 3")
    .assign(distinct_gangstr=lambda x: [len(set(gt)) for gt in x["genotype_gangstr"]])
    .query("distinct_gangstr == 1")
)

Unnamed: 0,str_id,period,ref,genotype_gangstr,frequencies_gangstr,genotype_hipstr,frequencies_hipstr,genotype_sim,n_distinct_alleles,distinct_gangstr
937,chr21_15994635,4,5,"[20, 20]",{5: 45},"[20, 20]",{20: 47},"[20, 33, 37]",3,1
2768,chr21_19488881,2,22,"[44, 44]",{22: 32},"[44, 44]",{44: 35},"[35, 37, 44]",3,1
3540,chr21_20982238,2,11,"[22, 22]",{11: 32},"[22, 22]",{22: 45},"[13, 22, 25]",3,1
5994,chr21_25572109,2,11,"[22, 22]",{11: 36},"[22, 22]",{22: 50},"[22, 23, 25]",3,1
7300,chr21_28173037,2,12,"[24, 24]","{12: 27, 13: 7}","[24, 24]","{24: 42, 26: 10}","[24, 26, 27]",3,1
11671,chr21_35446085,2,15,"[30, 30]",{15: 26},"[30, 30]",{30: 40},"[27, 29, 30]",3,1
12415,chr21_36462935,4,2,"[8, 8]",{2: 47},"[8, 8]",{8: 59},"[6, 7, 8]",3,1
13182,chr21_37597069,4,4,"[16, 16]",{4: 47},"[16, 16]",{16: 49},"[16, 19, 21]",3,1
14097,chr21_39337023,5,3,"[15, 15]",{3: 39},"[15, 15]",{15: 39},"[15, 16, 22]",3,1
14530,chr21_40106894,5,2,"[10, 10]",{2: 44},"[10, 10]",{10: 50},"[9, 10, 12]",3,1


In [95]:
# Count, per number of distinct simulated alleles, how many of those alleles
# are absent from the HipSTR genotype (distinct simulated minus distinct called).
# Rows without HipSTR frequencies are left out.
(
    df_comparison
    .dropna(subset="frequencies_hipstr")
    .assign(
        distinct_hipstr=lambda loci: loci["genotype_hipstr"].map(lambda gt: len(set(gt))),
        missing_hipstr=lambda loci: loci["n_distinct_alleles"] - loci["distinct_hipstr"],
    )
    .loc[:, ["n_distinct_alleles", "missing_hipstr"]]
    .groupby("n_distinct_alleles", as_index=False)
    .value_counts()
)

Unnamed: 0,n_distinct_alleles,missing_hipstr,count
0,1,0,13113
1,2,0,3575
2,2,1,248
3,3,1,783
4,3,2,11


In [100]:
# For each number of distinct simulated alleles, the share of loci at which
# HipSTR reported a given number of distinct alleles. Rows without HipSTR
# frequencies are left out.
# (The former `missing_hipstr` column was computed and then dropped by the
# column selection right after it, so it is no longer built.)
(
    df_comparison
    .dropna(subset="frequencies_hipstr")
    .assign(distinct_hipstr=lambda x: [len(set(gt)) for gt in x["genotype_hipstr"]])
    .filter(["n_distinct_alleles", "distinct_hipstr"])
    .groupby("n_distinct_alleles", as_index=False)
    .value_counts(normalize=True)
)

Unnamed: 0,n_distinct_alleles,distinct_hipstr,proportion
0,1,1,1.0
1,2,2,0.935129
2,2,1,0.064871
3,3,2,0.986146
4,3,1,0.013854


In [83]:
# Loci with two distinct simulated alleles where HipSTR reported a single
# distinct allele (first 10 shown); rows without HipSTR frequencies are excluded.
(
    df_comparison
    .dropna(subset="frequencies_hipstr")
    .loc[lambda loci: loci["n_distinct_alleles"] == 2]
    .assign(distinct_hipstr=lambda loci: loci["genotype_hipstr"].map(lambda gt: len(set(gt))))
    .loc[lambda loci: loci["distinct_hipstr"] == 1]
    .head(10)
)

Unnamed: 0,str_id,period,ref,genotype_gangstr,frequencies_gangstr,genotype_hipstr,frequencies_hipstr,genotype_sim,n_distinct_alleles,distinct_hipstr
75,chr21_14235352,2,8,"[16, 16]",{8: 56},"[16, 16]",{16: 57},"[16, 16, 17]",2,1
78,chr21_14243744,4,2,"[8, 8]",{2: 47},"[8, 8]",{8: 58},"[6, 8, 8]",2,1
88,chr21_14268035,2,12,"[24, 24]",{12: 25},"[24, 24]",{24: 43},"[21, 24, 24]",2,1
107,chr21_14303570,5,8,"[40, 40]",{8: 46},"[40, 40]",{40: 46},"[40, 40, 42]",2,1
170,chr21_14433765,4,4,"[16, 16]",{4: 42},"[16, 16]",{16: 43},"[16, 17, 17]",2,1
225,chr21_14534525,1,11,"[11, 12]","{11: 9, 12: 23}","[12, 12]","{-4: 1, 11: 13, 12: 36}","[11, 12, 12]",2,1
334,chr21_14740792,1,11,"[10, 11]","{10: 31, 11: 9}","[10, 10]","{10: 39, 11: 11}","[10, 10, 11]",2,1
341,chr21_14760579,4,3,"[12, 12]",{3: 48},"[12, 12]",{12: 50},"[11, 11, 12]",2,1
405,chr21_14903033,3,4,"[12, 12]",{4: 44},"[12, 12]",{12: 44},"[4, 4, 12]",2,1
409,chr21_14909188,1,24,"[24, 25]","{24: 6, 25: 17}","[25, 25]","{22: 1, 24: 7, 25: 24}","[24, 25, 25]",2,1


In [84]:
# Loci with three distinct simulated alleles where HipSTR reported a single
# distinct allele; rows without HipSTR frequencies are excluded.
(
    df_comparison
    .dropna(subset="frequencies_hipstr")
    .loc[lambda loci: loci["n_distinct_alleles"] == 3]
    .assign(distinct_hipstr=lambda loci: loci["genotype_hipstr"].map(lambda gt: len(set(gt))))
    .loc[lambda loci: loci["distinct_hipstr"] == 1]
)

Unnamed: 0,str_id,period,ref,genotype_gangstr,frequencies_gangstr,genotype_hipstr,frequencies_hipstr,genotype_sim,n_distinct_alleles,distinct_hipstr
937,chr21_15994635,4,5,"[20, 20]",{5: 45},"[20, 20]",{20: 47},"[20, 33, 37]",3,1
2768,chr21_19488881,2,22,"[44, 44]",{22: 32},"[44, 44]",{44: 35},"[35, 37, 44]",3,1
3540,chr21_20982238,2,11,"[22, 22]",{11: 32},"[22, 22]",{22: 45},"[13, 22, 25]",3,1
5994,chr21_25572109,2,11,"[22, 22]",{11: 36},"[22, 22]",{22: 50},"[22, 23, 25]",3,1
7300,chr21_28173037,2,12,"[24, 24]","{12: 27, 13: 7}","[24, 24]","{24: 42, 26: 10}","[24, 26, 27]",3,1
11671,chr21_35446085,2,15,"[30, 30]",{15: 26},"[30, 30]",{30: 40},"[27, 29, 30]",3,1
12415,chr21_36462935,4,2,"[8, 8]",{2: 47},"[8, 8]",{8: 59},"[6, 7, 8]",3,1
13182,chr21_37597069,4,4,"[16, 16]",{4: 47},"[16, 16]",{16: 49},"[16, 19, 21]",3,1
14097,chr21_39337023,5,3,"[15, 15]",{3: 39},"[15, 15]",{15: 39},"[15, 16, 22]",3,1
14530,chr21_40106894,5,2,"[10, 10]",{2: 44},"[10, 10]",{10: 50},"[9, 10, 12]",3,1
