In [1]:
import os
import sys
# Make the project package importable. The SRC_DIR environment variable must
# hold the absolute path to the 'multicopy-STR-genotyping' directory; a missing
# variable raises KeyError here.
src_dir = os.environ["SRC_DIR"]
sys.path.append(src_dir)

In [2]:
import numpy as np
import pandas as pd

from multicopy_STR_genotyping import file_io

In [85]:
# Per-haplotype variant tables for HG002. The raw files list, per variant:
# chr, start, end, query depth, mapping quality, REF allele, ALT allele,
# query name, query start, query end and the query orientation.
haplotype_maternal = "../../data/HG002/alignments/HG002.maternal.f1_assembly_v2_genbank_sort_var.txt"
haplotype_paternal = "../../data/HG002/alignments/HG002.paternal.f1_assembly_v2_genbank_sort_var.txt"

# Load maternal first, then paternal, through the same project loader.
df_haplo_mat, df_haplo_pat = map(
    file_io.load_haplotype_vars,
    (haplotype_maternal, haplotype_paternal),
)

# Report the size of each table (maternal, then paternal).
for haplo_df in (df_haplo_mat, df_haplo_pat):
    print(haplo_df.shape)

# Preview the paternal table as the cell output.
df_haplo_pat.head()

(3438809, 5)
(3327344, 5)


Unnamed: 0,chr,start,end,REF,ALT
0,chr1,40639,40639,-,tttt
1,chr1,41052,41053,c,t
2,chr1,41217,41218,t,a
3,chr1,41255,41256,c,t
4,chr1,41387,41388,a,g


In [5]:
def _keep_indels(variants):
    """Return the rows of `variants` whose REF or ALT column equals '-'."""
    return variants.query("REF == '-' or ALT == '-'")


# Apply the same filter to both haplotypes, then report the resulting sizes.
df_haplo_mat_indel = _keep_indels(df_haplo_mat)
print(df_haplo_mat_indel.shape)

df_haplo_pat_indel = _keep_indels(df_haplo_pat)
print(df_haplo_pat_indel.shape)

(686236, 5)
(663895, 5)


In [50]:
# Repeat definitions and HG002 genotype calls parsed from the VCF.
df_repeats, df_hg002 = file_io.dfs_from_vcf(
    file="../../data/HG002/variants/HG002.GRCh38.2x250.vcf",
    samples=["HG002.GRCh38.2x250"],
)

# Regions listed in the union BED file, keyed the same way as `str_id`.
# The id uses start + 1 -- presumably converting the BED start to the
# coordinate convention used by `str_id`; confirm against the VCF loader.
bed_regions = pd.read_csv(
    "../../data/HG002/regions/hg38_ver13_union.bed",
    sep="\t",
    names=["chr", "start", "end", "period", "unit"],
)
bed_str_ids = np.array(
    [f"{chrom}_{start + 1}" for chrom, start in zip(bed_regions.chr, bed_regions.start)]
)
bed_regions = bed_regions.assign(str_id=bed_str_ids)

# Keep only the repeats whose id also appears in the BED file.
is_covered = df_repeats["str_id"].isin(bed_regions["str_id"])
df_covered_repeats = df_repeats[is_covered]

# Attach the locus annotation to every genotype call; the inner join drops
# calls at loci that are not in the covered set.
locus_columns = ["str_id", "chr", "start", "end", "period", "unit", "ref"]
df_hg002 = df_hg002.merge(
    df_covered_repeats[locus_columns],
    on="str_id",
    how="inner",
).reset_index(drop=True)

df_hg002

Unnamed: 0,sample,str_id,copy_number,frequencies,genotype,chr,start,end,period,unit,ref
0,HG002.GRCh38.2x250,chr1_588068,2,{4: 22},"[4, 4]",chr1,588068,588079,3,CCT,4
1,HG002.GRCh38.2x250,chr1_589245,2,{6: 9},"[6, 6]",chr1,589245,589256,2,TC,6
2,HG002.GRCh38.2x250,chr1_590659,2,{3: 60},"[3, 3]",chr1,590659,590670,4,AAAT,3
3,HG002.GRCh38.2x250,chr1_590969,2,{4: 42},"[4, 4]",chr1,590969,590984,4,AAAC,4
4,HG002.GRCh38.2x250,chr1_594083,2,{4: 21},"[4, 4]",chr1,594083,594094,3,TCC,4
...,...,...,...,...,...,...,...,...,...,...,...
813889,HG002.GRCh38.2x250,chrY_26625608,1,{2: 43},[2],chrY,26625608,26625621,7,TAAGTAT,2
813890,HG002.GRCh38.2x250,chrY_26627699,1,{4: 23},[4],chrY,26627699,26627710,3,GAA,4
813891,HG002.GRCh38.2x250,chrY_26628607,1,{3: 31},[3],chrY,26628607,26628621,5,ATTTT,3
813892,HG002.GRCh38.2x250,chrY_26632688,1,{6: 9},[6],chrY,26632688,26632705,3,TTC,6


In [51]:
def _has_non_ref_allele(ref, gt):
    """Return True when at least one allele in `gt` differs from `ref`.

    A genotype that cannot be iterated (iteration raises TypeError) is
    treated as having no non-reference allele and yields False.
    """
    try:
        return any([allele != ref for allele in gt])
    except TypeError:
        return False


# Boolean array, one entry per row of df_hg002, flagging calls where the
# genotype contains an allele different from the reference copy number.
mask = np.array(
    [
        _has_non_ref_allele(ref, gt)
        for ref, gt in zip(df_hg002["ref"], df_hg002["genotype"])
    ]
)

In [109]:
# Preview the first 25 calls flagged by `mask`.
df_hg002.loc[mask].head(25)

Unnamed: 0,sample,str_id,copy_number,frequencies,genotype,chr,start,end,period,unit,ref
16,HG002.GRCh38.2x250,chr1_650384,2,"{18: 20, 20: 13}","[18, 20]",chr1,650384,650423,2,AC,20
36,HG002.GRCh38.2x250,chr1_744867,2,"{7: 6, 8: 3}","[7, 8]",chr1,744867,744882,2,TA,8
65,HG002.GRCh38.2x250,chr1_832737,2,{10: 38},"[10, 10]",chr1,832737,832781,5,GTTTT,9
77,HG002.GRCh38.2x250,chr1_893802,2,"{6: 12, 7: 16}","[6, 7]",chr1,893802,893839,2,AT,19
89,HG002.GRCh38.2x250,chr1_931132,2,"{3: 5, 4: 47}","[4, 4]",chr1,931132,931143,4,CCCT,3
97,HG002.GRCh38.2x250,chr1_950660,2,{5: 71},"[5, 5]",chr1,950660,950671,2,CA,6
119,HG002.GRCh38.2x250,chr1_1005758,2,{1: 61},"[1, 1]",chr1,1005758,1005785,14,GCCCCCGCAGCAGT,2
121,HG002.GRCh38.2x250,chr1_1010482,2,{3: 76},"[3, 3]",chr1,1010482,1010497,4,TTAT,4
199,HG002.GRCh38.2x250,chr1_1293732,2,"{3: 31, 4: 18}","[3, 4]",chr1,1293732,1293791,20,CGCCCCTGCCCTGGAGGCCC,3
291,HG002.GRCh38.2x250,chr1_1557496,2,"{6: 29, 7: 36}","[6, 7]",chr1,1557496,1557509,2,AT,7


In [124]:
# Positional index (into df_hg002) of the locus to inspect by hand.
idx = 412
locus = df_hg002.iloc[idx]

# Window used for the overlap test: a variant is kept when it starts at or
# before the locus end and ends at or after (locus start - 1).
window_lower = locus["start"] - 1
window_upper = locus["end"]

# Paternal-haplotype variants on the same chromosome that fall in the window.
df_haplo_pat.query(
    f"chr == '{locus['chr']}' and start <= {window_upper} and {window_lower} <= end"
)

Unnamed: 0,chr,start,end,REF,ALT
2644,chr1,1904424,1904424,-,aaataaat


In [125]:
# Maternal-haplotype variants overlapping the selected locus. The lower bound
# is locus.start - 1, matching the paternal query, so both haplotypes are
# compared over the same window. Insertions appear with start == end in the
# haplotype tables, so with `locus.start <= end` one sitting at the left edge
# of the repeat would be dropped for the maternal haplotype only.
# NOTE(review): this presumes variant coordinates are 0-based while
# locus.start is 1-based -- confirm against the loaders.
df_haplo_mat.query(f"chr == '{locus.chr}' and start <= {locus.end} and {locus.start - 1} <= end")

Unnamed: 0,chr,start,end,REF,ALT
1942,chr1,1904424,1904424,-,aaataaataaat
