In [1]:
import os
import sys
# The SRC_DIR environment variable should hold the absolute path to the
# 'multicopy-STR-genotyping' directory. Putting it on sys.path lets later cells
# import the multicopy_STR_genotyping package. A missing variable raises KeyError.
src_dir = os.environ["SRC_DIR"]
sys.path.append(src_dir)

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from multicopy_STR_genotyping import file_io

# Use seaborn's "poster" plotting context for every figure drawn in this notebook.
sns.set_context("poster")
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

In [3]:
# Variant calls for the maternal and paternal HG002 haplotype assemblies.
# Original note on the file columns (presumably the raw, pre-loading layout):
# chr, start, end, query depth, mapping quality, REF allele, ALT allele, query name, query start, end and the query orientation
# The frames returned by load_haplotype_vars have 5 columns (chr, start, end, REF, ALT).
haplotype_maternal = "../../data/HG002/alignments/HG002.maternal.f1_assembly_v2_genbank_sort_var.txt"
haplotype_paternal = "../../data/HG002/alignments/HG002.paternal.f1_assembly_v2_genbank_sort_var.txt"

# Load maternal first, then paternal, printing each frame's (rows, columns) right after it is read.
haplotype_frames = {}
for parent, var_path in (("mat", haplotype_maternal), ("pat", haplotype_paternal)):
    haplotype_frames[parent] = file_io.load_haplotype_vars(var_path)
    print(haplotype_frames[parent].shape)

df_haplo_mat = haplotype_frames["mat"]
df_haplo_pat = haplotype_frames["pat"]

# Preview the first paternal variant calls.
df_haplo_pat.head()

(3438809, 5)
(3327344, 5)


Unnamed: 0,chr,start,end,REF,ALT
0,chr1,40639,40639,-,tttt
1,chr1,41052,41053,c,t
2,chr1,41217,41218,t,a
3,chr1,41255,41256,c,t
4,chr1,41387,41388,a,g


In [4]:
# Keep only the calls where one side of the variant is a gap ('-'),
# i.e. either the REF or the ALT allele is empty. The same filter is applied
# to both haplotypes, and each resulting shape is printed.
indel_filter = "REF == '-' or ALT == '-'"

df_haplo_mat_indel = df_haplo_mat.query(indel_filter)
print(df_haplo_mat_indel.shape)

df_haplo_pat_indel = df_haplo_pat.query(indel_filter)
print(df_haplo_pat_indel.shape)

(686236, 5)
(663895, 5)


In [5]:
# Repeat loci, read from a tab-separated BED file; column names are supplied
# here, so the first line of the file is treated as data.
repeats_bed = pd.read_csv(
    "../../data/HG002/regions/hg38_ver13_0boe_mononucleotides_union.bed",
    sep="\t",
    names=["chr", "start", "end", "period", "unit"],
)

# convert coordinates from 0-based (BED) to 1-based (VCF, PAF); 'end' is left as is
start_one_based = repeats_bed["start"] + 1

df_covered_repeats = repeats_bed.assign(
    start=start_one_based,
    # locus identifier of the form "<chr>_<1-based start>"
    str_id=[
        f"{chrom}_{pos}"
        for chrom, pos in zip(repeats_bed["chr"], start_one_based)
    ],
)
df_covered_repeats

Unnamed: 0,chr,start,end,period,unit,str_id
0,chr1,588068,588079,3,CCT,chr1_588068
1,chr1,589245,589256,2,TC,chr1_589245
2,chr1,590659,590670,4,AAAT,chr1_590659
3,chr1,590969,590984,4,AAAC,chr1_590969
4,chr1,591734,591751,1,A,chr1_591734
...,...,...,...,...,...,...
1695860,chrY,57188868,57188878,1,A,chrY_57188868
1695861,chrY,57200838,57200851,1,A,chrY_57200838
1695862,chrY,57201000,57201019,1,A,chrY_57201000
1695863,chrY,57202381,57202403,1,A,chrY_57202381


In [6]:
%%time
from multicopy_STR_genotyping import str_utils


# Column-wise accumulator: one list per output column, filled locus by locus
# and converted into the df_strs_haplotypes frame once the scan is done.
length_columns = {
    "str_id": [],
    "region_len_ref": [],
    "region_len_mat": [],
    "region_len_pat": [],
}

for chromosome, chrom_loci in df_covered_repeats.groupby("chr"):
    print("Starting analysis of sequence:", chromosome)

    # Restrict both haplotypes' indel calls to the current chromosome once,
    # so the per-locus lookups only see same-chromosome variants.
    chrom_filter = f"chr == '{chromosome}'"
    df_chr_mat = df_haplo_mat_indel.query(chrom_filter)
    df_chr_pat = df_haplo_pat_indel.query(chrom_filter)

    # Every locus in this group shares `chromosome`, so decide once per group.
    # NOTE(review): presumably the maternal haplotype has no chrY and the
    # paternal one no chrX for this sample -- confirm.
    skip_maternal = chromosome == "chrY"
    skip_paternal = chromosome == "chrX"

    for locus in chrom_loci.to_dict(orient="records"):
        if skip_maternal:
            maternal_len = pd.NA
        else:
            maternal_len = str_utils.str_len_from_haplotype(locus, df_chr_mat)

        if skip_paternal:
            paternal_len = pd.NA
        else:
            paternal_len = str_utils.str_len_from_haplotype(locus, df_chr_pat)

        # 'start' and 'end' are both inclusive here, hence the +1 for the reference length.
        reference_len = locus["end"] - locus["start"] + 1

        length_columns["str_id"].append(locus["str_id"])
        length_columns["region_len_ref"].append(reference_len)
        length_columns["region_len_mat"].append(maternal_len)
        length_columns["region_len_pat"].append(paternal_len)

df_strs_haplotypes = pd.DataFrame(length_columns)
df_strs_haplotypes

Starting analysis of sequence: chr1
Starting analysis of sequence: chr10
Starting analysis of sequence: chr11
Starting analysis of sequence: chr12
Starting analysis of sequence: chr13
Starting analysis of sequence: chr14
Starting analysis of sequence: chr15
Starting analysis of sequence: chr16
Starting analysis of sequence: chr17
Starting analysis of sequence: chr18
Starting analysis of sequence: chr19
Starting analysis of sequence: chr2
Starting analysis of sequence: chr20
Starting analysis of sequence: chr21
Starting analysis of sequence: chr22
Starting analysis of sequence: chr3
Starting analysis of sequence: chr4
Starting analysis of sequence: chr5
Starting analysis of sequence: chr6
Starting analysis of sequence: chr7
Starting analysis of sequence: chr8
Starting analysis of sequence: chr9
Starting analysis of sequence: chrX
Starting analysis of sequence: chrY
CPU times: user 1h 9min 13s, sys: 23 s, total: 1h 9min 36s
Wall time: 1h 9min 42s


Unnamed: 0,str_id,region_len_ref,region_len_mat,region_len_pat
0,chr1_588068,12,12,12
1,chr1_589245,12,12,12
2,chr1_590659,12,12,12
3,chr1_590969,16,16,16
4,chr1_591734,18,20,20
...,...,...,...,...
1695860,chrY_57188868,11,,11
1695861,chrY_57200838,14,,15
1695862,chrY_57201000,20,,20
1695863,chrY_57202381,23,,23


In [8]:
# NOTE(review): the export below is disabled, so the df_strs_haplotypes table
# (about 1h10m of compute in the cell above) is not written to disk by a plain
# run of this notebook. The filename contains "lengthts" -- possibly a typo for
# "lengths"; check what downstream code reads before renaming.
# df_strs_haplotypes.to_csv("../../data/HG002/variants/HG002_GRCh38_STR_lengthts.csv", index=False)