# Generating a new reference sequence that incorporates specific STR lengths

We want to make a new reference sequence that is based on an existing reference, but has different STR allele lengths based on a list of STR loci provided.

In [1]:
import os
import sys
# SRC_DIR environment variable should be the absolute path to the 'ConSTRain-analyses' directory.
# Appending it to sys.path makes modules in that directory importable from this notebook.
# If SRC_DIR is not set, the lookup below raises KeyError.
sys.path.append(os.environ["SRC_DIR"])

In [2]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import pandas as pd
import pysam

Load a panel with STR locus annotations for GRCh38

In [3]:
# Column layout of the headerless, tab-separated STR panel file.
names = ["chr", "start", "end", "period", "unit"]

# Read the panel, keep only the chr21 loci, and label every locus with a
# "<chr>_<start + 1>" identifier.
df_repeats_grch38 = (
    pd.read_csv("../../data/hg38_ver13_0boe_mononucleotides.bed", sep="\t", names=names)
    .loc[lambda panel: panel["chr"] == "chr21"]
    .assign(str_id=lambda panel: panel["chr"] + "_" + (panel["start"] + 1).astype(str))
)

df_repeats_grch38

Unnamed: 0,chr,start,end,period,unit,str_id
926634,chr21,5011211,5011223,4,AATC,chr21_5011212
926635,chr21,5013076,5013088,2,CA,chr21_5013077
926636,chr21,5014891,5014903,2,CA,chr21_5014892
926637,chr21,5016247,5016265,3,GAG,chr21_5016248
926638,chr21,5016788,5016798,1,G,chr21_5016789
...,...,...,...,...,...,...
948133,chr21,46688831,46688863,2,AC,chr21_46688832
948134,chr21,46689711,46689723,4,ATTT,chr21_46689712
948135,chr21,46696281,46696292,1,T,chr21_46696282
948136,chr21,46697534,46697554,1,A,chr21_46697535


STR allele lengths for HG002 cell line that were called based on mappings of the maternal and paternal assemblies to GRCh38. 

In [4]:
# Read the HG002 STR length calls, restrict them to chr21, and attach the
# GRCh38 panel annotations (coordinates, unit, period) of each locus.
# The inner join keeps only loci that are present in both tables.
df_repeats_hg002 = (
    pd.read_csv("../../data/HG002/variants/HG002_GRCh38_STR_lengths.csv")
    .loc[lambda calls: calls["str_id"].str.startswith("chr21")]
    .merge(
        df_repeats_grch38[["str_id", "start", "end", "unit", "period"]],
        on="str_id",
        how="inner",
    )
)
df_repeats_hg002

Unnamed: 0,str_id,region_len_ref,region_len_mat,region_len_pat,start,end,unit,period
0,chr21_5354757,12,12.0,12.0,5354756,5354768,ATA,3
1,chr21_7205148,18,18.0,18.0,7205147,7205165,ATA,3
2,chr21_7227685,20,20.0,20.0,7227684,7227704,TTTATTTTTA,10
3,chr21_8987815,16,16.0,16.0,8987814,8987830,CG,2
4,chr21_8987982,10,10.0,10.0,8987981,8987991,CGCCC,5
...,...,...,...,...,...,...,...,...
19452,chr21_46688832,32,24.0,32.0,46688831,46688863,AC,2
19453,chr21_46689712,12,12.0,12.0,46689711,46689723,ATTT,4
19454,chr21_46696282,11,11.0,11.0,46696281,46696292,T,1
19455,chr21_46697535,20,20.0,20.0,46697534,46697554,A,1


Generate maternal and paternal haplotype-specific data frames. For loci that already have the same length as in GRCh38, we don't need to change anything. So, select only loci whose length differs from GRCh38, and of those keep only the loci whose haplotype length is a whole multiple of the STR unit length (so that the allele can be written as an integer number of repeat units).

In [5]:
def _select_nonref_alleles(df_lengths, len_col):
    """Select loci whose haplotype allele differs from the reference.

    Parameters
    ----------
    df_lengths : pd.DataFrame
        Must contain the columns "region_len_ref", "period" and `len_col`.
    len_col : str
        Name of the column holding the haplotype's region length.

    Returns
    -------
    pd.DataFrame
        The rows of `df_lengths` where `len_col` differs from "region_len_ref"
        and is a whole multiple of "period", with an extra integer column
        "allele_len" (= `len_col` // "period", i.e. the number of repeat units).
    """
    region_len = df_lengths[len_col]
    # Rows with a missing haplotype length drop out here: NaN % period is NaN,
    # which never compares equal to 0.
    keep = (df_lengths["region_len_ref"] != region_len) & (region_len % df_lengths["period"] == 0)
    # The lengths are stored as floats; after the filter above they are all
    # finite, so a plain integer cast is safe. (The previous
    # `.convert_dtypes(int)` passed `int` as the boolean `infer_objects` flag
    # and only produced integers as a side effect of the method's defaults.)
    return df_lengths[keep].assign(allele_len=lambda x: x[len_col].astype(int) // x["period"])


df_mat_nonref = _select_nonref_alleles(df_repeats_hg002, "region_len_mat")
df_pat_nonref = _select_nonref_alleles(df_repeats_hg002, "region_len_pat")
df_mat_nonref.shape, df_pat_nonref.shape

((3824, 9), (3678, 9))

In [8]:
# Preview the paternal non-reference loci selected above.
# To look only at loci where the two haplotypes disagree, ignoring period-1 loci, use instead:
#   df_pat_nonref.query("region_len_pat != region_len_mat and period != 1").head(50)
df_pat_nonref

Unnamed: 0,str_id,region_len_ref,region_len_mat,region_len_pat,start,end,unit,period,allele_len
16,chr21_10382937,18,19.0,19.0,10382936,10382954,A,1,19
21,chr21_10386619,11,10.0,10.0,10386618,10386629,T,1,10
31,chr21_10399238,21,20.0,20.0,10399237,10399258,T,1,20
36,chr21_10403964,35,40.0,40.0,10403963,10403998,TTTAT,5,8
39,chr21_10486318,24,20.0,20.0,10486317,10486341,AT,2,10
...,...,...,...,...,...,...,...,...,...
19412,chr21_46632747,13,14.0,14.0,46632746,46632759,A,1,14
19417,chr21_46639659,11,11.0,12.0,46639658,46639669,T,1,12
19418,chr21_46640439,21,21.0,20.0,46640438,46640459,T,1,20
19428,chr21_46645267,14,119.0,120.0,46645266,46645280,A,1,120


## Generating new chromosome sequences

Load chr21 index through pysam. 

In [194]:
# Open the GRCh38 chr21 FASTA through pysam so that sub-sequences can be fetched by coordinate.
chr21 = pysam.FastaFile("../../data/simulated_reads/GRCh38_chr21.fa.gz")
# Sanity check: show the sequence names in the file and their lengths.
print(chr21.references, chr21.lengths)

['chr21'] [46709983]


### Paternal haplotype
We do the paternal haplotype first. Based on the STR start and end positions, we determine the regions constituting the 'gaps' between STR loci, and the coordinates of the STR loci themselves. To generate a new chr21 representation, we extract the gaps directly from GRCh38's chr21 sequence. The STR loci themselves (all of which differ from GRCh38 at this point) are generated on the fly and inserted between the gaps.

In [200]:
# One (repeat unit, number of units) pair per paternal non-reference locus.
strs_pat = [*zip(df_pat_nonref["unit"], df_pat_nonref["allele_len"])]

# The gap in front of each locus runs from the end of the previous locus
# (0 for the first locus) up to the start of the locus itself.
gaps_pat = [*zip(df_pat_nonref["end"].shift(periods=1, fill_value=0), df_pat_nonref["start"])]

# Everything after the last locus; an end of None means "to the end of the chromosome".
tail_pat = (df_pat_nonref["end"].iloc[-1], None)

print(len(gaps_pat), len(strs_pat))

3678 3678


In [197]:
# Assemble the paternal chr21: alternate the unchanged GRCh38 sequence in front
# of each STR locus with a freshly generated STR allele, then add the tail.
# The pieces are collected in a list and joined once, instead of growing a
# chromosome-sized string with repeated `+=`.
pat_pieces = []
for (gap_start, gap_end), (unit, n_units) in zip(gaps_pat, strs_pat):
    pat_pieces.append(chr21.fetch("chr21", gap_start, gap_end))
    # The second element of each strs_pat entry is the allele length in repeat
    # units (the "allele_len" column), not the period of the unit.
    pat_pieces.append(unit * n_units)
pat_pieces.append(chr21.fetch("chr21", tail_pat[0], tail_pat[1]))
pat_seq = "".join(pat_pieces)

SeqIO.write(
    SeqRecord(Seq(pat_seq), id="HG002_chr21_paternal", description="Sequence based on GRCh38 chr21, with only STR sequences adapted to represent the paternal haplotype of the HG002 cell line"), 
    "../../data/simulated_reads/HG002_chr21_paternal.fa", 
    format="fasta"
)

# Release the chromosome-sized objects now that the FASTA file is written.
# (The previous `new_seq = None` referred to a name this cell never defines,
# so nothing was actually freed.)
del pat_pieces
pat_seq = None

### Maternal haplotype
Then, do exactly the same for the maternal haplotype.

In [198]:
# One (repeat unit, number of units) pair per maternal non-reference locus.
strs_mat = [*zip(df_mat_nonref["unit"], df_mat_nonref["allele_len"])]

# The gap in front of each locus runs from the end of the previous locus
# (0 for the first locus) up to the start of the locus itself.
gaps_mat = [*zip(df_mat_nonref["end"].shift(periods=1, fill_value=0), df_mat_nonref["start"])]

# Everything after the last locus; an end of None means "to the end of the chromosome".
tail_mat = (df_mat_nonref["end"].iloc[-1], None)

print(len(gaps_mat), len(strs_mat))

3824 3824


In [199]:
# Assemble the maternal chr21: alternate the unchanged GRCh38 sequence in front
# of each STR locus with a freshly generated STR allele, then add the tail.
# The pieces are collected in a list and joined once, instead of growing a
# chromosome-sized string with repeated `+=`.
mat_pieces = []
for (gap_start, gap_end), (unit, n_units) in zip(gaps_mat, strs_mat):
    mat_pieces.append(chr21.fetch("chr21", gap_start, gap_end))
    # The second element of each strs_mat entry is the allele length in repeat
    # units (the "allele_len" column), not the period of the unit.
    mat_pieces.append(unit * n_units)
mat_pieces.append(chr21.fetch("chr21", tail_mat[0], tail_mat[1]))
mat_seq = "".join(mat_pieces)

SeqIO.write(
    SeqRecord(Seq(mat_seq), id="HG002_chr21_maternal", description="Sequence based on GRCh38 chr21, with only STR sequences adapted to represent the maternal haplotype of the HG002 cell line"), 
    "../../data/simulated_reads/HG002_chr21_maternal.fa", 
    format="fasta"
)

# Release the chromosome-sized objects now that the FASTA file is written.
# (The previous `new_seq = None` referred to a name this cell never defines,
# so nothing was actually freed.)
del mat_pieces
mat_seq = None