In [1]:
import os
import sys
# SRC_DIR environment variable should be the absolute path to the 'ConSTRain-analyses' directory.
# It is appended to the module search path (presumably so the project package imported
# further down can be found -- confirm). A KeyError on this line means SRC_DIR is not set.
sys.path.append(os.environ["SRC_DIR"])

In [2]:
from Bio import SeqIO
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from multicopy_STR_genotyping import file_io

# Select seaborn's "poster" plotting context for figures drawn in this notebook
sns.set_context("poster")
%matplotlib inline

## Load and filter mreps repeats

Parsing mreps output files and filtering STRs based on allele length.
The thresholds for filtering per period are defined in the 'thresholds' dictionary. 
Keys are STR periods and values are the minimum number of times the repeat unit must be repeated in order to keep the locus.

In [4]:
%%time
# The eleven chromosomes to process: chr01 ... chr11
chromosomes = [f"chr{number:02d}" for number in range(1, 12)]

# STR period -> minimum number of repeat-unit copies required to keep a locus
thresholds = {
    1: 10,  # homopolymers
    2: 6,   # dinucleotides
    3: 4,   # trinucleotides
    4: 3,   # tetranucleotides
    5: 3,   # pentanucleotides
    6: 3,   # hexanucleotides
}

# Parse the mreps output of every chromosome into its own DataFrame
per_chrom_frames = []
for chrom in chromosomes:
    mreps_file = f"../../data/banana/repeats/{chrom}_mreps.out"
    print(f"Parsing {mreps_file} ...")
    per_chrom_frames.append(file_io.load_mreps_repeats(mreps_file, chrom, thresholds))

# Stack the per-chromosome DataFrames, then subtract 1 from the start position to
# convert to 0-based, half-open (BED-like) coordinates
df_mreps = pd.concat(per_chrom_frames)
df_mreps["start"] = df_mreps["start"] - 1
df_mreps = df_mreps.reset_index(drop=True)
df_mreps

Parsing ../../data/banana/repeats/chr01_mreps.out ...
Parsing ../../data/banana/repeats/chr02_mreps.out ...
Parsing ../../data/banana/repeats/chr03_mreps.out ...
Parsing ../../data/banana/repeats/chr04_mreps.out ...
Parsing ../../data/banana/repeats/chr05_mreps.out ...
Parsing ../../data/banana/repeats/chr06_mreps.out ...
Parsing ../../data/banana/repeats/chr07_mreps.out ...
Parsing ../../data/banana/repeats/chr08_mreps.out ...
Parsing ../../data/banana/repeats/chr09_mreps.out ...
Parsing ../../data/banana/repeats/chr10_mreps.out ...
Parsing ../../data/banana/repeats/chr11_mreps.out ...
CPU times: user 29.6 s, sys: 1.09 s, total: 30.7 s
Wall time: 31.1 s


Unnamed: 0,str_id,chr,start,end,unit,period,ref
0,chr01_22744,chr01,22743,22755,CCCT,4,3
1,chr01_33907,chr01,33906,33924,CCTTTG,6,3
2,chr01_35279,chr01,35278,35290,TA,2,6
3,chr01_35300,chr01,35299,35309,T,1,10
4,chr01_35392,chr01,35391,35417,AT,2,13
...,...,...,...,...,...,...,...
269501,chr11_34628454,chr11,34628453,34628468,CTCTT,5,3
269502,chr11_34629565,chr11,34629564,34629582,CATTTC,6,3
269503,chr11_34630651,chr11,34630650,34630665,CTCTT,5,3
269504,chr11_34631765,chr11,34631764,34631782,CATTTC,6,3


## Check if mreps repeats match reference

mreps randomly replaces 'N' characters in the input sequence with A, C, G, or T. This has a chance to introduce artificial repeats that do not exist in the reference genome. 

To address this, we check for every repeat reported by mreps whether the repeat sequence fully matches the specified region in the reference genome.

In [5]:
%%time
# Compare every repeat reported by mreps with the reference sequence at the same
# coordinates. Produces:
#   matches_ref   -- boolean array with one entry per row of df_mreps (same row order);
#                    True when unit * ref equals the reference sequence
#   df_mismatches -- DataFrame listing the repeats that do not match
parser = SeqIO.parse("../../data/banana/reference/Musa_acuminata_pahang_v4.fasta", "fasta")

# Flags are written by row position rather than appended, so the mask stays aligned
# with df_mreps even when the FASTA lists chromosomes in a different order than df_mreps.
matches_ref = np.zeros(len(df_mreps), dtype=bool)
df_mismatches = {
    "str_id": [],
    "mreps_seq": [],
    "ref_seq": [],
}
repeat_chroms = df_mreps["chr"].to_numpy()
checked_chroms = set()

for record in parser:
    if record.id not in chromosomes:
        continue
    print(f"Analysing repeats on chromosome {record.id} ...")
    checked_chroms.add(record.id)

    # Row positions (within df_mreps) of the repeats located on this chromosome
    positions = np.flatnonzero(repeat_chroms == record.id)
    df_chrom = df_mreps.iloc[positions]
    for position, repeat in zip(positions, df_chrom.to_dict(orient="records")):
        mreps_seq = repeat["unit"] * repeat["ref"]
        ref_seq = str(record.seq[repeat["start"]:repeat["end"]]).upper() # BioPython slicing is BED-like
        match = mreps_seq == ref_seq
        matches_ref[position] = match
        if not match:
            df_mismatches["str_id"].append(repeat["str_id"])
            df_mismatches["mreps_seq"].append(mreps_seq)
            df_mismatches["ref_seq"].append(ref_seq)

# Fail loudly if some repeats could not be checked because their chromosome
# never appeared in the reference FASTA.
unchecked_chroms = sorted(set(repeat_chroms) - checked_chroms)
if unchecked_chroms:
    raise ValueError(
        f"No reference sequence found for chromosome(s): {', '.join(unchecked_chroms)}"
    )

df_mismatches = pd.DataFrame(df_mismatches)

Analysing repeats on chromosome chr01
Analysing repeats on chromosome chr02
Analysing repeats on chromosome chr03
Analysing repeats on chromosome chr04
Analysing repeats on chromosome chr05
Analysing repeats on chromosome chr06
Analysing repeats on chromosome chr07
Analysing repeats on chromosome chr08
Analysing repeats on chromosome chr09
Analysing repeats on chromosome chr10
Analysing repeats on chromosome chr11
CPU times: user 3.61 s, sys: 911 ms, total: 4.53 s
Wall time: 4.74 s


In [6]:
# Summarise how many of the reported repeats disagree with the reference sequence
n_artifacts = len(df_mismatches)
n_repeats = len(df_mreps)
print(f"{n_artifacts} / {n_repeats} repeats were artifacts generated by mreps")
df_mismatches

8 / 269506 repeats were artifacts generated by mreps


Unnamed: 0,str_id,mreps_seq,ref_seq
0,chr01_29246016,TATGTATGTATG,NNNNNNNNNNNN
1,chr01_29263129,CGGGCGGGCGGG,NNNNNNNNNNNN
2,chr05_29674490,TATGTATGTATG,NNNNNNNNNNNN
3,chr05_29691603,CGGGCGGGCGGG,NNNNNNNNNNNN
4,chr08_38286544,TATGTATGTATG,NNNNNNNNNNNN
5,chr08_38303657,CGGGCGGGCGGG,NNNNNNNNNNNN
6,chr08_38479785,AGGAAGGAAGGA,NNNNNNNNNNNN
7,chr08_38561761,GGGGGGGGGG,NNNNNNNNNN


In [7]:
# Keep only the repeats whose sequence matches the reference.
# NOTE: this overwrites df_mreps while matches_ref keeps its pre-filter length, so
# re-running this cell alone fails -- re-run from the reference check instead.
df_mreps = df_mreps[matches_ref].reset_index(drop=True)
df_mreps

Unnamed: 0,str_id,chr,start,end,unit,period,ref
0,chr01_22744,chr01,22743,22755,CCCT,4,3
1,chr01_33907,chr01,33906,33924,CCTTTG,6,3
2,chr01_35279,chr01,35278,35290,TA,2,6
3,chr01_35300,chr01,35299,35309,T,1,10
4,chr01_35392,chr01,35391,35417,AT,2,13
...,...,...,...,...,...,...,...
269493,chr11_34628454,chr11,34628453,34628468,CTCTT,5,3
269494,chr11_34629565,chr11,34629564,34629582,CATTTC,6,3
269495,chr11_34630651,chr11,34630650,34630665,CTCTT,5,3
269496,chr11_34631765,chr11,34631764,34631782,CATTTC,6,3


In [10]:
# If there is another repeat within 'flanksize' before or after a repeat, it counts as a neighbour
flanksize = 50

df_neighbour = []
for chrom, data in df_mreps.groupby("chr"):
    # Distances to adjacent repeats are only meaningful on position-sorted data,
    # so sort explicitly instead of relying on the order of the input.
    data = data.sort_values(["start", "end"])

    # Distance to the closest preceding repeat. The running maximum of the earlier
    # 'end' values is used so that a long repeat that starts earlier but ends later
    # than the directly preceding row is still detected. shift() leaves NaN for the
    # first row, and NaN < flanksize evaluates to False (no neighbour on that side).
    dist_to_prev = data["start"] - data["end"].cummax().shift(1)

    # Distance to the next repeat; rows are sorted by start, so the next row has the
    # smallest following start. The last row gets NaN.
    dist_to_next = data["start"].shift(-1) - data["end"]

    # Overlapping repeats yield negative distances and are therefore flagged as well
    data = data.assign(
        has_neighbour=(dist_to_prev < flanksize) | (dist_to_next < flanksize)
    )
    df_neighbour.append(data)
df_neighbour = pd.concat(df_neighbour).reset_index(drop=True)
print(f"{df_neighbour['has_neighbour'].sum()} / {df_neighbour.shape[0]} repeats have a neighbour within {flanksize}bp")
df_neighbour

86153 / 269498 repeats have a neighbour within 50bp


Unnamed: 0,str_id,chr,start,end,unit,period,ref,has_neighbour
0,chr01_22744,chr01,22743,22755,CCCT,4,3,False
1,chr01_33907,chr01,33906,33924,CCTTTG,6,3,False
2,chr01_35279,chr01,35278,35290,TA,2,6,True
3,chr01_35300,chr01,35299,35309,T,1,10,True
4,chr01_35392,chr01,35391,35417,AT,2,13,False
...,...,...,...,...,...,...,...,...
269493,chr11_34628454,chr11,34628453,34628468,CTCTT,5,3,False
269494,chr11_34629565,chr11,34629564,34629582,CATTTC,6,3,False
269495,chr11_34630651,chr11,34630650,34630665,CTCTT,5,3,False
269496,chr11_34631765,chr11,34631764,34631782,CATTTC,6,3,False


In [16]:
# Keep only repeats without a close neighbour and reduce to the columns of interest
bed_columns = ["chr", "start", "end", "period", "unit"]
is_isolated = ~df_neighbour["has_neighbour"]
df_reformat = (
    df_neighbour[is_isolated]
    .filter(items=bed_columns)
    .reset_index(drop=True)
)
df_reformat

Unnamed: 0,chr,start,end,period,unit
0,chr01,22743,22755,4,CCCT
1,chr01,33906,33924,6,CCTTTG
2,chr01,35391,35417,2,AT
3,chr01,35828,35842,2,AT
4,chr01,37880,37930,2,AT
...,...,...,...,...,...
183340,chr11,34628453,34628468,5,CTCTT
183341,chr11,34629564,34629582,6,CATTTC
183342,chr11,34630650,34630665,5,CTCTT
183343,chr11,34631764,34631782,6,CATTTC


In [20]:
# df_reformat.to_csv("../../data/banana/repeats/dh_pahang_v4_strs.bed", header=False, index=False, sep="\t")