In [None]:
import pyreadr
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from Bio import SeqIO
import sys
import itertools
from tqdm import tqdm
import random
import re
import scipy
import pickle
import os
import time
from sklearn.model_selection import train_test_split
sys.path.insert(0, "../../suffix_array/")
import suffix_array

## Create suffix array

In [9]:
# Translation table mapping each IUPAC nucleotide code to its complement
# (A<->T, C<->G, M<->K, R<->Y, V<->B, H<->D; W, S and N map to themselves).
# It is not referenced in the cells visible here.
comp_trans = str.maketrans("ACGTMRWSYKVHDBN", "TGCAKYWSRMBDHVN")
# RDS file for sample Riv19; the path is relative to the notebook's working directory.
fp = f"../samples/Riv19/diff/Riv19_difference.RDS"
result = pyreadr.read_r(fp)
# NOTE(review): pyreadr is expected to store the single object of an RDS file
# under the key None -- confirm against the pyreadr documentation.
df = result[None]

In [12]:
# Shift the reported positions; per the inline comment the "- 1" converts them to 0-based.
# NOTE(review): the extra "+ 3" offset is not explained in the visible notebook -- confirm its meaning.
# NOTE: this cell is not idempotent; running it a second time shifts the positions again.
df['position'] = df['position'].astype(int) - 1 + 3 # convert to 0-based indexing
# Rows whose `dir` is 'rev' receive one additional position of offset.
df.loc[df.dir == 'rev', 'position'] += 1
# Index the frame by the adjusted positions (the 'position' column itself is kept).
df = df.set_index(df.position)

In [14]:
# read genome sequence
# For every FASTA record: print its id and length, then build a suffix array for it.
# `record`, `seq` and `sa` are ordinary loop variables, so once the loop ends they
# refer to the LAST record only; later cells read `record` and `sa` from here.
# The recorded output of this cell shows a single record.
fp = f"../samples/Riv19/ref_genome/Riv19.fasta"
for record in SeqIO.parse(fp, "fasta"):
    print(record.id)
    seq = record.seq
    print(len(seq))
    sa = suffix_array.get_suffix_array(record.id, seq)

NZ_CP090506.1
2606266


# Generate artificial datasets

In [None]:
def overlaps(a, b):
    """
    Signed overlap, in bp, of two intervals given as (start, end) pairs.

    Positive: the number of bp shared by a and b.
    Zero:     the intervals are book-ended (one ends where the other starts).
    Negative: the size of the gap between them, as a negative number.
    """
    start_a, end_a = a[0], a[1]
    start_b, end_b = b[0], b[1]
    # The shared region runs from the later start to the earlier end.
    shared_start = start_b if start_b > start_a else start_a
    shared_end = end_b if end_b < end_a else end_a
    return shared_end - shared_start

In [None]:
# select TP
# Collect the hits of both motifs. find_motif is called once per motif and its
# result is indexed twice, instead of searching the suffix array twice.
# NOTE(review): element 0 / element 1 of the result are presumably the forward and
# reverse-complement strand hits (going by the variable names) -- confirm in suffix_array.
positions = []
positions_rev_compl = []
for motif in ["TTCGAA","GACNNNNNNGTC"]:
    hits = suffix_array.find_motif(motif, sa, poi=4)
    positions.extend(hits[0])
    positions_rev_compl.extend(hits[1])


def _drop_crowded(hits):
    """
    Return the entries of `hits` whose window [pos-22, pos+23) overlaps the
    window of no other, different position in `hits`.

    The input order is preserved, and repeated identical positions are all kept
    because a position is never compared against itself.
    """
    # After sorting, a window overlaps some other window exactly when it overlaps
    # its nearest neighbour, so checking adjacent pairs replaces the all-pairs scan.
    ordered = sorted(set(hits))
    crowded = set()
    for left, right in zip(ordered, ordered[1:]):
        if overlaps([left-22, left+23], [right-22, right+23]) > 0:
            crowded.add(left)
            crowded.add(right)
    return [pos for pos in hits if pos not in crowded]


positions_filtered = _drop_crowded(positions)
positions_rev_compl_filtered = _drop_crowded(positions_rev_compl)

In [None]:
# Cut the window [pos-22, pos+23) around every retained hit; windows taken from
# positions_rev_compl_filtered are reverse-complemented.
WINDOW_LEN = 22 + 23
seq_TP_all = list()
for pos in positions_filtered:
    seq_TP_all.append(str(record[pos-22:pos+23].seq))
for pos in positions_rev_compl_filtered:
    seq_TP_all.append(str(record[pos-22:pos+23].reverse_complement().seq))
# Deduplicate with dict.fromkeys so the order is the first-seen order rather than
# the hash-dependent order of a set. Windows clipped by the ends of the sequence
# (negative slice start, or end past the last base) come out shorter than
# WINDOW_LEN -- possibly empty -- and are dropped so every TP has the same length.
seq_TP_all = [s for s in dict.fromkeys(seq_TP_all) if len(s) == WINDOW_LEN]

In [None]:
# Number of unique true-positive sequences, counted before the train/test split.
TP_num = len(seq_TP_all)

In [None]:
# Fix the seeds so that, for the same seq_TP_all, the 80/20 split and the shuffles
# below are reproducible. random.seed also governs every later random.shuffle call
# made in this kernel session.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)
seq_TP_train, seq_TP_test = train_test_split(seq_TP_all, test_size=0.2, random_state=SPLIT_SEED)
random.shuffle(seq_TP_train)
random.shuffle(seq_TP_test)

In [None]:
# write TP
# Every sequence becomes a FASTA record with an empty header line (">").
# The `with` blocks close each file even if a write (or the second open) fails.
with open(f"artificial_Riv19/Riv19_TP_train.fasta", "w") as f1:
    for seq in seq_TP_train:
        f1.write(f">\n{seq}\n")
with open(f"artificial_Riv19/Riv19_TP_test.fasta", "w") as f2:
    for seq in seq_TP_test:
        f2.write(f">\n{seq}\n")

In [None]:
# Build one motif-free sequence per test TP: take TP sequences in order and shuffle
# their characters until neither motif pattern matches any more.
regex1 = r"TTCGAA"
regex2 = r"GAC......GTC"
# Compile the patterns once instead of on every loop iteration.
motif_patterns = [re.compile(regex1, re.IGNORECASE), re.compile(regex2, re.IGNORECASE)]
seq_FP_test = list()
# cycle() starts over at the first TP sequence if more FPs are needed than TPs
# exist; islice() stops after exactly len(seq_TP_test) sequences.
for seq in itertools.islice(itertools.cycle(seq_TP_all), len(seq_TP_test)):
    l = list(seq)
    candidate = seq
    while any(pattern.search(candidate) for pattern in motif_patterns):
        random.shuffle(l)
        candidate = ''.join(l)
    seq_FP_test.append(candidate)
FP_count = len(seq_FP_test)

In [None]:
# write FP
# NOTE(review): this file goes to "../artificial_Riv19_new/", whereas the other
# files of this notebook are written to "artificial_Riv19/" -- confirm the intended directory.
# The `with` block closes the file even if a write fails.
with open(f"../artificial_Riv19_new/Riv19_FP_test.fasta", "w") as f3:
    for seq in seq_FP_test:
        f3.write(f">\n{seq}\n")

In [None]:
# Build ten motif-free sequences per training TP by shuffling the characters of TP
# sequences; a shuffle is rejected while it still matches a motif pattern or
# equals one of the test FPs.
max_FP_count = len(seq_TP_train) * 10
regex1 = r"TTCGAA"
regex2 = r"GAC......GTC"
# Compile the patterns once instead of on every loop iteration.
motif_patterns = [re.compile(regex1, re.IGNORECASE), re.compile(regex2, re.IGNORECASE)]
# Set lookup replaces the linear scan of the seq_FP_test list inside the retry loop.
fp_test_lookup = set(seq_FP_test)
seq_FP_train = list()
# cycle() starts over at the first TP sequence once all of them were used;
# islice() stops after exactly max_FP_count sequences.
for seq in itertools.islice(itertools.cycle(seq_TP_all), max_FP_count):
    l = list(seq)
    candidate = seq
    while any(pattern.search(candidate) for pattern in motif_patterns) or candidate in fp_test_lookup:
        random.shuffle(l)
        candidate = ''.join(l)
    seq_FP_train.append(candidate)
FP_count = len(seq_FP_train)

In [None]:
def create_TPR_file_train(fraction, seq_TP_train, seq_FP_train_all):
    """
    Write a shuffled training FASTA in which `fraction` of the records are TPs.

    fraction:          target share |TP| / (|TP| + |FP|), with 0 < fraction <= 1.
    seq_TP_train:      TP sequences; all of them are written.
    seq_FP_train_all:  pool of FP sequences; the first |FP| entries are written.

    |FP| is derived from len(seq_TP_train), i.e. from the TPs that actually end up
    in the file, so the written TP share matches `fraction` (up to rounding).
    Prints the resulting counts and writes
    "artificial_Riv19/Riv19_TPR_{fraction}_train.fasta"; the directory must exist.
    Raises AssertionError if `fraction` is outside (0, 1].
    Warns if the FP pool is too small to reach the requested share.
    """
    import warnings

    assert 0.0 < fraction <= 1.0, "fraction must satisfy 0 < fraction <= 1"

    tp_num = len(seq_TP_train)
    fp_num = int(np.round(tp_num / fraction - tp_num))
    if fp_num > len(seq_FP_train_all):
        # Slicing would silently return fewer FPs; make the shortfall visible.
        warnings.warn(
            f"requested {fp_num} FP sequences but only {len(seq_FP_train_all)} are available; "
            f"the TP share will be higher than {fraction}"
        )
        fp_num = len(seq_FP_train_all)

    total = tp_num + fp_num
    # Guard the ratios against an empty dataset (no TPs and therefore no FPs).
    fp_share = fp_num / total if total else float("nan")
    tp_share = tp_num / total if total else float("nan")
    print(f"|TP|={tp_num}, |FP|={fp_num}, |P|={total}, |FP|/|P| = {fp_share}, |TP|/|P| = {tp_share}")

    seq_train = list(seq_TP_train) + list(seq_FP_train_all[:fp_num])
    random.shuffle(seq_train)

    # Each sequence becomes a FASTA record with an empty header line (">").
    with open(f"artificial_Riv19/Riv19_TPR_{fraction}_train.fasta", "w") as f_train:
        for seq in seq_train:
            f_train.write(f">\n{seq}\n")

In [None]:
# Write one training file per target TP share, from 0.9 down to 0.1.
tp_fractions = (0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1)
for fraction in tp_fractions:
    create_TPR_file_train(
        fraction=fraction,
        seq_TP_train=seq_TP_train,
        seq_FP_train_all=seq_FP_train,
    )