In [1]:
import pandas as pd
from collections import defaultdict
from itertools import product
import numpy as np
import scipy.stats
from scipy.stats import norm
import matplotlib.pyplot as plt 
import seaborn as sns
import statistics

In [2]:
import os
# List the contents of the parent directory (relative to the notebook's working directory).
os.listdir("../")

['data', 'notebook']

In [3]:
# Directory the input tables are read from.
path_input = "../data/"
# Directory the k-mer count tables produced by this notebook are written to.
path_output = "../data/mer6/"
# Create the output directory up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(path_output, exist_ok=True)

In [4]:
# List the files available in the input data directory.
os.listdir("../data/")

['mer6',
 'Supplementary Table10_apex_mRNA_transcripts.csv',
 'Supplementary Table9_apex_lncRNA_transcripts.csv',
 'Supplementary_Table3_lncRNA_train_seq_with_RCI_longest_transcripts.csv',
 'Supplementary_Table4_lncRNA_test_seq_with_RCI_longest_transcripts.csv',
 'Supplementary_Table5_mRNA_train_seq_with_RCI_longest_transcripts.csv',
 'Supplementary_Table6_mRNA_test_seq_with_RCI_longest_transcripts.csv',
 'Supplementary_Table7_mean_RCI_positive.canonical.tsv',
 'Supplementary_Table8_mean_RCI_negative.canonical.tsv']

In [5]:
# File name (inside path_input) of the table whose sequences are turned into k-mer counts.
build_kmer_for_seq = "Supplementary_Table3_lncRNA_train_seq_with_RCI_longest_transcripts.csv"

In [6]:
# Load the selected table; the first CSV column is used as the DataFrame index.
train_set = pd.read_csv(path_input + build_kmer_for_seq,index_col=0)

# train_A549 = train_set.iloc[:,:6].dropna(how="any").reset_index(drop=True)
# mean = train_A549["A549"].mean()
# std = np.std(train_A549["A549"])
# print("mean " + str(round(mean,3)))
# print("std "+str(round(std,3)))
# print("max "+str(round(train_A549["A549"].max(),3)))
# print("min "+str(round(train_A549["A549"].min(),3)))
# print("The size of test set: "+str(len(train_set)))
# print(train_A549.head(2))

# fig = plt.figure()
# sns.set_style('darkgrid')

# ax = sns.histplot(train_A549["A549"],kde=True, stat="density")
# plt.axvline(mean,color="red",label = 'mean '+str(round(mean,3)))
# plt.axvline(mean+std,color="yellow",label = 'mean + 0.25 std '+str(round(mean+0.25*std,3))+". With "+str(len(train_A549[train_A549["A549"]>mean+0.25*std]))+" entries")
# plt.axvline(mean-std,color="brown",label = 'mean - 0.25 std '+str(round(mean-0.25*std,3))+". With "+str(len(train_A549[train_A549["A549"]<mean-0.25*std]))+" entries")
# plt.legend(loc = 'upper right')

# plt.show()

## 6-mer counts: cnts0, cnts1, cnts2, cnts3 mean exact match, 1 mismatch, 2 mismatches, 3 mismatches

In [7]:
def hamming_distance(x,y):
    """Return the number of positions at which sequences ``x`` and ``y`` differ.

    Elements are paired with ``zip``, so when the inputs differ in length
    only the overlapping prefix is compared.
    """
    return sum(xi != yi for xi, yi in zip(x, y))


def occurrences(seq, qmers, q):
    """Count exact and near matches of each q-mer among the windows of ``seq``.

    Every length-``q`` window of ``seq`` is considered. For each q-mer in
    ``qmers`` that occurs at least once in ``seq``, the function counts how
    many windows lie at Hamming distance 0, 1, 2 and 3 from it.

    Notes:
        * A q-mer that never occurs exactly in ``seq`` keeps a count of 0 in
          all four rows, even when near matches of it exist in ``seq``.
        * Windows that are not in ``qmers`` (for example ones containing
          ``N``) get no row of their own, but they are still compared
          against the tracked q-mers and count as mismatching windows.

    Args:
        seq: Sequence string to scan.
        qmers: Iterable of q-mers to report, defining the output order.
        q: Window length.

    Returns:
        A tuple ``(exact, one_mismatch, two_mismatches, three_mismatches)``.
        Each element is a dict values view holding one integer count per
        q-mer, in the order of ``qmers``.
    """
    # Materialise once so generators can be reused for all four rows.
    qmers = list(qmers)

    # Tally each distinct window once; identical windows share one distance
    # computation further down and contribute with their multiplicity.
    window_counts = defaultdict(int)
    for start in range(len(seq) - q + 1):
        window_counts[seq[start:start + q]] += 1

    # rows[d][kmer] = number of windows at Hamming distance d from kmer.
    rows = [dict.fromkeys(qmers, 0) for _ in range(4)]
    tracked = rows[0]

    for kmer in window_counts:
        if kmer not in tracked:
            # Window is not one of the requested q-mers (e.g. contains N).
            continue
        for window, n_windows in window_counts.items():
            dist = hamming_distance(kmer, window)
            if dist < len(rows):
                rows[dist][kmer] += n_windows

    return tuple(row.values() for row in rows)

# Columns of the training table; they are appended to every count table below.
cols_train = train_set.columns

# Window length and the full list of q-mers over the ACGT alphabet (column order).
q = 6
kmers = [''.join(i) for i in product("ACGT", repeat=q)]

# Number of sequences to process. None processes the whole table; set a small
# integer for a quick smoke test. Only the processed rows are written out, so
# a partial run never produces rows of silent zeros.
max_sequences = None
n_seqs = len(train_set) if max_sequences is None else min(max_sequences, len(train_set))
train_subset = train_set.iloc[:n_seqs]
seqs = train_subset.sequence

# countsD[i, k] = number of windows of sequence i at Hamming distance D from
# kmers[k], as returned by occurrences().
counts0 = np.zeros([n_seqs, 4**q], dtype=int)
counts1 = np.zeros([n_seqs, 4**q], dtype=int)
counts2 = np.zeros([n_seqs, 4**q], dtype=int)
counts3 = np.zeros([n_seqs, 4**q], dtype=int)

for i in range(n_seqs):
    # Positional access: the table index comes from the CSV and need not be 0..n-1.
    a, b, c, d = occurrences(seqs.iloc[i], kmers, q)
    counts0[i] = list(a)
    counts1[i] = list(b)
    counts2[i] = list(c)
    counts3[i] = list(d)

    if i % 800 == 0:
        print("counts ", i)

# Reuse the training table's index so the metadata columns align row by row.
kmer_counts0 = pd.DataFrame(data=counts0, columns=list(kmers), index=train_subset.index)
kmer_counts1 = pd.DataFrame(data=counts1, columns=list(kmers), index=train_subset.index)
kmer_counts2 = pd.DataFrame(data=counts2, columns=list(kmers), index=train_subset.index)
kmer_counts3 = pd.DataFrame(data=counts3, columns=list(kmers), index=train_subset.index)

# Cumulative tables: "N miss" means at most N mismatches. pd.concat builds new
# frames, so the raw kmer_counts* tables are left untouched.
metadata = train_subset[cols_train]
_kmer0miss = pd.concat([kmer_counts0, metadata], axis=1)
_kmer1miss = pd.concat([kmer_counts0 + kmer_counts1, metadata], axis=1)
_kmer2miss = pd.concat([kmer_counts0 + kmer_counts1 + kmer_counts2, metadata], axis=1)
_kmer3miss = pd.concat([kmer_counts0 + kmer_counts1 + kmer_counts2 + kmer_counts3, metadata], axis=1)

# NOTE(review): the file names say "coding_apex" although the input selected in
# build_kmer_for_seq is the lncRNA training table - confirm the intended names.
_kmer0miss.to_csv(path_output + "rawcounts_coding_apex_6mer0miss_200-5000_longest.csv",mode='w',encoding='utf8')
_kmer1miss.to_csv(path_output + "rawcounts_coding_apex_6mer1miss_200-5000_longest.csv",mode='w',encoding='utf8')
_kmer2miss.to_csv(path_output + "rawcounts_coding_apex_6mer2miss_200-5000_longest.csv",mode='w',encoding='utf8')
_kmer3miss.to_csv(path_output + "rawcounts_coding_apex_6mer3miss_200-5000_longest.csv",mode='w',encoding='utf8')

counts  0
