In [2]:
import os
import re
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

In [3]:

def data_extract(DeepMirTar_txt, dt='DeepMirTar'):

    encode = dict(zip('NAUCG', range(5)))

    df = pd.read_csv(DeepMirTar_txt, sep="\t", header=None)
    #print(df.iloc[0])
    assert dt in ['DeepMirTar', 'miRaw']
    if dt=='DeepMirTar':
        df = df.drop(df.index[0]).reset_index(drop=True)

    df = df[df[3].str.len()<=40] ## remove 52_samples miRNA are more than 40

    max_RNA = max(df[1].str.len().max(), 26)
    max_miRNA = max(df[3].str.len().max(), 40)
    print(max_RNA, max_miRNA)
    
    df[1] = [x + 'N'*(max_RNA-len(x)) for x in df[1].tolist()]
    df[3] = [x.replace('T','U') + 'N'*(max_miRNA-len(x)) for x in df[3].tolist()]
    df[5] = df[1] + df[3]
    df['data'] = df[5].apply(lambda x: np.array([encode[a.upper()] for a in x], np.int64))
    
    DeepMirTar_data = np.stack(df['data'])
    DeepMirTar_label = np.stack(df[4])
    DeepMirTar_label = np.expand_dims(DeepMirTar_label, -1)
    DeepMirTar_label = np.int64(DeepMirTar_label)
    return DeepMirTar_data, DeepMirTar_label


In [6]:

def process_data():
    skf = StratifiedKFold(n_splits=5, shuffle=True)
    data = {}
    for dt in ['DeepMirTar', 'miRaw']:
        data_txt = 'data/data_miRaw_noL_noMisMissing_remained_seed1122.txt' if dt=='miRaw' else 'data/data_DeepMirTar_removeMisMissing_remained_seed1122.txt'
        data_txt_ind = 'data/data_miRaw_noL_noMisMissing_indTest_seed1122_Unique.txt' if dt=='miRaw' else 'data/data_DeepMirTar_test.txt'
        
        data[dt] = {}
        Seq_data, Seq_label = data_extract(data_txt, dt=dt)
        for i, (_, fol_index) in enumerate(skf.split(Seq_data, Seq_label)):
            data[dt][i] = [Seq_data[fol_index], Seq_label[fol_index]]
    
        Seq_data_ind, Seq_label_ind = data_extract(data_txt_ind, dt=dt)
        data[dt][5] = [Seq_data_ind, Seq_label_ind]
    
        with open('data_full_0411.pickle', 'wb') as handle:
            pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

def get_Seq_train(data, data_dt, fol):

    idxs = np.arange(4).tolist()
    idxs.remove(fol)
    
    Seq_train = []
    Seq_train_label = []
    
    for i in idxs:
        x, y = data[data_dt][i]
        Seq_train.append(x)
        Seq_train_label.append(y)
        
    Seq_train = np.concatenate(Seq_train, axis=0)
    Seq_train_label = np.concatenate(Seq_train_label, axis=0)
    return Seq_train, Seq_train_label


In [None]:
# open the data store in pickle
def load_data(data_dict, dt, fol):

    with open(data_dict, 'rb') as handle:
        data= pickle.load(handle)
    
    if dt == 0:
        data_dt = 'miRaw' 
    elif dt == 1:
        data_dt = 'DeepMirTar'
    else:
        print('incorrect dataset')
        assert False
    # check the data is correct
    assert data_dt in data
    # print the data
    # for key, value in data.items():
    #     print(f"{key}: {value}")
    Seq_ind, Seq_ind_label = data[data_dt][4]
    print(Seq_ind.shape, Seq_ind_label.shape, Seq_ind_label.dtype, Seq_ind_label[:2])
    
    Seq_val, Seq_val_label = data[data_dt][fol]
    print(Seq_val.shape, Seq_val_label.shape, Seq_val_label.dtype, Seq_val_label[:2])
    
    Seq_train, Seq_train_label = get_Seq_train(data, data_dt, fol)
    print(Seq_train.shape, Seq_train_label.shape, Seq_train_label.dtype, Seq_train_label[:2])

    return Seq_train, Seq_train_label, Seq_val, Seq_val_label, Seq_ind, Seq_ind_label

data_dict = 'data_full_0411.pickle'
fol = 3
# read 'DeepMirTar'
dt = 1

Seq_train, Seq_train_label, Seq_val, Seq_val_label, Seq_ind, Seq_ind_label = load_data(data_dict, dt, fol)

(1541, 66) (1541, 1) int64 [[1]
 [1]]
(1541, 66) (1541, 1) int64 [[1]
 [1]]
(4624, 66) [[1]
 [1]
 [1]
 ...
 [0]
 [0]
 [0]] int64 [[1]
 [1]]


  data= pickle.load(handle)


In [12]:
with open(data_dict, 'rb') as handle:
    data= pickle.load(handle)

for key in data['miRaw']:
    print(key)

0
1
2
3
4
5


  data= pickle.load(handle)


In [None]:
DeepMiTar_Test = 'data/data_DeepMirTar_removeMisMissing_remained_seed1122.txt'
DeepMiTarln = 'data/data_DeepMirTar_test.txt'
miRaw_test = 'data/data_miRaw_noL_noMisMissing_remained_seed1122.txt'
miRawln = 'data/data_miRaw_noL_noMisMissing_indTest_seed1122_Unique.txt'
# DeepMiTar_Test
DeepMiTar_Test_data = {'Positive': 0, 'Negative': 0, 'Number': 0}
with open(DeepMiTar_Test, 'r') as file1:
    for line in file1:
        row = line.strip().split('\t')
        if row[4] == '1':
            DeepMiTar_Test_data['Positive']+=1
            DeepMiTar_Test_data['Number']+=1
        elif row[4] == '0':
            DeepMiTar_Test_data['Negative']+=1
            DeepMiTar_Test_data['Number']+=1
print("DeepMiTar_Test:")
print(DeepMiTar_Test_data)

# DeepMiTarln
DeepMiTarln_data = {'Positive': 0, 'Negative': 0, 'Number': 0}
with open(DeepMiTarln, 'r') as file4:
    for line in file1:
        row = line.strip().split('\t')
        if row[4] == '1':
            DeepMiTarln_data['Positive']+=1
            DeepMiTarln_data['Number']+=1
        elif row[4] == '0':
            DeepMiTarln_data['Negative']+=1
            DeepMiTarln_data['Number']+=1
print("DeepMiTarln:")
print(DeepMiTarln_data)

# miRaw_test
miRaw_test_data = {'Positive': 0, 'Negative': 0, 'Number': 0}
with open(miRaw_test, 'r') as file3:
    for line in file3:
        row = line.strip().split('\t')
        if row[4] == '1':
            miRaw_test_data['Positive']+=1
            miRaw_test_data['Number']+=1
        elif row[4] == '0':
            miRaw_test_data['Negative']+=1
            miRaw_test_data['Number']+=1
print("miRaw_test:")
print(miRaw_test_data)

# DeepMiTarln
miRawln_data = {'Positive': 0, 'Negative': 0, 'Number': 0}
with open(miRawln, 'r') as file2:
    for line in file2:
        row = line.strip().split('\t')
        if row[4] == '1':
            miRawln_data['Positive']+=1
            miRawln_data['Number']+=1
        elif row[4] == '0':
            miRawln_data['Negative']+=1
            miRawln_data['Number']+=1
print("miRawln:")
print(miRawln_data)

DeepMiTar_Test:
{'Positive': 3908, 'Negative': 3850, 'Number': 7758}
miRaw_test:
{'Positive': 31660, 'Negative': 30993, 'Number': 62653}
miRawln:
{'Positive': 929, 'Negative': 890, 'Number': 1819}
