In [None]:
"""
Generate poly-A dataset from human data only.
Poly-A data is taken from DeeReCT-PolyA; the DeeReCT-PolyA model uses 5-fold cross-validation:
three folds for training, one for validation, and one for testing.
For this section, the first, second, and third folds are used for training, the fourth for validation, and the fifth for testing.

DeeReCT-PolyA uses multiple datasets: Dragon human (Kalkatawi et al., 2012) and Omni human (Magana-Mora et al., 2017).
The Omni dataset is chosen because it is newer (2017 vs. 2012) and contains more data (Xia et al., 2018).
"""
# DeeReCT-PolyA human source directories. Only the Omni directories are used
# below; the Dragon paths are not referenced by the code in this cell.
dragon_human_pos_dir = './data/poly-a/deerectpolya/human/dragon_polyA_data/positive5fold'
dragon_human_neg_dir = './data/poly-a/deerectpolya/human/dragon_polyA_data/negative5fold'
omni_human_pos_dir = './data/poly-a/deerectpolya/human/omni_polyA_data/positive'
omni_human_neg_dir = './data/poly-a/deerectpolya/human/omni_polyA_data/negative'

from os import listdir
from os.path import isfile, basename

pos_dir = omni_human_pos_dir
neg_dir = omni_human_neg_dir

# Collect the regular files of each directory (sub-directories are skipped).
# listdir() returns entries in arbitrary order, so the paths are sorted to keep
# the row order of the generated CSV files reproducible across runs and
# machines; the seeded sampling done later depends on that order.
pos_files = sorted(
    path
    for path in ('{}/{}'.format(pos_dir, a) for a in listdir(pos_dir))
    if isfile(path)
)
neg_files = sorted(
    path
    for path in ('{}/{}'.format(neg_dir, a) for a in listdir(neg_dir))
    if isfile(path)
)

#print(len(pos_files))
#print(len(neg_files))

# NOTE(review): this cell writes to './dataset/poly-a' while a later cell reads
# the same file names from './dataset/polya' -- confirm which one is intended.
dataset_dir = './dataset/poly-a'
pos_dataset_path = '{}/pos_polya.csv'.format(dataset_dir)
neg_dataset_path = '{}/neg_polya.csv'.format(dataset_dir)

# Build one CSV per class: every non-empty line of every source file becomes a
# `sequence,label` row ('1' for the positive files, '0' for the negative ones).
files = [(pos_files, pos_dataset_path, '1'), (neg_files, neg_dataset_path, '0')]
for fs, dataset_path, label in files:
    # Mode 'w' truncates an existing file, which replaces the former
    # remove-then-create('x') sequence and its use of an un-imported `os`.
    with open(dataset_path, 'w') as t:
        t.write('{}\n'.format(','.join(['sequence', 'label'])))
        for fpath in fs:
            # Best effort: report a source file that cannot be read and carry
            # on with the next one. The context manager closes the file in
            # every case (the old code called close() on a dict placeholder
            # when open() itself failed).
            try:
                with open(fpath, 'r') as f:
                    for line in f:
                        line = line.strip()
                        if not line:
                            # A blank line would become a row with an empty
                            # sequence, which pandas later reads back as NaN.
                            continue
                        t.write('{},{}\n'.format(line, label))
            except (OSError, UnicodeError) as e:
                print('Error {}'.format(e))

dataset_dir = './dataset/poly-a'
pos_dataset_path = '{}/pos_polya.csv'.format(dataset_dir)
neg_dataset_path = '{}/neg_polya.csv'.format(dataset_dir)

# Number of rows drawn from each class before splitting.
n_sample = 100

import pandas as pd

polya_pos_df = pd.read_csv(pos_dataset_path)
polya_neg_df = pd.read_csv(neg_dataset_path)

sample_polya_pos_df = polya_pos_df.sample(n=n_sample, random_state=1337)
# The negative sample must come from the negative frame; it used to be drawn
# from `polya_pos_df`, which produced a second copy of the positive sample.
sample_polya_neg_df = polya_neg_df.sample(n=n_sample, random_state=1337)

# 80% of each class goes to training; the remaining 20% is halved into
# test and validation.
pos_train_df = sample_polya_pos_df.sample(frac=0.8, random_state=1337)
neg_train_df = sample_polya_neg_df.sample(frac=0.8, random_state=1337)
pos_val_df = sample_polya_pos_df.drop(pos_train_df.index)
neg_val_df = sample_polya_neg_df.drop(neg_train_df.index)
pos_test_df = pos_val_df.sample(frac=0.5, random_state=1337)
neg_test_df = neg_val_df.sample(frac=0.5, random_state=1337)
pos_val_df = pos_val_df.drop(pos_test_df.index)
neg_val_df = neg_val_df.drop(neg_test_df.index)

# Merge pos & neg train, val, and test data.
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the
# same way and, like append, keeps each frame's original index.
polya_sample_train = pd.concat([pos_train_df, neg_train_df])
polya_sample_val = pd.concat([pos_val_df, neg_val_df])
polya_sample_test = pd.concat([pos_test_df, neg_test_df])

# Write those datasets.
# The index is written as an extra first column here (no index=False), which
# is kept unchanged so the file layout stays the same.
polya_sample_train.to_csv('./sample/polya/training_sample.csv')
polya_sample_val.to_csv('./sample/polya/validation_sample.csv')
polya_sample_test.to_csv('./sample/polya/test_sample.csv')

In [1]:
"""
Split positive and negative polya data into three parts for training, validation, and test set.
Process both data using pandas.
"""
import pandas as pd
import os
from data_preparation import generate_datasets, generate_sample

polya_dataset_dir = './dataset/polya'
pos_polya_dataset = '{}/pos_polya.csv'.format(polya_dataset_dir)
neg_polya_dataset = '{}/neg_polya.csv'.format(polya_dataset_dir)
pos_polya_balanced = '{}/pos_polya_balanced.csv'.format(polya_dataset_dir)
neg_polya_balanced = '{}/neg_polya_balanced.csv'.format(polya_dataset_dir)
pos_polya_dataset_dir = '{}/pos_polya_dataset'.format(polya_dataset_dir)
neg_polya_dataset_dir = '{}/neg_polya_dataset'.format(polya_dataset_dir)

pos_polya_df = pd.read_csv(pos_polya_dataset)
neg_polya_df = pd.read_csv(neg_polya_dataset)
pos_polya_df_size = len(pos_polya_df)
neg_polya_df_size = len(neg_polya_df)

# Both classes must contribute the same number of rows, so the smaller class
# decides how many rows are kept from each.
count = min(pos_polya_df_size, neg_polya_df_size)
if pos_polya_df_size != neg_polya_df_size:
    print('data imbalance at pos = {} and neg = {}.\nSelect count = {}.'.format(pos_polya_df_size, neg_polya_df_size, count))
else:
    print('both are {}. data balance.'.format(pos_polya_df_size))

# Seeded down-sampling of each class to `count` rows.
pos_polya_df_balanced = pos_polya_df.sample(n=count, random_state=1337)
neg_polya_df_balanced = neg_polya_df.sample(n=count, random_state=1337)

# Persist the balanced frames (without the index column).
for balanced_df, balanced_csv in (
    (pos_polya_df_balanced, pos_polya_balanced),
    (neg_polya_df_balanced, neg_polya_balanced),
):
    balanced_df.to_csv(balanced_csv, index=False)

# Split each balanced file into train / validation / test parts.
for balanced_csv, split_dir in (
    (pos_polya_balanced, pos_polya_dataset_dir),
    (neg_polya_balanced, neg_polya_dataset_dir),
):
    generate_datasets(balanced_csv, split_dir)

polya_dataset_dirs = [pos_polya_dataset_dir, neg_polya_dataset_dir]
files = ['train', 'validation', 'test']
# Number of rows drawn from each split (before window expansion).
n_sample = {
    'train': 80,
    'validation': 10,
    'test': 10
}
for d in polya_dataset_dirs:
    for f in files:
        fpath = '{}/{}.csv'.format(d, f)
        gpath = '{}/{}_sample.csv'.format(d, f)

        # Read and sample before touching the output so that a failure here
        # does not leave a header-only sample file behind.
        df = pd.read_csv(fpath)
        sample_df = df.sample(n=n_sample[f], random_state=1337)

        # 'w' truncates a previous output (replaces remove + 'x'); the context
        # manager closes the handle even if a write raises.
        with open(gpath, 'w') as g:
            g.write('{},{}\n'.format('sequence', 'label'))
            for _, r in sample_df.iterrows():
                seq = r['sequence']
                label = r['label']
                if len(seq) > 512:
                    # Expand a long sequence into every 512-character window
                    # (stride 1); each window keeps the row's label.
                    for start in range(len(seq) + 1 - 512):
                        g.write('{},{}\n'.format(seq[start:start + 512], label))
                else:
                    g.write('{},{}\n'.format(seq, label))


both are 18786. data balance.


In [1]:
"""
Create splice-site dataset.
"""
from os.path import basename

# Raw human splice-site sequence files: acceptor/donor x positive/negative.
# `ss_dir` already ends with '/', so the joined paths contain a double slash;
# that is kept as-is.
ss_dir = './data/splice-sites/splice-deep/'
pos_acc_ss_hs = ss_dir + '/positive_DNA_seqs_acceptor_hs.fa'
pos_don_ss_hs = ss_dir + '/positive_DNA_seqs_donor_hs.fa'
neg_acc_ss_hs = ss_dir + '/negative_DNA_seqs_acceptor_hs.fa'
neg_don_ss_hs = ss_dir + '/negative_DNA_seqs_donor_hs.fa'

# Labelled CSV files generated from the raw files above.
ss_dataset_dir = './dataset/splice-sites'
pos_ss_acc_dataset = ss_dataset_dir + '/pos_ss_acc_hs.csv'
pos_ss_don_dataset = ss_dataset_dir + '/pos_ss_don_hs.csv'
neg_ss_acc_dataset = ss_dataset_dir + '/neg_ss_acc_hs.csv'
neg_ss_don_dataset = ss_dataset_dir + '/neg_ss_don_hs.csv'


In [2]:
# (source file, label, site type, output CSV) for each splice-site class.
files = [(pos_acc_ss_hs, 1, 'acc', pos_ss_acc_dataset), 
            (pos_don_ss_hs, 1, 'don', pos_ss_don_dataset), 
            (neg_acc_ss_hs, 0, 'acc', neg_ss_acc_dataset), 
            (neg_don_ss_hs, 0, 'don', neg_ss_don_dataset)]
# NOTE(review): the inputs have a '.fa' extension. If they contain FASTA header
# lines ('>...'), those would be written as sequences -- confirm that each file
# holds one bare sequence per line.
for fname, label, _site_type, dataset_path in files:
    # Best effort per file pair: report the failure and continue with the next
    # pair. The context managers close both handles in every case (the old
    # handler called close() on dict placeholders when open() itself failed).
    try:
        # 'w' truncates a previous output, replacing the remove + 'x' sequence.
        with open(fname, 'r') as f, open(dataset_path, 'w') as t:
            t.write('{}\n'.format(','.join(['sequence', 'label'])))
            # Sequences are written at full length here; splitting into
            # 512-character windows is done on the sampled files later.
            for line in f:
                line = line.strip()
                if not line:
                    # A blank line would become a row with an empty sequence.
                    continue
                t.write('{},{}\n'.format(line, label))
    except (OSError, UnicodeError) as e:
        print('Error {}'.format(e))


In [3]:
"""
Create train, validation, and test sets for splice sites. To do that, the data needs to be balanced.
If it is not, each set is down-sampled to the smallest class count. Processing is done using pandas.
"""
import pandas as pd

# Load the four labelled splice-site CSV files into DataFrames, in the order
# positive acceptor, positive donor, negative acceptor, negative donor.
pos_ss_acc_df, pos_ss_don_df, neg_ss_acc_df, neg_ss_don_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        pos_ss_acc_dataset,
        pos_ss_don_dataset,
        neg_ss_acc_dataset,
        neg_ss_don_dataset,
    )
]

In [4]:
# Because loading the dataframe is time consuming, leave the loading at cell above and do later processing here.
# Row count of each of the four splice-site frames.
pos_ss_acc_size, pos_ss_don_size, neg_ss_acc_size, neg_ss_don_size = map(
    len, (pos_ss_acc_df, pos_ss_don_df, neg_ss_acc_df, neg_ss_don_df)
)
_ss_sizes = [pos_ss_acc_size, pos_ss_don_size, neg_ss_acc_size, neg_ss_don_size]

# The smallest frame decides how many rows every frame keeps.
count = min(_ss_sizes)
if len(set(_ss_sizes)) == 1:
    print('dataset balance')
else:
    print('dataset imbalance')
    print('pos acc {}\npos don {}\nneg acc {}\nneg don {}'.format(*_ss_sizes))
    print('count = {}'.format(count))

# Seeded sampling without replacement, identical settings for all four frames.
_ss_sample_args = dict(n=count, replace=False, random_state=1337)
pos_ss_acc_df_sample = pos_ss_acc_df.sample(**_ss_sample_args)
pos_ss_don_df_sample = pos_ss_don_df.sample(**_ss_sample_args)
neg_ss_acc_df_sample = neg_ss_acc_df.sample(**_ss_sample_args)
neg_ss_don_df_sample = neg_ss_don_df.sample(**_ss_sample_args)


dataset imbalance
pos acc 248150
pos don 250400
neg acc 248150
neg don 250400
count = 248150


In [5]:
# Output files for the balanced (equal-sized) splice-site samples.
pos_ss_acc_balanced = './dataset/splice-sites/pos_ss_acc_balanced.csv'
pos_ss_don_balanced = './dataset/splice-sites/pos_ss_don_balanced.csv'
neg_ss_acc_balanced = './dataset/splice-sites/neg_ss_acc_balanced.csv'
neg_ss_don_balanced = './dataset/splice-sites/neg_ss_don_balanced.csv'

# Write each balanced sample to its file, without the index column.
for balanced_df, balanced_csv in (
    (pos_ss_acc_df_sample, pos_ss_acc_balanced),
    (pos_ss_don_df_sample, pos_ss_don_balanced),
    (neg_ss_acc_df_sample, neg_ss_acc_balanced),
    (neg_ss_don_df_sample, neg_ss_don_balanced),
):
    balanced_df.to_csv(balanced_csv, index=False)

In [6]:
from data_preparation import generate_datasets

import os

# Target directories for the train/validation/test split of each balanced file.
pos_ss_acc_balanced_dir = './dataset/splice-sites/pos_ss_acc_dataset'
pos_ss_don_balanced_dir = './dataset/splice-sites/pos_ss_don_dataset'
neg_ss_acc_balanced_dir = './dataset/splice-sites/neg_ss_acc_dataset'
neg_ss_don_balanced_dir = './dataset/splice-sites/neg_ss_don_dataset'

dirs = [pos_ss_acc_balanced_dir, pos_ss_don_balanced_dir, neg_ss_acc_balanced_dir, neg_ss_don_balanced_dir]
for d in dirs:
    # Make sure the directory exists. The previous check was inverted: it
    # called mkdir only when the directory already existed, which raises
    # FileExistsError, and never created a missing one.
    os.makedirs(d, exist_ok=True)

# From here on the `*_dataset` names hold what generate_datasets returns
# (printed below), no longer the CSV paths they held in earlier cells.
pos_ss_acc_dataset = generate_datasets(pos_ss_acc_balanced, pos_ss_acc_balanced_dir)
pos_ss_don_dataset = generate_datasets(pos_ss_don_balanced, pos_ss_don_balanced_dir)
neg_ss_acc_dataset = generate_datasets(neg_ss_acc_balanced, neg_ss_acc_balanced_dir)
neg_ss_don_dataset = generate_datasets(neg_ss_don_balanced, neg_ss_don_balanced_dir)

print(pos_ss_acc_dataset)
print(pos_ss_don_dataset)
print(neg_ss_acc_dataset)
print(neg_ss_don_dataset)

['./dataset/splice-sites/pos_ss_acc_dataset/train.csv', './dataset/splice-sites/pos_ss_acc_dataset/validation.csv', './dataset/splice-sites/pos_ss_acc_dataset/test.csv']
['./dataset/splice-sites/pos_ss_don_dataset/train.csv', './dataset/splice-sites/pos_ss_don_dataset/validation.csv', './dataset/splice-sites/pos_ss_don_dataset/test.csv']
['./dataset/splice-sites/neg_ss_acc_dataset/train.csv', './dataset/splice-sites/neg_ss_acc_dataset/validation.csv', './dataset/splice-sites/neg_ss_acc_dataset/test.csv']
['./dataset/splice-sites/neg_ss_don_dataset/train.csv', './dataset/splice-sites/neg_ss_don_dataset/validation.csv', './dataset/splice-sites/neg_ss_don_dataset/test.csv']


In [7]:
from data_preparation import generate_sample
import pandas as pd

"""
Create sample splice sites training and validation data.
"""
pos_ss_acc_balanced_dir = './dataset/splice-sites/pos_ss_acc_dataset'
pos_ss_don_balanced_dir = './dataset/splice-sites/pos_ss_don_dataset'
neg_ss_acc_balanced_dir = './dataset/splice-sites/neg_ss_acc_dataset'
neg_ss_don_balanced_dir = './dataset/splice-sites/neg_ss_don_dataset'

# Number of rows drawn from each split.
n_sample = {
    'train': 80,
    'validation': 10,
    'test': 10
}
ss_dir = [pos_ss_acc_balanced_dir, pos_ss_don_balanced_dir, neg_ss_acc_balanced_dir, neg_ss_don_balanced_dir]
ss_file = ['train', 'validation', 'test']

# For every directory/split pair, write `<split>_sample.csv` next to
# `<split>.csv` with a seeded sample of n_sample[split] rows (no index column).
for split_dir, split in ((a_dir, a_split) for a_dir in ss_dir for a_split in ss_file):
    source_csv = '{}/{}.csv'.format(split_dir, split)
    sample_csv = '{}/{}_sample.csv'.format(split_dir, split)
    split_df = pd.read_csv(source_csv)
    split_df.sample(n=n_sample[split], random_state=1337).to_csv(sample_csv, index=False)

In [8]:
# Sequence length is more than 512 characters so it needs to be expanded.
import pandas as pd
pos_ss_acc_balanced_dir = './dataset/splice-sites/pos_ss_acc_dataset'
pos_ss_don_balanced_dir = './dataset/splice-sites/pos_ss_don_dataset'
neg_ss_acc_balanced_dir = './dataset/splice-sites/neg_ss_acc_dataset'
neg_ss_don_balanced_dir = './dataset/splice-sites/neg_ss_don_dataset'

# Not used by the loop below; kept because it is a cell-level name.
n_sample = {
    'train': 80,
    'validation': 10,
    'test': 10
}
ss_dir = [pos_ss_acc_balanced_dir, pos_ss_don_balanced_dir, neg_ss_acc_balanced_dir, neg_ss_don_balanced_dir]
ss_file = ['train', 'validation', 'test']

for s in ss_dir:
    for f in ss_file:
        fpath = '{}/{}_sample.csv'.format(s, f)
        gpath = '{}/{}_sample_expanded.csv'.format(s, f)

        # Read first so that a failure here does not leave a header-only
        # expanded file behind.
        df = pd.read_csv(fpath)

        # 'w' truncates a previous output (replaces remove + 'x'); the context
        # manager closes the handle even if a write raises.
        with open(gpath, 'w') as g:
            g.write('{},{}\n'.format('sequence', 'label'))
            for _, r in df.iterrows():
                seq = r['sequence']
                label = r['label']
                if len(seq) > 512:
                    # Expand a long sequence into every 512-character window
                    # (stride 1); each window keeps the row's label.
                    for start in range(len(seq) + 1 - 512):
                        g.write('{},{}\n'.format(seq[start:start + 512], label))
                else:
                    g.write('{},{}\n'.format(seq, label))

In [7]:
"""
Generating promoter dataset from EPD datasets (Dreos et. al., 2013).
`human_tata_data` and `human_non_tata_data` are already expanded.
From each positive and negative set, generate train, validation, and test set.
"""
from data_preparation import generate_datasets, generate_sample
import pandas as pd

# Promoter source files: the TATA file feeds the positive set, the non-TATA
# file feeds the negative set.
human_tata_data = './data/epd/human_tata.csv'
human_non_tata_data = './data/epd/human_non_tata.csv'

human_prom_dataset_dir = './dataset/promoter'
human_prom_pos_dataset_dir, human_prom_neg_dataset_dir = (
    '{}/{}_human_prom_dataset'.format(human_prom_dataset_dir, sign)
    for sign in ('pos', 'neg')
)

# Split each source file into train / validation / test parts. The second call
# stays a bare trailing expression so the notebook still shows its return value.
generate_datasets(human_tata_data, human_prom_pos_dataset_dir)
generate_datasets(human_non_tata_data, human_prom_neg_dataset_dir)

['./dataset/promoter/neg_human_prom_dataset/train.csv',
 './dataset/promoter/neg_human_prom_dataset/validation.csv',
 './dataset/promoter/neg_human_prom_dataset/test.csv']

In [8]:
"""
Generate sample from each train, validation, and test from positive and negative promoter set.
i.e. train.csv => train_sample.csv
"""
from data_preparation import generate_sample
import pandas as pd

# Not passed to generate_sample below; kept because it is a cell-level name.
n_sample = {
    'train': 80,
    'validation': 10,
    'test': 10
}

human_prom_dataset_dir = './dataset/promoter'
human_prom_pos_dataset_dir = '{}/pos_human_prom_dataset'.format(human_prom_dataset_dir)
human_prom_neg_dataset_dir = '{}/neg_human_prom_dataset'.format(human_prom_dataset_dir)

prom_dir = [human_prom_pos_dataset_dir, human_prom_neg_dataset_dir]
prom_file = ['train', 'validation', 'test']

# (input, output) pairs: <dir>/<split>.csv -> <dir>/<split>_sample.csv,
# directory by directory, splits in train/validation/test order.
sample_jobs = [
    ('{}/{}.csv'.format(split_dir, split), '{}/{}_sample.csv'.format(split_dir, split))
    for split_dir in prom_dir
    for split in prom_file
]
for source_csv, sample_csv in sample_jobs:
    generate_sample(source_csv, sample_csv)


In [8]:
"""
Generate sample training and testing data from promoter, splice-sites, and poly-a.
"""
import pandas as pd

# Combined (multi-task) sample files that the per-task samples are merged into.
dataset_dir = './dataset'
dataset_train_sample = dataset_dir + '/train_sample.csv'
dataset_validation_sample = dataset_dir + '/validation_sample.csv'
dataset_test_sample = dataset_dir + '/test_sample.csv'

# Promoter samples (positive / negative).
prom_dataset_dir = dataset_dir + '/promoter'
prom_dataset_pos_dir = prom_dataset_dir + '/pos_human_prom_dataset'
prom_dataset_pos_train_sample = prom_dataset_pos_dir + '/train_sample.csv'
prom_dataset_pos_validation_sample = prom_dataset_pos_dir + '/validation_sample.csv'
prom_dataset_pos_test_sample = prom_dataset_pos_dir + '/test_sample.csv'

prom_dataset_neg_dir = prom_dataset_dir + '/neg_human_prom_dataset'
prom_dataset_neg_train_sample = prom_dataset_neg_dir + '/train_sample.csv'
prom_dataset_neg_validation_sample = prom_dataset_neg_dir + '/validation_sample.csv'
prom_dataset_neg_test_sample = prom_dataset_neg_dir + '/test_sample.csv'

# Poly-A samples (positive / negative).
polya_dataset_dir = dataset_dir + '/polya'
polya_dataset_pos_dir = polya_dataset_dir + '/pos_polya_dataset'
polya_dataset_pos_train_sample = polya_dataset_pos_dir + '/train_sample.csv'
polya_dataset_pos_validation_sample = polya_dataset_pos_dir + '/validation_sample.csv'
polya_dataset_pos_test_sample = polya_dataset_pos_dir + '/test_sample.csv'

polya_dataset_neg_dir = polya_dataset_dir + '/neg_polya_dataset'
polya_dataset_neg_train_sample = polya_dataset_neg_dir + '/train_sample.csv'
polya_dataset_neg_validation_sample = polya_dataset_neg_dir + '/validation_sample.csv'
polya_dataset_neg_test_sample = polya_dataset_neg_dir + '/test_sample.csv'

# Splice-site samples use the window-expanded files (acceptor/donor x pos/neg).
ss_dataset_dir = dataset_dir + '/splice-sites'
ss_dataset_acc_pos_dir = ss_dataset_dir + '/pos_ss_acc_dataset'
ss_dataset_acc_pos_train_sample = ss_dataset_acc_pos_dir + '/train_sample_expanded.csv'
ss_dataset_acc_pos_validation_sample = ss_dataset_acc_pos_dir + '/validation_sample_expanded.csv'
ss_dataset_acc_pos_test_sample = ss_dataset_acc_pos_dir + '/test_sample_expanded.csv'

ss_dataset_don_pos_dir = ss_dataset_dir + '/pos_ss_don_dataset'
ss_dataset_don_pos_train_sample = ss_dataset_don_pos_dir + '/train_sample_expanded.csv'
ss_dataset_don_pos_validation_sample = ss_dataset_don_pos_dir + '/validation_sample_expanded.csv'
ss_dataset_don_pos_test_sample = ss_dataset_don_pos_dir + '/test_sample_expanded.csv'

ss_dataset_acc_neg_dir = ss_dataset_dir + '/neg_ss_acc_dataset'
ss_dataset_acc_neg_train_sample = ss_dataset_acc_neg_dir + '/train_sample_expanded.csv'
ss_dataset_acc_neg_validation_sample = ss_dataset_acc_neg_dir + '/validation_sample_expanded.csv'
ss_dataset_acc_neg_test_sample = ss_dataset_acc_neg_dir + '/test_sample_expanded.csv'

ss_dataset_don_neg_dir = ss_dataset_dir + '/neg_ss_don_dataset'
ss_dataset_don_neg_train_sample = ss_dataset_don_neg_dir + '/train_sample_expanded.csv'
ss_dataset_don_neg_validation_sample = ss_dataset_don_neg_dir + '/validation_sample_expanded.csv'
ss_dataset_don_neg_test_sample = ss_dataset_don_neg_dir + '/test_sample_expanded.csv'

# Schema of the combined sample files: one sequence plus one label per task.
columns = ['sequence','label_prom','label_ss', 'label_polya']
header = ','.join(columns)

target_paths = [dataset_train_sample, dataset_validation_sample, dataset_test_sample]
files = ['train_sample', 'validation_sample', 'test_sample']
# Start from a clean slate: delete combined files left over from a previous run.
for stale_path in filter(os.path.exists, target_paths):
    os.remove(stale_path)

# Each *_dfps list pairs a per-task sample file with the combined file it is
# merged into. Sources are listed in train/validation/test order per group, so
# they line up with `target_paths` repeated once per group.

# Merge promoter.
prom_dfps = list(zip(
    [
        prom_dataset_pos_train_sample,
        prom_dataset_pos_validation_sample,
        prom_dataset_pos_test_sample,
        prom_dataset_neg_train_sample,
        prom_dataset_neg_validation_sample,
        prom_dataset_neg_test_sample,
    ],
    target_paths * 2,
))

ss_dfps = list(zip(
    [
        ss_dataset_acc_pos_train_sample,
        ss_dataset_acc_pos_validation_sample,
        ss_dataset_acc_pos_test_sample,
        ss_dataset_acc_neg_train_sample,
        ss_dataset_acc_neg_validation_sample,
        ss_dataset_acc_neg_test_sample,
        ss_dataset_don_pos_train_sample,
        ss_dataset_don_pos_validation_sample,
        ss_dataset_don_pos_test_sample,
        ss_dataset_don_neg_train_sample,
        ss_dataset_don_neg_validation_sample,
        ss_dataset_don_neg_test_sample,
    ],
    target_paths * 4,
))

polya_dfps = list(zip(
    [
        polya_dataset_pos_train_sample,
        polya_dataset_pos_validation_sample,
        polya_dataset_pos_test_sample,
        polya_dataset_neg_train_sample,
        polya_dataset_neg_validation_sample,
        polya_dataset_neg_test_sample,
    ],
    target_paths * 2,
))

def merge_dataset(dfps, target_label=""):
    """Append single-task sample files to the combined multi-label CSV files.

    Args:
        dfps: iterable of ``(src_path, target_path)`` pairs. Each source CSV
            is read with pandas and must provide ``sequence`` and ``label``
            columns. The target CSV is extended if it exists and created
            otherwise.
        target_label: task the sources belong to -- ``"prom"``, ``"ss"`` or
            ``"polya"``. The source label is copied into the matching
            ``label_*`` column and the other label columns are set to 0. Any
            other value (including the default ``""``) sets all three to 0.

    Returns:
        None. Every target file is rewritten without an index column.
    """
    for src_path, target_path in dfps:
        src_df = pd.read_csv(src_path)
        src_labels = src_df['label']

        # Build all new rows in one frame instead of appending row by row:
        # DataFrame.append was removed in pandas 2.0 and copied the whole
        # target on every call. Scalar zeros are broadcast to every row.
        new_rows = pd.DataFrame({
            'sequence': src_df['sequence'],
            'label_prom': src_labels if target_label == "prom" else 0,
            'label_ss': src_labels if target_label == "ss" else 0,
            'label_polya': src_labels if target_label == "polya" else 0,
        })

        if os.path.exists(target_path):
            merged = pd.concat([pd.read_csv(target_path), new_rows], ignore_index=True)
        else:
            # New target: lay the rows out in the module-level `columns` order.
            merged = new_rows.reindex(columns=columns)
        merged.to_csv(target_path, index=False)

# Merge the per-task samples into the combined files: promoter first, then
# splice sites, then poly-A.
for task_dfps, task_name in ((prom_dfps, "prom"), (ss_dfps, "ss"), (polya_dfps, "polya")):
    merge_dataset(task_dfps, target_label=task_name)
