In [None]:
"""
Preparation for splice site data.
"""
import os

# Raw splice-site sequence files: positive/negative x acceptor/donor.
ss_dir = './data/splice-sites/splice-deep/'
pos_acc_ss_hs = '{}/positive_DNA_seqs_acceptor_hs.fa'.format(ss_dir)
pos_don_ss_hs = '{}/positive_DNA_seqs_donor_hs.fa'.format(ss_dir)
neg_acc_ss_hs = '{}/negative_DNA_seqs_acceptor_hs.fa'.format(ss_dir)
neg_don_ss_hs = '{}/negative_DNA_seqs_donor_hs.fa'.format(ss_dir)

# Target CSVs, one per raw file.
ss_dataset_dir = './dataset/splice-sites'
pos_ss_acc_dataset = '{}/pos_ss_acc_hs.csv'.format(ss_dataset_dir)
pos_ss_don_dataset = '{}/pos_ss_don_hs.csv'.format(ss_dataset_dir)
neg_ss_acc_dataset = '{}/neg_ss_acc_hs.csv'.format(ss_dataset_dir)
neg_ss_don_dataset = '{}/neg_ss_don_hs.csv'.format(ss_dataset_dir)

# Each entry is (source file, label, site type, target CSV).
# Label 1 is used for the positive files and 0 for the negative ones.
files = [(pos_acc_ss_hs, 1, 'acc', pos_ss_acc_dataset),
         (pos_don_ss_hs, 1, 'don', pos_ss_don_dataset),
         (neg_acc_ss_hs, 0, 'acc', neg_ss_acc_dataset),
         (neg_don_ss_hs, 0, 'don', neg_ss_don_dataset)]


def write_labeled_csv(src_path, dst_path, label):
    """Convert a one-sequence-per-line text file into a ``sequence,label`` CSV.

    Every line of ``src_path`` is stripped of surrounding whitespace and
    written as one CSV row followed by ``label``. Lines are copied unchanged:
    no length limit or windowing is applied at this stage.

    Args:
        src_path: Path of the text file to read, one sequence per line.
        dst_path: Path of the CSV to write. An existing file is removed before
            the source is opened, and the parent directory is created when it
            is missing.
        label: Value written in the ``label`` column of every row.

    Raises:
        OSError: If the source cannot be read or the target cannot be written.
    """
    # Drop any output from an earlier run first, so a failed conversion is
    # never mistaken for a fresh result.
    if os.path.exists(dst_path):
        os.remove(dst_path)

    with open(src_path, 'r') as src:
        target_dir = os.path.dirname(dst_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        with open(dst_path, 'w') as dst:
            dst.write('{}\n'.format(','.join(['sequence', 'label'])))
            for line in src:
                dst.write('{},{}\n'.format(line.strip(), label))


for fname, label, acc_don, dataset_path in files:
    try:
        write_labeled_csv(fname, dataset_path, label)
    except (OSError, UnicodeError) as e:
        # Best effort: report the failing file and carry on with the others.
        # Both handles are already closed by the context managers.
        print('Error {}'.format(e))


In [None]:
"""
Create train, validation, and test sets for splice sites. To do that, the data needs to be balanced.
If it is not, sampling down to the smallest class count is required. Processing is done using pandas.
"""
import pandas as pd

# Load the four per-class CSVs, in the same order as the path variables.
(pos_ss_acc_df,
 pos_ss_don_df,
 neg_ss_acc_df,
 neg_ss_don_df) = [
    pd.read_csv(csv_path)
    for csv_path in (
        pos_ss_acc_dataset,
        pos_ss_don_dataset,
        neg_ss_acc_dataset,
        neg_ss_don_dataset,
    )
]


In [None]:
# Loading the DataFrames is slow, so it lives in the previous cell; the cheaper
# balancing step below can be re-run on its own.
pos_ss_acc_size, pos_ss_don_size, neg_ss_acc_size, neg_ss_don_size = (
    len(pos_ss_acc_df),
    len(pos_ss_don_df),
    len(neg_ss_acc_df),
    len(neg_ss_don_df),
)
ss_sizes = (pos_ss_acc_size, pos_ss_don_size, neg_ss_acc_size, neg_ss_don_size)

# When every frame has the same size the minimum is that shared size;
# otherwise it is the size of the smallest frame.
count = min(ss_sizes)
if len(set(ss_sizes)) == 1:
    print('dataset balance')
else:
    print('dataset imbalance')
    print('pos acc {}\npos don {}\nneg acc {}\nneg don {}'.format(*ss_sizes))
    print('count = {}'.format(count))

# Draw `count` rows from each frame, without replacement and with a fixed seed.
sampling_options = {'n': count, 'replace': False, 'random_state': 1337}
pos_ss_acc_df_sample = pos_ss_acc_df.sample(**sampling_options)
pos_ss_don_df_sample = pos_ss_don_df.sample(**sampling_options)
neg_ss_acc_df_sample = neg_ss_acc_df.sample(**sampling_options)
neg_ss_don_df_sample = neg_ss_don_df.sample(**sampling_options)

In [None]:
# Destination CSVs for the balanced (equal-sized) samples.
pos_ss_acc_balanced = './dataset/splice-sites/pos_ss_acc_balanced.csv'
pos_ss_don_balanced = './dataset/splice-sites/pos_ss_don_balanced.csv'
neg_ss_acc_balanced = './dataset/splice-sites/neg_ss_acc_balanced.csv'
neg_ss_don_balanced = './dataset/splice-sites/neg_ss_don_balanced.csv'

# Persist each sampled frame without its index column.
for balanced_df, balanced_csv in (
    (pos_ss_acc_df_sample, pos_ss_acc_balanced),
    (pos_ss_don_df_sample, pos_ss_don_balanced),
    (neg_ss_acc_df_sample, neg_ss_acc_balanced),
    (neg_ss_don_df_sample, neg_ss_don_balanced),
):
    balanced_df.to_csv(balanced_csv, index=False)

In [None]:
from data_preparation import generate_datasets

# Output directories, one per balanced CSV.
pos_ss_acc_balanced_dir = './dataset/splice-sites/pos_ss_acc_dataset'
pos_ss_don_balanced_dir = './dataset/splice-sites/pos_ss_don_dataset'
neg_ss_acc_balanced_dir = './dataset/splice-sites/neg_ss_acc_dataset'
neg_ss_don_balanced_dir = './dataset/splice-sites/neg_ss_don_dataset'

dirs = [pos_ss_acc_balanced_dir, pos_ss_don_balanced_dir, neg_ss_acc_balanced_dir, neg_ss_don_balanced_dir]
for d in dirs:
    # Create the directory (and any missing parents) only when it is absent.
    # The previous check was inverted: it called mkdir on directories that
    # already existed and never created the missing ones.
    os.makedirs(d, exist_ok=True)

# generate_datasets presumably writes the train/validation/test CSVs into the
# given directory (later cells read them from there) -- confirm against
# data_preparation.
# NOTE(review): these four names hold CSV path strings earlier in the notebook
# and are rebound here to the generate_datasets results; re-running the CSV
# loading cell after this one requires re-running the first cell as well.
pos_ss_acc_dataset = generate_datasets(pos_ss_acc_balanced, pos_ss_acc_balanced_dir)
pos_ss_don_dataset = generate_datasets(pos_ss_don_balanced, pos_ss_don_balanced_dir)
neg_ss_acc_dataset = generate_datasets(neg_ss_acc_balanced, neg_ss_acc_balanced_dir)
neg_ss_don_dataset = generate_datasets(neg_ss_don_balanced, neg_ss_don_balanced_dir)

print(pos_ss_acc_dataset)
print(pos_ss_don_dataset)
print(neg_ss_acc_dataset)
print(neg_ss_don_dataset)

In [None]:
from data_preparation import generate_sample
import pandas as pd

"""
Create sample splice sites training and validation data.
"""
# Directories that hold the per-class train/validation/test CSVs.
pos_ss_acc_balanced_dir = './dataset/splice-sites/pos_ss_acc_dataset'
pos_ss_don_balanced_dir = './dataset/splice-sites/pos_ss_don_dataset'
neg_ss_acc_balanced_dir = './dataset/splice-sites/neg_ss_acc_dataset'
neg_ss_don_balanced_dir = './dataset/splice-sites/neg_ss_don_dataset'

# Number of rows to draw from each split.
n_sample = dict(train=80, validation=10, test=10)
ss_dir = [pos_ss_acc_balanced_dir, pos_ss_don_balanced_dir, neg_ss_acc_balanced_dir, neg_ss_don_balanced_dir]
ss_file = ['train', 'validation', 'test']

# For every directory and split, write a fixed-seed sample next to the source
# CSV as `<split>_sample.csv`.
for dataset_dir in ss_dir:
    for split in ss_file:
        source_csv = '{}/{}.csv'.format(dataset_dir, split)
        sample_csv = '{}/{}_sample.csv'.format(dataset_dir, split)
        (
            pd.read_csv(source_csv)
            .sample(n=n_sample[split], random_state=1337)
            .to_csv(sample_csv, index=False)
        )

In [None]:
# Sequence length is more than 512 characters so it needs to be expanded.
import pandas as pd
pos_ss_acc_balanced_dir = './dataset/splice-sites/pos_ss_acc_dataset'
pos_ss_don_balanced_dir = './dataset/splice-sites/pos_ss_don_dataset'
neg_ss_acc_balanced_dir = './dataset/splice-sites/neg_ss_acc_dataset'
neg_ss_don_balanced_dir = './dataset/splice-sites/neg_ss_don_dataset'

# Not used in this cell; kept so the name stays defined with the same value.
n_sample = {
    'train': 80,
    'validation': 10,
    'test': 10
}
ss_dir = [pos_ss_acc_balanced_dir, pos_ss_don_balanced_dir, neg_ss_acc_balanced_dir, neg_ss_don_balanced_dir]
ss_file = ['train', 'validation', 'test']


def expand_sequence(seq, window=512):
    """Split ``seq`` into every contiguous window of ``window`` characters.

    Args:
        seq: The sequence string to expand.
        window: Window length in characters (512 by default).

    Returns:
        A list with one window per start offset, stepping one character at a
        time (``len(seq) - window + 1`` windows), when ``seq`` is longer than
        ``window``; otherwise a one-element list holding ``seq`` unchanged.
    """
    if len(seq) > window:
        return [seq[start:start + window] for start in range(len(seq) - window + 1)]
    return [seq]


for s in ss_dir:
    for f in ss_file:
        fpath = '{}/{}_sample.csv'.format(s, f)
        gpath = '{}/{}_sample_expanded.csv'.format(s, f)

        # Read the input first so an unreadable sample file does not leave a
        # truncated, header-only output behind.
        df = pd.read_csv(fpath)

        # 'w' replaces any output from an earlier run; the context manager
        # closes the handle even when a write fails part-way.
        with open(gpath, 'w') as g:
            g.write('{},{}\n'.format('sequence', 'label'))
            for seq, label in zip(df['sequence'], df['label']):
                # Every window inherits the label of its source sequence.
                for mer in expand_sequence(seq):
                    g.write('{},{}\n'.format(mer, label))