In [1]:
"""
Positive and negative acceptor and donor are available by some previous processes. 
Generate kmer version of those data.
"""
from data_preparation import generate_kmer_csv
from data_dir import ss_pos_acc_hs_csv, ss_pos_don_hs_csv, ss_neg_acc_hs_csv, ss_neg_don_hs_csv, ss_pos_acc_hs_kmer_csv, ss_pos_don_hs_kmer_csv, ss_neg_acc_hs_kmer_csv, ss_neg_don_hs_kmer_csv

file_pairs = [
    (ss_pos_acc_hs_csv, ss_pos_acc_hs_kmer_csv),
    (ss_pos_don_hs_csv, ss_pos_don_hs_kmer_csv),
    (ss_neg_acc_hs_csv, ss_neg_acc_hs_kmer_csv),
    (ss_neg_don_hs_csv, ss_neg_don_hs_kmer_csv)
]

for src, target_dir in file_pairs:
    generate_kmer_csv(src, target_dir)

In [1]:
"""
For each positive and negative data, split into three parts: train, validation, and test.
"""

from data_dir import ss_pos_acc_csv, ss_pos_don_csv, ss_neg_acc_csv, ss_neg_don_csv, ss_pos_acc_dir, ss_pos_don_dir, ss_neg_acc_dir, ss_neg_don_dir
from data_preparation import generate_datasets

print("Generate train, validation, and test data from {}: {}".format(ss_pos_acc_csv, generate_datasets(ss_pos_acc_csv, ss_pos_acc_dir)))
print("Generate train, validation, and test data from {}: {}".format(ss_pos_don_csv, generate_datasets(ss_pos_don_csv, ss_pos_don_dir)))
print("Generate train, validation, and test data from {}: {}".format(ss_pos_acc_csv, generate_datasets(ss_neg_acc_csv, ss_neg_acc_dir)))
print("Generate train, validation, and test data from {}: {}".format(ss_pos_acc_csv, generate_datasets(ss_neg_don_csv, ss_neg_don_dir)))

Generate train, validation, and test data from ./data/splice-sites/splice-deep/pos_ss_acc_hs.csv: ['./data/splice-sites/splice-deep/pos_acc/train.csv', './data/splice-sites/splice-deep/pos_acc/validation.csv', './data/splice-sites/splice-deep/pos_acc/test.csv']
Generate train, validation, and test data from ./data/splice-sites/splice-deep/pos_ss_don_hs.csv: ['./data/splice-sites/splice-deep/pos_don/train.csv', './data/splice-sites/splice-deep/pos_don/validation.csv', './data/splice-sites/splice-deep/pos_don/test.csv']
Generate train, validation, and test data from ./data/splice-sites/splice-deep/pos_ss_acc_hs.csv: ['./data/splice-sites/splice-deep/neg_acc/train.csv', './data/splice-sites/splice-deep/neg_acc/validation.csv', './data/splice-sites/splice-deep/neg_acc/test.csv']
Generate train, validation, and test data from ./data/splice-sites/splice-deep/pos_ss_acc_hs.csv: ['./data/splice-sites/splice-deep/neg_don/train.csv', './data/splice-sites/splice-deep/neg_don/validation.csv', './d

In [1]:
"""
Merge each training data. Store them in original folder. Don't create for dataset folder yet.
"""
from data_dir import ss_pos_acc_dir, ss_pos_don_dir, ss_neg_acc_dir, ss_neg_don_dir, ss_train_csv, ss_validation_csv, ss_test_csv
from data_preparation import merge_csv

print("Generating training data: {}".format(merge_csv(["{}/train.csv".format(a) for a in [ss_pos_acc_dir, ss_pos_don_dir, ss_neg_acc_dir, ss_neg_don_dir]], ss_train_csv)))
print("Generating validation data: {}".format(merge_csv(["{}/validation.csv".format(a) for a in [ss_pos_acc_dir, ss_pos_don_dir, ss_neg_acc_dir, ss_neg_don_dir]], ss_validation_csv)))
print("Generating test data: {}".format(merge_csv(["{}/test.csv".format(a) for a in [ss_pos_acc_dir, ss_pos_don_dir, ss_neg_acc_dir, ss_neg_don_dir]], ss_test_csv)))

Generating training data: True
Generating validation data: True
Generating test data: True


In [2]:
"""
Expand training, validation, and test data if sequence has more than 512 characters.
"""
from data_dir import ss_train_csv, ss_validation_csv, ss_test_csv, dataset_ss_train_csv, dataset_ss_validation_csv, dataset_ss_test_csv
from data_preparation import expand_by_sliding_window

print("Generate expanded training data: {}".format(expand_by_sliding_window(ss_train_csv, dataset_ss_train_csv)))
print("Generate expanded validation data: {}".format(expand_by_sliding_window(ss_validation_csv, dataset_ss_validation_csv)))
print("Generate expanded test data: {}".format(expand_by_sliding_window(ss_test_csv, dataset_ss_test_csv)))


Generate expanded training data: True/splice-deep/train.csv: 797680/797680
Generate expanded validation data: Trueplice-deep/validation.csv: 99710/99710
Generate expanded test data: Trueites/splice-deep/test.csv: 99710/99710


In [1]:
"""
Preparing samples.
"""
from data_dir import ss_pos_acc_dir, ss_pos_don_dir, ss_neg_acc_dir, ss_neg_don_dir
from data_preparation import generate_sample

# First, create samples.
for dir in [ss_pos_acc_dir, ss_pos_don_dir, ss_neg_acc_dir, ss_neg_don_dir]:
    files = [('train.csv', 'train_sample.csv'), ('validation.csv', 'validation_sample.csv'), ('test.csv', 'test_sample.csv')]
    paths = [("{}/{}".format(dir, a[0]), "{}/{}".format(dir, a[1])) for a in files]
    for path in paths:
        generate_sample(path[0], path[1])

Generate sample for ./data/splice-sites/splice-deep/pos_acc/train.csv => ./data/splice-sites/splice-deep/pos_acc/train_sample.csv
Generate sample for ./data/splice-sites/splice-deep/pos_acc/validation.csv => ./data/splice-sites/splice-deep/pos_acc/validation_sample.csv
Generate sample for ./data/splice-sites/splice-deep/pos_acc/test.csv => ./data/splice-sites/splice-deep/pos_acc/test_sample.csv
Generate sample for ./data/splice-sites/splice-deep/pos_don/train.csv => ./data/splice-sites/splice-deep/pos_don/train_sample.csv
Generate sample for ./data/splice-sites/splice-deep/pos_don/validation.csv => ./data/splice-sites/splice-deep/pos_don/validation_sample.csv
Generate sample for ./data/splice-sites/splice-deep/pos_don/test.csv => ./data/splice-sites/splice-deep/pos_don/test_sample.csv
Generate sample for ./data/splice-sites/splice-deep/neg_acc/train.csv => ./data/splice-sites/splice-deep/neg_acc/train_sample.csv
Generate sample for ./data/splice-sites/splice-deep/neg_acc/validation.csv

In [None]:
from data_dir import ss_pos_acc_dir, ss_pos_don_dir, ss_neg_acc_dir, ss_neg_don_dir, sample_ss_validation_csv, sample_ss_train_csv, sample_ss_test_csv
from data_preparation import merge_csv

# Second, merge samples and store in ./sample/splice-sites folder.
file_set = [('train_sample.csv', sample_ss_train_csv) , ('validation_sample.csv', sample_ss_validation_csv), ('test_sample.csv', sample_ss_test_csv)]
for file in file_set:
    dirs = [ss_pos_acc_dir, ss_pos_don_dir, ss_neg_acc_dir, ss_neg_don_dir]
    files = ["{}/{}".format(dir, file[0]) for dir in dirs]
    merge_csv(files, file[1])

In [None]:
"""
FROM HERE AND ON, SCRIPTS ARE NOT WRITTEN IN FORMAL ALGORITHM AND ARE WRITTEN TO RESPONSE CERTAIN SITUATION.
Convert existing splice site train, validation, and test into kmer version.
"""
from data_preparation import generate_kmer_csv
from data_dir import sample_ss_dir

import os

os.rename("{}/train.csv".format(sample_ss_dir), "{}/train_no_kmer.csv".format(sample_ss_dir))
os.rename("{}/validation.csv".format(sample_ss_dir), "{}/validation_no_kmer.csv".format(sample_ss_dir))
os.rename("{}/test.csv".format(sample_ss_dir), "{}/test_no_kmer.csv".format(sample_ss_dir))


In [3]:
from data_preparation import generate_kmer_csv
from data_dir import sample_ss_dir

original_csvs = [
    ("{}/train_no_kmer.csv".format(sample_ss_dir), "{}/train_kmer.csv".format(sample_ss_dir)),
    ("{}/validation_no_kmer.csv".format(sample_ss_dir),"{}/validation_kmer.csv".format(sample_ss_dir)),
    ("{}/test_no_kmer.csv".format(sample_ss_dir), "{}/test_kmer.csv".format(sample_ss_dir))
]
for src_csv, target_csv in original_csvs:
    generate_kmer_csv(src_csv, target_csv)

Generating kmer for <./sample/splice-sites/test_no_kmer.csv>: 40/40 40/40

In [4]:
from data_preparation import expand_by_sliding_window
from data_dir import sample_ss_dir

original_csvs = [
    ("{}/train_kmer.csv".format(sample_ss_dir), "{}/train.csv".format(sample_ss_dir)),
    ("{}/validation_kmer.csv".format(sample_ss_dir),"{}/validation.csv".format(sample_ss_dir)),
    ("{}/test_kmer.csv".format(sample_ss_dir), "{}/test.csv".format(sample_ss_dir))
]
for src_csv, target_csv in original_csvs:
    expand_by_sliding_window(src_csv, target_csv)

Processing source ./sample/splice-sites/test_kmer.csv: 40/40 40/40