In [1]:
"""
Generate tokenized and expanded sample datasets.

Data in the sample or data folders is not expanded (and should not be).
Expansion happens when the data is moved into the dataset folder.
This cell prepares the promoter, ss, and polya samples.
"""
from data_dir import dataset_sample_polya_dir, dataset_sample_prom_dir, dataset_sample_ss_dir, sample_ss_dir, sample_prom_dir, sample_polya_dir
from data_preparation import expand_by_sliding_window

# (source directory, target directory) pairs, processed in this order.
dirs = [(sample_prom_dir, dataset_sample_prom_dir), (sample_ss_dir, dataset_sample_ss_dir), (sample_polya_dir, dataset_sample_polya_dir)]
# Each split carries its own label so validation and test runs are not
# reported as "training" data in the cell output.
splits = [("training", "train.csv"), ("validation", "validation.csv"), ("test", "test.csv")]
for src_dir, target_dir in dirs:
    for split_label, fname in splits:
        src_csv = "{}/{}".format(src_dir, fname)
        target_csv = "{}/{}".format(target_dir, fname)
        # The printed value is whatever expand_by_sliding_window returns.
        print("Generate expanded {} data: {}".format(split_label, expand_by_sliding_window(src_csv, target_csv)))


Generate expanded training data: Truerain.csv: 330/330
Generate expanded training data: Truealidation.csv: 42/42
Generate expanded training data: Trueest.csv: 42/42
Generate expanded training data: Truees/train.csv: 40/40
Generate expanded training data: Truees/validation.csv: 40/40
Generate expanded training data: Truees/test.csv: 40/40
Processing source ./sample/polya/train.csv: 275/404

In [2]:
"""
Generate the tokenized and expanded dataset for the polya sample data only.

This cell exists because the cell above failed to finish due to a python
kernel crash.
"""
from data_dir import (
    dataset_sample_polya_dir,
    dataset_sample_prom_dir,
    dataset_sample_ss_dir,
    sample_ss_dir,
    sample_prom_dir,
    sample_polya_dir,
)
from data_preparation import expand_by_sliding_window

dirs = [(sample_polya_dir, dataset_sample_polya_dir)]
# One (report template, csv file name) entry per split, run in this order.
reports = [
    ("Generate expanded training data: {}", "train.csv"),
    ("Generate expanded validation data: {}", "validation.csv"),
    ("Generate expanded test data: {}", "test.csv"),
]
for src_dir, target_dir in dirs:
    for template, csv_name in reports:
        source_csv = "{}/{}".format(src_dir, csv_name)
        expanded_csv = "{}/{}".format(target_dir, csv_name)
        print(template.format(expand_by_sliding_window(source_csv, expanded_csv)))

Generate expanded training data: Truen.csv: 404/404
Generate expanded validation data: Truetion.csv: 50/50
Generate expanded test data: Truetest.csv: 50/50


In [1]:
"""
Combine the promoter, ss, and polya sample datasets into a single dataset
(per the original note: train.csv, validation.csv, and test.csv).
"""
from data_dir import (
    dataset_sample_prom_dir,
    dataset_sample_ss_dir,
    dataset_sample_polya_dir,
    dataset_sample_dir,
)
from data_preparation import merge_dataset

# The three per-type directories are the inputs; dataset_sample_dir is passed last.
merge_result = merge_dataset(
    dataset_sample_prom_dir,
    dataset_sample_ss_dir,
    dataset_sample_polya_dir,
    dataset_sample_dir,
)
print("Merging dataset sample {}".format(merge_result))

Merging dataset sample True


In [1]:
"""
Merge the negative and positive csv files of the promoter and poly A data.
For every file name, one merged csv is written into the promoter directory
and one into the poly A directory.
"""
from data_dir import epd_neg_tata_kmer_dir, epd_pos_tata_kmer_dir, data_epd_dir
from data_dir import polya_grch38_negative_dir, polya_grch38_positive_dir, polya_grch38_dir

from data_preparation import merge_csv

_files = ["train.csv", 'validation.csv', 'test.csv']
# (report template, negative dir, positive dir, merged-output dir) per data type.
merge_jobs = [
    ("Promoter: Merging <{}> & <{}>: {}", epd_neg_tata_kmer_dir, epd_pos_tata_kmer_dir, data_epd_dir),
    ("Poly A:  Merging <{}> & <{}>: {}", polya_grch38_negative_dir, polya_grch38_positive_dir, polya_grch38_dir),
]
for fname in _files:
    for template, neg_dir, pos_dir, out_dir in merge_jobs:
        neg_file = "{}/{}".format(neg_dir, fname)
        pos_file = "{}/{}".format(pos_dir, fname)
        target_file = "{}/{}".format(out_dir, fname)
        # The negative file comes first in both the message and the merge input.
        print(template.format(neg_file, pos_file, merge_csv([neg_file, pos_file], target_file)))
#endfor


Promoter: Merging <./data/epd/human_non_tata_kmer/train.csv> & <./data/epd/human_tata_kmer/train.csv>: True
Poly A:  Merging <./data/poly-a/grch38/negative/train.csv> & <./data/poly-a/grch38/positive/train.csv>: True
Promoter: Merging <./data/epd/human_non_tata_kmer/validation.csv> & <./data/epd/human_tata_kmer/validation.csv>: True
Poly A:  Merging <./data/poly-a/grch38/negative/validation.csv> & <./data/poly-a/grch38/positive/validation.csv>: True
Promoter: Merging <./data/epd/human_non_tata_kmer/test.csv> & <./data/epd/human_tata_kmer/test.csv>: True
Poly A:  Merging <./data/poly-a/grch38/negative/test.csv> & <./data/poly-a/grch38/positive/test.csv>: True


In [6]:
"""
Expand the tokenized Poly-A csv files from their original directory and
write the expanded files into the dataset directory.
"""
from data_dir import polya_grch38_dir, dataset_full_polya_dir
from data_preparation import expand_by_sliding_window

_files = ['train.csv', 'validation.csv', 'test.csv']
for fname in _files:
    # Same file name on both sides; only the parent directory differs.
    src_csv, target_csv = [
        "{}/{}".format(folder, fname)
        for folder in (polya_grch38_dir, dataset_full_polya_dir)
    ]
    outcome = expand_by_sliding_window(src_csv, target_csv, length=510)
    print("Expanding source {} => {}: {} ".format(src_csv, target_csv, outcome))
#endfor

Processing source ./data/poly-a/grch38/train.csv: 1000/4032

In [None]:
"""
Expand the splice-site csv files and store the results in the dataset folder.
"""
from data_dir import ss_dir, dataset_full_ss_dir
from data_preparation import expand_by_sliding_window

_files = ["train.csv", 'validation.csv', 'test.csv']
# NOTE(review): the output recorded for this cell lists ./dataset/full/polya as
# the target although the code uses dataset_full_ss_dir -- the output looks
# stale; re-run and confirm where the files land.
report = "Expanding {} => {}: {}"
for fname in _files:
    src_csv = "{}/{}".format(ss_dir, fname)
    target_csv = "{}/{}".format(dataset_full_ss_dir, fname)
    outcome = expand_by_sliding_window(src_csv, target_csv, length=510)
    print(report.format(src_csv, target_csv, outcome))


Expanding ./data/splice-sites/splice-deep/train.csv => ./dataset/full/polya/train.csv: True
Expanding ./data/splice-sites/splice-deep/validation.csv => ./dataset/full/polya/validation.csv: True
Expanding ./data/splice-sites/splice-deep/test.csv => ./dataset/full/polya/test.csv: True


In [4]:
"""
Expand the Poly-A csv files named in `_generic_filenames` with the no-pandas
expander and store the expanded files in the dataset folder.
"""
from data_dir import polya_grch38_dir, dataset_full_polya_dir, _generic_filenames
from data_preparation import expand_by_sliding_window_no_pandas

for fname in _generic_filenames:
    # Build the source and target paths from the same file name.
    src_path, target_path = [
        "{}/{}".format(base_dir, fname)
        for base_dir in (polya_grch38_dir, dataset_full_polya_dir)
    ]
    result = expand_by_sliding_window_no_pandas(src_path, target_path, length=510)
    print("Expanding {} => {}: {}".format(src_path, target_path, result))

Expanding ./data/poly-a/grch38/train.csv => ./dataset/full/polya/train.csv: True
Expanding ./data/poly-a/grch38/validation.csv => ./dataset/full/polya/validation.csv: True
Expanding ./data/poly-a/grch38/test.csv => ./dataset/full/polya/test.csv: True


In [1]:
"""
Expand the promoter (epd), poly-A, and splice-site training csv files with
length=510 (per the original note: 510 tokens per sequence) and store each
result at its `dataset_full_*_train_csv` path.
Only the train csv files are handled here.
"""
from data_dir import (
    epd_train_csv,
    polya_grch38_train_csv,
    ss_train_csv,
    dataset_full_prom_train_csv,
    dataset_full_ss_train_csv,
    dataset_full_polya_train_csv,
)
from data_preparation import expand_by_sliding_window_no_pandas

# Sources and targets are listed in matching order: promoter, poly-A, splice sites.
sources = [epd_train_csv, polya_grch38_train_csv, ss_train_csv]
targets = [dataset_full_prom_train_csv, dataset_full_polya_train_csv, dataset_full_ss_train_csv]
_files = list(zip(sources, targets))
for src, target in _files:
    result = expand_by_sliding_window_no_pandas(src, target, length=510)
    print("Expanding {} => {}: {}".format(src, target, result))



Expanding ./data/epd/train.csv => ./dataset/full/promoter/train.csv: True
Expanding ./data/poly-a/grch38/train.csv => ./dataset/full/polya/train.csv: True
Expanding ./data/splice-sites/splice-deep/train.csv => ./dataset/full/splice-sites/train.csv: True


In [2]:
"""
Merge the full promoter, splice-site, and poly A datasets.
Only train.csv is requested through `file_to_merge`.
"""
from data_dir import (
    dataset_full_prom_dir,
    dataset_full_ss_dir,
    dataset_full_polya_dir,
    dataset_full_dir,
)
from data_preparation import merge_dataset

merge_result = merge_dataset(
    dataset_full_prom_dir,
    dataset_full_ss_dir,
    dataset_full_polya_dir,
    dataset_full_dir,
    file_to_merge=['train.csv'],
)
print("Merging all datasets: {}".format(merge_result))

Merging all datasets: True
