In [4]:
"""Convert fasta to sequence with label in CSV format"""

import os
import pandas as pd
path = os.path.join("data", "splice-sites", "splice-deep", "raw")
target_path = os.path.join("data", "splice-sites", "splice-deep")
positives = [
    os.path.join(path, "positive_DNA_seqs_acceptor_hs.fa"),
    os.path.join(path, "positive_DNA_seqs_donor_hs.fa")
]
for p in positives:
    bname = os.path.basename(p)
    tname = os.path.join(target_path, f"{bname.split('.')[0]}.csv")
    if os.path.exists(tname):
        os.remove(tname)
    else:
        os.makedirs(os.path.dirname(tname), exist_ok=True)
    df = pd.DataFrame(columns=["sequence", "label"])
    f = open(p, "r")
    t = open(tname, "x")
    t.write("sequence,label\n")
    for line in f:
        t.write(f"{line.strip()},{1}\n")
    f.close()
    t.close()

import os
import pandas as pd
path = os.path.join("data", "splice-sites", "splice-deep", "raw")
target_path = os.path.join("data", "splice-sites", "splice-deep")

negatives = [
    os.path.join(path, "negative_DNA_seqs_acceptor_hs.fa"),
    os.path.join(path, "negative_DNA_seqs_donor_hs.fa")
]
for p in negatives:
    bname = os.path.basename(p)
    tname = os.path.join(target_path, f"{bname.split('.')[0]}.csv")
    if os.path.exists(tname):
        os.remove(tname)
    else:
        os.makedirs(os.path.dirname(tname), exist_ok=True)
    df = pd.DataFrame(columns=["sequence", "label"])
    f = open(p, "r")
    t = open(tname, "x")
    t.write("sequence,label\n")
    for line in f:
        t.write(f"{line.strip()},{0}\n")
    f.close()
    t.close()

In [1]:
"""Creating 512-char-chunks from single sequence"""

import os
from data_dir import ss_dir
import pandas as pd
from tqdm import tqdm
from data_preparation import str_kmer

positives = [
    os.path.join(ss_dir, "positive_DNA_seqs_acceptor_hs.csv"),
    os.path.join(ss_dir, "positive_DNA_seqs_donor_hs.csv"),
]

positives_chunk = [
    os.path.join(ss_dir, "positive_DNA_seqs_acceptor_hs.expanded.csv"),
    os.path.join(ss_dir, "positive_DNA_seqs_donor_hs.expanded.csv"),
]

for p, pc in zip(positives, positives_chunk):
    bname = os.path.basename(p)
    dest_path = pc
    if os.path.exists(dest_path):
        os.remove(dest_path)
    else:
        os.makedirs(os.path.dirname(dest_path), exist_ok=True)
    dest = open(dest_path, "x")
    dest.write(f"sequence,label\n")
    df = pd.read_csv(p)
    for i, r in tqdm(df.iterrows(), total=df.shape[0], desc=f"{bname}"):
        s = list(r["sequence"])
        label = r["label"]
        ln = len(s)
        delta = ln-512
        s_left  = s[0:ln-delta]
        s_middle = s[int(delta/2):ln-int(delta/2)]
        s_right = s[delta:ln]
        dest.write(f"{''.join(s_left)},{label}\n")
        dest.write(f"{''.join(s_middle)},{label}\n")
        dest.write(f"{''.join(s_right)},{label}\n")
    #endfor
    dest.close()


positive_DNA_seqs_acceptor_hs.csv: 100%|██████████| 248150/248150 [00:27<00:00, 9070.25it/s] 
positive_DNA_seqs_donor_hs.csv: 100%|██████████| 250400/250400 [00:26<00:00, 9438.57it/s] 


In [3]:
"""Creating 512-char-chunks from single sequence"""

negatives = [
    os.path.join(ss_dir, "negative_DNA_seqs_acceptor_hs.csv"),
    os.path.join(ss_dir, "negative_DNA_seqs_donor_hs.csv"),
]

negatives_chunk = [
    os.path.join(ss_dir, "negative_DNA_seqs_acceptor_hs.expanded.csv"),
    os.path.join(ss_dir, "negative_DNA_seqs_donor_hs.expanded.csv"),
]

for p, pc in zip(negatives, negatives_chunk):
    bname = os.path.basename(p)
    dest_path = pc
    if os.path.exists(dest_path):
        os.remove(dest_path)
    else:
        os.makedirs(os.path.dirname(dest_path), exist_ok=True)
    dest = open(dest_path, "x")
    dest.write(f"sequence,label\n")
    df = pd.read_csv(p)
    for i, r in tqdm(df.iterrows(), total=df.shape[0], desc=f"{bname}"):
        s = list(r["sequence"])
        label = r["label"]
        ln = len(s)
        delta = ln-512
        s_left  = s[0:ln-delta]
        s_middle = s[int(delta/2):ln-int(delta/2)]
        s_right = s[delta:ln]
        dest.write(f"{''.join(s_left)},{label}\n")
        dest.write(f"{''.join(s_middle)},{label}\n")
        dest.write(f"{''.join(s_right)},{label}\n")
    #endfor
    dest.close()

negative_DNA_seqs_acceptor_hs.csv: 100%|██████████| 248150/248150 [00:26<00:00, 9281.63it/s]
negative_DNA_seqs_donor_hs.csv: 100%|██████████| 250400/250400 [00:26<00:00, 9465.24it/s] 


In [1]:
"""
Merge positives and negatives
"""
import os
from data_dir import ss_dir
negatives_chunk = [
    os.path.join(ss_dir, "negative_DNA_seqs_acceptor_hs.expanded.csv"),
    os.path.join(ss_dir, "negative_DNA_seqs_donor_hs.expanded.csv"),
]

positives_chunk = [
    os.path.join(ss_dir, "positive_DNA_seqs_acceptor_hs.expanded.csv"),
    os.path.join(ss_dir, "positive_DNA_seqs_donor_hs.expanded.csv"),
]

from data_preparation import merge_csv
positive = os.path.join(ss_dir, "positive_DNA_seqs.csv")
negative = os.path.join(ss_dir, "negative_DNA_seqs.csv")

merge_csv(positives_chunk, positive)
merge_csv(negatives_chunk, negative)


100%|██████████| 2/2 [00:21<00:00, 10.99s/it]
100%|██████████| 2/2 [00:25<00:00, 12.69s/it]


True

In [4]:
"""Create balanced dataset based on promoter size 6943."""
import os
from data_dir import ss_dir
positive = os.path.join(ss_dir, "positive_DNA_seqs.csv")
negative = os.path.join(ss_dir, "negative_DNA_seqs.csv")
positive_balanced = os.path.join(ss_dir, "positive_DNA_seqs.balanced.csv")
negative_balanced = os.path.join(ss_dir, "negative_DNA_seqs.balanced.csv")

import pandas as pd
positive_balanced_df = pd.read_csv(positive).sample(n=6943)
positive_balanced_df.to_csv(positive_balanced, index=False)
negative_balanced_df = pd.read_csv(negative).sample(n=6943)
negative_balanced_df.to_csv(negative_balanced, index=False)


In [5]:
"""Split positive balanced and negative balanced into train and validation"""
from data_preparation import split_csv
from data_dir import ss_dir
positive_balanced = os.path.join(ss_dir, "positive_DNA_seqs.balanced.csv")
positive_balanced_train = os.path.join(ss_dir, "positive_DNA_seqs.balanced.train.csv")
positive_balanced_validation = os.path.join(ss_dir, "positive_DNA_seqs.balanced.validation.csv")

negative_balanced = os.path.join(ss_dir, "negative_DNA_seqs.balanced.csv")
negative_balanced_train = os.path.join(ss_dir, "negative_DNA_seqs.balanced.train.csv")
negative_balanced_validation = os.path.join(ss_dir, "negative_DNA_seqs.balanced.validation.csv")

split_csv(positive_balanced, [0.8, 0.2], [positive_balanced_train, positive_balanced_validation])
split_csv(negative_balanced, [0.8, 0.2], [negative_balanced_train, negative_balanced_validation])

In [6]:
"""
Merge training and validation
"""
from data_preparation import merge_csv
from data_dir import ss_dir
positive_balanced_train = os.path.join(ss_dir, "positive_DNA_seqs.balanced.train.csv")
positive_balanced_validation = os.path.join(ss_dir, "positive_DNA_seqs.balanced.validation.csv")
negative_balanced_train = os.path.join(ss_dir, "negative_DNA_seqs.balanced.train.csv")
negative_balanced_validation = os.path.join(ss_dir, "negative_DNA_seqs.balanced.validation.csv")
ss_train_balanced = os.path.join(ss_dir, "ss_train.balanced.csv")
ss_validation_balanced = os.path.join(ss_dir, "ss_validation.balanced.csv")

merge_csv([positive_balanced_train, negative_balanced_train], ss_train_balanced)
merge_csv([positive_balanced_validation, negative_balanced_validation], ss_validation_balanced)

100%|██████████| 2/2 [00:00<00:00, 13.70it/s]
100%|██████████| 2/2 [00:00<00:00, 43.48it/s]


True

In [8]:
"""
Generate kmer format.
"""
import os
from data_dir import ss_dir
positives = [
    os.path.join(ss_dir, "positive_DNA_seqs_acceptor_hs.expanded.csv"),
    os.path.join(ss_dir, "positive_DNA_seqs_donor_hs.expanded.csv"),
]
negatives = [
    os.path.join(ss_dir, "negative_DNA_seqs_acceptor_hs.expanded.csv"),
    os.path.join(ss_dir, "negative_DNA_seqs_donor_hs.expanded.csv"),
]

from data_preparation import generate_kmer_csv
for p in positives:
    bname = os.path.basename(p)
    target_path = os.path.join(ss_dir, f"{bname.split('.')[0]}.expanded.kmer.csv")
    generate_kmer_csv(p, target_path)
for n in negatives:
    bname = os.path.basename(n)
    target_path = os.path.join(ss_dir, f"{bname.split('.')[0]}.expanded.kmer.csv")
    generate_kmer_csv(n, target_path)

Generating kmer for <data\splice-sites/splice-deep\negative_DNA_seqs_donor_hs.expanded.csv>: 751200/751200                                                         

In [5]:
"""
Split into training and validation data.
"""
import os
from data_dir import ss_dir
positives = [
    os.path.join(ss_dir, "positive_DNA_seqs_acceptor_hs.expanded.csv"),
    os.path.join(ss_dir, "positive_DNA_seqs_donor_hs.expanded.csv"),
]

negatives = [
    os.path.join(ss_dir, "negative_DNA_seqs_acceptor_hs.expanded.csv"),
    os.path.join(ss_dir, "negative_DNA_seqs_donor_hs.expanded.csv"),
]

positives_negatives = [positives, negatives]

for pn in positives_negatives:
    for p in pn:
        df = pd.read_csv(p)
        train_df = df.sample(frac=0.8)
        validation_df = df.drop(train_df.index)
        train_df.to_csv(
            os.path.join(ss_dir, f"{os.path.basename(p).split('.')[0]}.train.csv"),
            index=False
        )
        validation_df.to_csv(
            os.path.join(ss_dir, f"{os.path.basename(p).split('.')[0]}.validation.csv"),
            index=False
        )

In [1]:
"""
Merge training and validation.
"""
from data_preparation import merge_csv
from data_dir import ss_dir
import os
trains = [
    os.path.join(ss_dir, "positive_DNA_seqs_acceptor_hs.train.csv"),
    os.path.join(ss_dir, "negative_DNA_seqs_acceptor_hs.train.csv")
]
validations = [
    os.path.join(ss_dir, "positive_DNA_seqs_acceptor_hs.validation.csv"),
    os.path.join(ss_dir, "negative_DNA_seqs_acceptor_hs.validation.csv")
]
print(f'Merging training data: {merge_csv(trains, os.path.join(ss_dir, "ss_train.csv"))}')
print(f'Merging validation data: {merge_csv(validations, os.path.join(ss_dir, "ss_validation.csv"))}')

100%|██████████| 2/2 [00:10<00:00,  5.39s/it]


Merging training data: True


100%|██████████| 2/2 [00:02<00:00,  1.20s/it]

Merging validation data: True





In [1]:
"""
Positive and negative acceptor and donor are available by some previous processes. 
Generate kmer version of those data.
"""
from data_preparation import generate_kmer_csv
from data_dir import ss_pos_acc_hs_csv, ss_pos_don_hs_csv, ss_neg_acc_hs_csv, ss_neg_don_hs_csv, ss_pos_acc_hs_kmer_csv, ss_pos_don_hs_kmer_csv, ss_neg_acc_hs_kmer_csv, ss_neg_don_hs_kmer_csv

file_pairs = [
    (ss_pos_acc_hs_csv, ss_pos_acc_hs_kmer_csv),
    (ss_pos_don_hs_csv, ss_pos_don_hs_kmer_csv),
    (ss_neg_acc_hs_csv, ss_neg_acc_hs_kmer_csv),
    (ss_neg_don_hs_csv, ss_neg_don_hs_kmer_csv)
]

for src, target_dir in file_pairs:
    generate_kmer_csv(src, target_dir)

Error File workspace\ss\pos_ss_acc_hs.csv not found.
Error File workspace\ss\pos_ss_don_hs.csv not found.
Error File workspace\ss\neg_ss_acc_hs.csv not found.
Error File workspace\ss\neg_ss_don_hs.csv not found.


In [3]:
"""
Generate kmer version of splice site data.
"""
from data_dir import ss_dir
from data_preparation import generate_kmer_csv
src_train = "{}/train_no_kmer.csv".format(ss_dir)
src_validation = "{}/validation_no_kmer.csv".format(ss_dir)
src_test = "{}/test_no_kmer.csv".format(ss_dir)

target_train = "{}/train.csv".format(ss_dir)
target_validation = "{}/validation.csv".format(ss_dir)
target_test = "{}/test.csv".format(ss_dir)

_pairs = [(src_validation, target_validation), (src_test, target_test)]
#for src, target in _pairs:
#    print("Generate kmer csv for {} => {}: {}".format(src, target, generate_kmer_csv(src, target)))

In [4]:
_pairs = [(src_validation, target_validation), (src_test, target_test)]
for src, target in _pairs:
    print("Generate kmer csv for {} => {}: {}".format(src, target, generate_kmer_csv(src, target)))

Generate kmer csv for ./data/splice-sites/splice-deep/validation_no_kmer.csv => ./data/splice-sites/splice-deep/validation.csv: True
Generate kmer csv for ./data/splice-sites/splice-deep/test_no_kmer.csv => ./data/splice-sites/splice-deep/test.csv: True


In [1]:
"""
Expand Splice-sites and store the result into dataset folder.
"""
from data_dir import ss_dir, dataset_full_ss_dir
from data_preparation import expand_by_sliding_window_no_pandas
_files = ["train.csv", 'validation.csv', 'test.csv']
for fname in _files:
    src_csv = "{}/{}".format(ss_dir, fname)
    target_csv = "{}/{}".format(dataset_full_ss_dir, fname)
    print("Expanding {} => {}: {}".format(src_csv, target_csv, expand_by_sliding_window_no_pandas(src_csv, target_csv, length=510)))

Error [Errno 28] No space left on device
Error Traceback (most recent call last):
  File "w:\Research\_sequence-processing\data_preparation.py", line 1034, in expand_by_sliding_window_no_pandas
    if _count < _len:
OSError: [Errno 28] No space left on device

Expanding ./data/splice-sites/splice-deep/train.csv => ./dataset/full/splice-sites/train.csv: False
Error [Errno 28] No space left on device
Error Traceback (most recent call last):
  File "w:\Research\_sequence-processing\data_preparation.py", line 1034, in expand_by_sliding_window_no_pandas
    if _count < _len:
OSError: [Errno 28] No space left on device

Expanding ./data/splice-sites/splice-deep/validation.csv => ./dataset/full/splice-sites/validation.csv: False
Error [Errno 28] No space left on device
Error Traceback (most recent call last):
  File "w:\Research\_sequence-processing\data_preparation.py", line 1034, in expand_by_sliding_window_no_pandas
    if _count < _len:
OSError: [Errno 28] No space left on device

Expandi

In [2]:
"""
Checking the size of dataset. It appears that generating expanded sequence resulted in too-large file.
"""
from data_dir import ss_neg_don_hs_csv, ss_neg_acc_hs_csv, ss_pos_acc_hs_csv, ss_pos_don_hs_csv
import pandas as pd

ss_neg_acc_df = pd.read_csv(ss_neg_acc_hs_csv)
ss_neg_don_df = pd.read_csv(ss_neg_don_hs_csv)
ss_pos_acc_df = pd.read_csv(ss_pos_acc_hs_csv)
ss_pos_don_df = pd.read_csv(ss_pos_don_hs_csv)

print("ss pos acc {}".format(len(ss_pos_acc_df)))
print("ss pos don {}".format(len(ss_pos_don_df)))
print("ss neg acc {}".format(len(ss_neg_acc_df)))
print("ss neg don {}".format(len(ss_neg_don_df)))

ss pos acc 248150
ss pos don 250400
ss neg acc 248150
ss neg don 250400


In [6]:
"""
Both positive and negative splice site data is too large. This code is for sampling.
Sampling is done because local storage limitation.
"""
from data_preparation import generate_sample
from data_dir import (
    ss_neg_acc_hs_non_kmer_csv, ss_neg_don_hs_non_kmer_csv, ss_pos_acc_hs_non_kmer_csv, ss_pos_don_hs_non_kmer_csv,
    ss_neg_acc_hs_csv, ss_neg_don_hs_csv, ss_pos_acc_hs_csv, ss_pos_don_hs_csv,
    ss_dir
)
_src = [ss_neg_acc_hs_csv, ss_neg_don_hs_csv, ss_pos_acc_hs_csv, ss_pos_don_hs_csv]
_target = [ss_neg_acc_hs_non_kmer_csv, ss_neg_don_hs_non_kmer_csv, ss_pos_acc_hs_non_kmer_csv, ss_pos_don_hs_non_kmer_csv]
# _target_500 = ["{}/{}".format(ss_dir, fname) for fname in ["ss_neg_acc_hs.500.csv", "ss_neg_don_hs.500.csv", "ss_pos_acc_hs.500.csv", "ss_pos_don_hs.500.csv"]]
_target_1300 = ["{}/{}".format(ss_dir, fname) for fname in ["ss_neg_acc_hs.1300.csv", "ss_neg_don_hs.1300.csv", "ss_pos_acc_hs.1300.csv", "ss_pos_don_hs.1300.csv"]]
_target_2000 = ["{}/{}".format(ss_dir, fname) for fname in ["ss_neg_acc_hs.2000.csv", "ss_neg_don_hs.2000.csv", "ss_pos_acc_hs.2000.csv", "ss_pos_don_hs.2000.csv"]]
_target_3000 = ["{}/{}".format(ss_dir, fname) for fname in ["ss_neg_acc_hs.3000.csv", "ss_neg_don_hs.3000.csv", "ss_pos_acc_hs.3000.csv", "ss_pos_don_hs.3000.csv"]]

#for src, target in zip(_src, _target_500):
#    print("Generate sample for {} => {}: {}".format(src, target, generate_sample(src, target, n_sample=515)))

for src, target in zip(_src, _target_1300):
    print("Generate sample for {} => {}: {}".format(src, target, generate_sample(src, target, n_sample=1300)))

for src, target in zip(_src, _target_2000):
    print("Generate sample for {} => {}: {}".format(src, target, generate_sample(src, target, n_sample=2000)))

for src, target in zip(_src, _target_3000):
    print("Generate sample for {} => {}: {}".format(src, target, generate_sample(src, target, n_sample=3000)))


#_pairs = [(ss_neg_acc_hs_csv, ss_neg_acc_hs_non_kmer_csv), (ss_neg_don_hs_csv, ss_neg_don_hs_non_kmer_csv), (ss_pos_acc_hs_csv, ss_pos_acc_hs_non_kmer_csv), (ss_pos_don_hs_csv, ss_pos_don_hs_non_kmer_csv)]
#for src, target in _pairs:
#    print("Generate sample for {} => {}: {}".format(src, target, generate_sample(src, target, n_sample=500)))

Generate sample for ./data/splice-sites/splice-deep/neg_ss_acc_hs.csv => ./data/splice-sites/splice-deep/ss_neg_acc_hs.1300.csv: ./data/splice-sites/splice-deep/ss_neg_acc_hs.1300.csv
Generate sample for ./data/splice-sites/splice-deep/neg_ss_don_hs.csv => ./data/splice-sites/splice-deep/ss_neg_don_hs.1300.csv: ./data/splice-sites/splice-deep/ss_neg_don_hs.1300.csv
Generate sample for ./data/splice-sites/splice-deep/pos_ss_acc_hs.csv => ./data/splice-sites/splice-deep/ss_pos_acc_hs.1300.csv: ./data/splice-sites/splice-deep/ss_pos_acc_hs.1300.csv
Generate sample for ./data/splice-sites/splice-deep/pos_ss_don_hs.csv => ./data/splice-sites/splice-deep/ss_pos_don_hs.1300.csv: ./data/splice-sites/splice-deep/ss_pos_don_hs.1300.csv
Generate sample for ./data/splice-sites/splice-deep/neg_ss_acc_hs.csv => ./data/splice-sites/splice-deep/ss_neg_acc_hs.2000.csv: ./data/splice-sites/splice-deep/ss_neg_acc_hs.2000.csv
Generate sample for ./data/splice-sites/splice-deep/neg_ss_don_hs.csv => ./data/

In [7]:
"""
Generate kmer version for splice sites data for positive and negative acceptor and donor.
"""
from data_preparation import generate_kmer_csv
from data_dir import (
    ss_neg_acc_hs_non_kmer_csv, ss_neg_don_hs_non_kmer_csv, ss_pos_acc_hs_non_kmer_csv, ss_pos_don_hs_non_kmer_csv,
    ss_neg_acc_hs_kmer_csv, ss_neg_don_hs_kmer_csv, ss_pos_acc_hs_kmer_csv, ss_pos_don_hs_kmer_csv,
)

#_src_500 = ["{}/{}".format(ss_dir, fname) for fname in ["ss_neg_acc_hs.500.csv", "ss_neg_don_hs.500.csv", "ss_pos_acc_hs.500.csv", "ss_pos_don_hs.500.csv"]]
#_target_500 = ["{}/{}".format(ss_dir, fname) for fname in ["ss_neg_acc_hs.500.kmer.csv", "ss_neg_don_hs.500.kmer.csv", "ss_pos_acc_hs.500.kmer.csv", "ss_pos_don_hs.500.kmer.csv"]]
_src_1300 = ["{}/{}".format(ss_dir, fname) for fname in ["ss_neg_acc_hs.1300.csv", "ss_neg_don_hs.1300.csv", "ss_pos_acc_hs.1300.csv", "ss_pos_don_hs.1300.csv"]]
_target_1300 = ["{}/{}".format(ss_dir, fname) for fname in ["ss_neg_acc_hs.1300.kmer.csv", "ss_neg_don_hs.1300.kmer.csv", "ss_pos_acc_hs.1300.kmer.csv", "ss_pos_don_hs.1300.kmer.csv"]]
_src_2000 = ["{}/{}".format(ss_dir, fname) for fname in ["ss_neg_acc_hs.2000.csv", "ss_neg_don_hs.2000.csv", "ss_pos_acc_hs.2000.csv", "ss_pos_don_hs.2000.csv"]]
_target_2000 = ["{}/{}".format(ss_dir, fname) for fname in ["ss_neg_acc_hs.2000.kmer.csv", "ss_neg_don_hs.2000.kmer.csv", "ss_pos_acc_hs.2000.kmer.csv", "ss_pos_don_hs.2000.kmer.csv"]]
_src_3000 = ["{}/{}".format(ss_dir, fname) for fname in ["ss_neg_acc_hs.3000.csv", "ss_neg_don_hs.3000.csv", "ss_pos_acc_hs.3000.csv", "ss_pos_don_hs.3000.csv"]]
_target_3000 = ["{}/{}".format(ss_dir, fname) for fname in ["ss_neg_acc_hs.3000.kmer.csv", "ss_neg_don_hs.3000.kmer.csv", "ss_pos_acc_hs.3000.kmer.csv", "ss_pos_don_hs.3000.kmer.csv"]]

for zip_set in [zip(_src_1300, _target_1300), zip(_src_2000, _target_2000), zip(_src_3000, _target_3000)]:
    for src, target in zip_set:
        print("Generate kmer csv for {} => {}: {}".format(src, target, generate_kmer_csv(src, target)))



#_pairs = [(ss_neg_acc_hs_non_kmer_csv, ss_neg_acc_hs_kmer_csv), (ss_neg_don_hs_non_kmer_csv, ss_neg_don_hs_kmer_csv), (ss_pos_acc_hs_non_kmer_csv, ss_pos_acc_hs_kmer_csv), (ss_pos_don_hs_non_kmer_csv, ss_pos_don_hs_kmer_csv)]
#for src, target in _pairs:
#    print("Generate kmer csv for {} => {}: {}".format(src, target, generate_kmer_csv(src, target)))

Generate kmer csv for ./data/splice-sites/splice-deep/ss_neg_acc_hs.1300.csv => ./data/splice-sites/splice-deep/ss_neg_acc_hs.1300.kmer.csv: True
Generate kmer csv for ./data/splice-sites/splice-deep/ss_neg_don_hs.1300.csv => ./data/splice-sites/splice-deep/ss_neg_don_hs.1300.kmer.csv: True
Generate kmer csv for ./data/splice-sites/splice-deep/ss_pos_acc_hs.1300.csv => ./data/splice-sites/splice-deep/ss_pos_acc_hs.1300.kmer.csv: True
Generate kmer csv for ./data/splice-sites/splice-deep/ss_pos_don_hs.1300.csv => ./data/splice-sites/splice-deep/ss_pos_don_hs.1300.kmer.csv: True
Generate kmer csv for ./data/splice-sites/splice-deep/ss_neg_acc_hs.2000.csv => ./data/splice-sites/splice-deep/ss_neg_acc_hs.2000.kmer.csv: True
Generate kmer csv for ./data/splice-sites/splice-deep/ss_neg_don_hs.2000.csv => ./data/splice-sites/splice-deep/ss_neg_don_hs.2000.kmer.csv: True
Generate kmer csv for ./data/splice-sites/splice-deep/ss_pos_acc_hs.2000.csv => ./data/splice-sites/splice-deep/ss_pos_acc_h

In [8]:
"""
Split each file into train.csv and validation.csv
"""
from data_preparation import split_and_store_csv
from data_dir import ss_dir
_sizes = [1300, 2000, 3000]
_fraction = [0.9, 0.1]
for _s in _sizes:
    pos_acc = '{}/ss_pos_acc_hs.{}.kmer.csv'.format(ss_dir, _s)
    pos_acc_train = '{}/train.ss_pos_acc_hs.{}.kmer.csv'.format(ss_dir, _s)
    pos_acc_validation = '{}/validation.ss_pos_acc_hs.{}.kmer.csv'.format(ss_dir, _s)
    split_and_store_csv(pos_acc, _fraction, [pos_acc_train, pos_acc_validation])

    pos_don = '{}/ss_pos_don_hs.{}.kmer.csv'.format(ss_dir, _s)
    pos_don_train = '{}/train.ss_pos_don_hs.{}.kmer.csv'.format(ss_dir, _s)
    pos_don_validation = '{}/validation.ss_pos_don_hs.{}.kmer.csv'.format(ss_dir, _s)
    split_and_store_csv(pos_don, _fraction, [pos_don_train, pos_don_validation])
    
    neg_acc = '{}/ss_neg_acc_hs.{}.kmer.csv'.format(ss_dir, _s)
    neg_acc_train = '{}/train.ss_neg_acc_hs.{}.kmer.csv'.format(ss_dir, _s)
    neg_acc_validation = '{}/validation.ss_neg_acc_hs.{}.kmer.csv'.format(ss_dir, _s)
    split_and_store_csv(neg_acc, _fraction, [neg_acc_train, neg_acc_validation])

    neg_don = '{}/ss_neg_don_hs.{}.kmer.csv'.format(ss_dir, _s)
    neg_don_train = '{}/train.ss_neg_don_hs.{}.kmer.csv'.format(ss_dir, _s)
    neg_don_validation = '{}/validation.ss_neg_don_hs.{}.kmer.csv'.format(ss_dir, _s)
    split_and_store_csv(neg_don, _fraction, [neg_don_train, neg_don_validation])



Splitting and storing split to ./data/splice-sites/splice-deep/train.ss_pos_acc_hs.1300.kmer.csv
Splitting and storing split to ./data/splice-sites/splice-deep/validation.ss_pos_acc_hs.1300.kmer.csv
Splitting and storing split to ./data/splice-sites/splice-deep/train.ss_pos_don_hs.1300.kmer.csv
Splitting and storing split to ./data/splice-sites/splice-deep/validation.ss_pos_don_hs.1300.kmer.csv
Splitting and storing split to ./data/splice-sites/splice-deep/train.ss_neg_acc_hs.1300.kmer.csv
Splitting and storing split to ./data/splice-sites/splice-deep/validation.ss_neg_acc_hs.1300.kmer.csv
Splitting and storing split to ./data/splice-sites/splice-deep/train.ss_neg_don_hs.1300.kmer.csv
Splitting and storing split to ./data/splice-sites/splice-deep/validation.ss_neg_don_hs.1300.kmer.csv
Splitting and storing split to ./data/splice-sites/splice-deep/train.ss_pos_acc_hs.2000.kmer.csv
Splitting and storing split to ./data/splice-sites/splice-deep/validation.ss_pos_acc_hs.2000.kmer.csv
Split

In [10]:
"""
Merge all positive acceptor and donor, and negative acceptor and donor into single file for training and single file for validation
"""
from data_dir import ss_neg_acc_hs_kmer_csv, ss_neg_don_hs_kmer_csv, ss_pos_acc_hs_kmer_csv, ss_pos_don_hs_kmer_csv, ss_dir
from data_dir import ss_train_csv
from data_preparation import merge_csv

sizes = [1300, 3000]
for _type in ['train', 'validation']:
    for _size in sizes:
        ss_pos_acc_hs = '{}/{}.ss_pos_acc_hs.{}.kmer.csv'.format(ss_dir, _type, _size)
        ss_pos_don_hs = '{}/{}.ss_pos_don_hs.{}.kmer.csv'.format(ss_dir, _type, _size)
        ss_neg_acc_hs = '{}/{}.ss_neg_acc_hs.{}.kmer.csv'.format(ss_dir, _type, _size)
        ss_neg_don_hs = '{}/{}.ss_neg_don_hs.{}.kmer.csv'.format(ss_dir, _type, _size)
        _files = [ss_pos_acc_hs, ss_pos_don_hs, ss_neg_acc_hs, ss_neg_don_hs]
        _train_file = '{}/{}.{}.kmer.csv'.format(ss_dir, _type, _size)
        print("Merging {} into {}: {}".format(_files, _train_file, merge_csv(_files, _train_file)))

#files = [ss_neg_acc_hs_kmer_csv, ss_neg_don_hs_kmer_csv, ss_pos_acc_hs_kmer_csv, ss_pos_don_hs_kmer_csv]
#print("Merging {} into {}: {}".format(files, ss_train_csv, merge_csv(files, ss_train_csv)))

Merging ['./data/splice-sites/splice-deep/train.ss_pos_acc_hs.1300.kmer.csv', './data/splice-sites/splice-deep/train.ss_pos_don_hs.1300.kmer.csv', './data/splice-sites/splice-deep/train.ss_neg_acc_hs.1300.kmer.csv', './data/splice-sites/splice-deep/train.ss_neg_don_hs.1300.kmer.csv'] into ./data/splice-sites/splice-deep/train.1300.kmer.csv: True
Merging ['./data/splice-sites/splice-deep/train.ss_pos_acc_hs.3000.kmer.csv', './data/splice-sites/splice-deep/train.ss_pos_don_hs.3000.kmer.csv', './data/splice-sites/splice-deep/train.ss_neg_acc_hs.3000.kmer.csv', './data/splice-sites/splice-deep/train.ss_neg_don_hs.3000.kmer.csv'] into ./data/splice-sites/splice-deep/train.3000.kmer.csv: True
Merging ['./data/splice-sites/splice-deep/validation.ss_pos_acc_hs.1300.kmer.csv', './data/splice-sites/splice-deep/validation.ss_pos_don_hs.1300.kmer.csv', './data/splice-sites/splice-deep/validation.ss_neg_acc_hs.1300.kmer.csv', './data/splice-sites/splice-deep/validation.ss_neg_don_hs.1300.kmer.csv']

In [5]:
from data_dir import ss_dir
from data_preparation import expand_by_sliding_window_no_pandas
"""
Expand training data (train.csv) and store the expanded file in dataset full ss folder.
"""
sizes = [1300, 3000]
types = ['train', 'validation']
for _type in types:
    for _size in sizes:
        _source = '{}/{}.{}.kmer.csv'.format(ss_dir, _type, _size)
        _target = '{}/{}.{}.kmer.expanded.csv'.format(ss_dir, _type, _size)
        print("Expanding {} => {}: {}".format(_source,_target,expand_by_sliding_window_no_pandas(_source,_target, length=510)))

Error File ./data/splice-sites/splice-deep/train.1250.kmer.csv not found.
Error Traceback (most recent call last):
  File "w:\Research\sequence-processing\data_preparation.py", line 1175, in expand_by_sliding_window_no_pandas
    print("Expanding {} [{}/{}]".format(src_csv, _count, _len_src), end='\r')
Exception: File ./data/splice-sites/splice-deep/train.1250.kmer.csv not found.

Expanding ./data/splice-sites/splice-deep/train.1250.kmer.csv => ./data/splice-sites/splice-deep/train.1250.kmer.expanded.csv: False
Expanding ./data/splice-sites/splice-deep/train.2000.kmer.csv [5648/7201]

KeyboardInterrupt: 

In [4]:
from data_preparation import kmer
import os
from tqdm import tqdm

def _prep_ss_csv(file_path, target_csv_dir, label):
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File {file_path} not found. Please provide source.")
    f = open(file_path, 'r')
    _columns = ['index', 'sequence', 'label']
    _columns = ','.join(_columns)
    src_name = os.path.basename(file_path)
    if not os.path.exists(target_csv_dir):
        os.makedirs(target_csv_dir, exist_ok=True)
    line_number = 0
    for line in tqdm(f, total=248150):
        # A line always has 602 characters so break it into 512 chunks.
        chunks = kmer(line, 512)
        line_number += 1
        # Each chunk is saved into different files.
        for i in range(len(chunks)):
            _tname = src_name.split('.')[0]
            tname = "{}.{}.csv".format(_tname, i)
            tpath = os.path.join(target_csv_dir, tname)
            if not os.path.exists(tpath):
                t = open(tpath, 'x')
                t.write(f"{_columns}\n")
                t.close()
            else:
                t = open(tpath, 'a')
                t.write(f"{line_number},{chunks[i]},{label}\n")
                t.close()

target_csv_dir = os.path.join('workspace', 'splice-sites', 'pos_acc_hs')
from data_dir import ss_pos_acc_hs_fasta
_prep_ss_csv(ss_pos_acc_hs_fasta, target_csv_dir, 1)

  1%|▏         | 3108/248150 [02:23<3:09:05, 21.60it/s]


KeyboardInterrupt: 