In [None]:
# B.1.1.7 (Alpha), 
ALPHA_FASTA_PATH="data/raw/complete-nucl-sars_cov_2-B.1.1.7-human_origin.fasta"
# B.1.351 (Beta), 
BETA_FASTA_PATH="data/raw/complete-nucl-sars_cov_2-B.1.351-human_origin.fasta"
# B.1.617.2 (Delta), 
DELTA_FASTA_PATH="data/raw/complete-nucl-sars_cov_2-B.1.617.2-human_origin.fasta"
# and P.1 (Gamma)

ALPHA_CLASS = 1
BETA_CLASS = 2
DELTA_CLASS = 3

K_MER_6 = 6
K_MER_5 = 5
K_MER_4 = 4
K_MER_3 = 3
K_MERS = [K_MER_3, K_MER_4, K_MER_5, K_MER_6]
N_K_MER = 100 # How many k-mers are retrieved per sequence.
N_SAMPLES = 100 # How many sequences are retrieved.

"""
len ALPHA 169317
len BETA 477
len DELTA 3848
"""

PREFIX = 'sarscov2'
DEST_DIR = 'data'
DEST_DIR_ALPHA = DEST_DIR + '/alpha'
DEST_DIR_BETA = DEST_DIR + '/beta'
DEST_DIR_DELTA = DEST_DIR + '/delta'

DATA_DIR = "data"
DROSOPHILA_DIR = "{}/drosophila/ncbi_dataset/data".format(DATA_DIR)
ECOLI_DIR = "{}/ecoli/ncbi_dataset/data".format(DATA_DIR)
EULEMUR_DIR = "{}/eulemur/ncbi_dataset/data".format(DATA_DIR)
PAPIO_DIR = "{}/papio/ncbi_dataset/data".format(DATA_DIR)
SARSCOV2_DIR = "{}/sarscov2".format(DATA_DIR)

In [None]:
#  Data Preprocessing
import Bio
from utils import create_k_mer
from files import merge_files, generate_sample_fine_tuning_file

# Generate K-Mer from sequence
s = "GTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACT"
print(create_k_mer(s, 3, -1))

In [None]:
# Generate Raw K-Mer from Fasta files.
# generate_sample_fine_tuning_file(ALPHA_FASTA_PATH, ALPHA_CLASS, 'raw_alpha_'+str(K_MER)+'.txt', -1, K_MER, -1)
generate_sample_fine_tuning_file(BETA_FASTA_PATH, BETA_CLASS, 'raw_beta_'+str(K_MER)+'.txt', -1, K_MER, -1)
generate_sample_fine_tuning_file(DELTA_FASTA_PATH, DELTA_CLASS, 'raw_delta_'+str(K_MER)+'.txt', -1, K_MER, -1)

In [None]:
# Splitting the sequence collection into files based on its sequence id in fasta.
# Filename = prefix-sequence_id_from_fasta-class-k_mer.txt
# @param fasta_file : Fasta file as source.
# @param prefix : Filename prefix.
# @param class_name : The class for this fasta file in number (0, 1, 2, etc.)
# @k_mer_size : Size of k-mer
# @dest_dir : Intended file directory.
def generate_sequence_file(fasta_file, prefix, class_name, k_mer_size, dest_dir):
    records = list(SeqIO.parse(fasta_file, 'fasta'))
    for record in records:
        if not (os.path.exists(dest_dir)):
            # os.mkdir(dest_dir)
            import pathlib
            pathlib.Path(dest_dir).mkdir(parents=True, exist_ok=True)
        output_file_name = dest_dir + '/' + prefix + '-' + record.id + str(class_name) + str(k_mer_size) + '.txt'
        if (os.path.exists(output_file_name)):
            os.remove(output_file_name)
        output_file = open(output_file_name, 'w+')
        seq = create_k_mer(str(record.seq), k_mer_size, -1)
        output_file.write(seq + '\t' + str(class_name))
        output_file.close()

In [None]:
# Generate individual sequence.
# generate_sequence_file(ALPHA_FASTA_PATH, PREFIX, ALPHA_CLASS, -1, DEST_DIR) # don't do it. source file is too big.
generate_sequence_file(BETA_FASTA_PATH, PREFIX, BETA_CLASS, -1, DEST_DIR_BETA)
generate_sequence_file(DELTA_FASTA_PATH, PREFIX, DELTA_CLASS, -1, DEST_DIR_DELTA)

In [11]:
from Bio import SeqIO
from utils import create_k_mer
import os

# Generate data for predictions.
# @param fasta_file : Path to fasta file.
# @param output_file_path : What and where the prediction file is named and stored. 
#                           If file path exists, existing file will be removed.
# @seq_index : From where sequence is read.
# @n_samples : How many sequences are used to create prediction file.
# @k_mer : Size of k-mer.
# @n_k_mer : How many kmers are written to file for each sequence in fasta file.
# @label : Label of this prediction.
def generate_data_to_predict(fasta_file, output_file_path, seq_index, n_samples, k_mer, n_k_mer, label):
    records = list(SeqIO.parse(fasta_file, 'fasta'))
    print('reading source file {}'.format(fasta_file))
    if (len(records)) > n_samples:
        records = records[seq_index:n_samples]
    
    print('writing {} records at target file {}'.format(len(records), output_file_path))
    if (os.path.exists(output_file_path)):
        print("File exists at {} -> removing existing".format(output_file_path))
        os.remove(output_file_path)
    
    output_file = open(output_file_path, 'w+')
    output_file.write('sequence' + '\t' + 'label' + '\n')
    for r in records:
        output_file.write(create_k_mer(str(r.seq), k_mer, n_k_mer) + '\t' + str(label) + '\n')
    output_file.close()

In [15]:
# Dataset COVID19
# Generate data for fine tuning
ALPHA_PATH = "data/sarscov2/raw/complete-nucl-sars_cov_2-B.1.1.7-human_origin.fasta"
BETA_PATH = "data/sarscov2/raw/complete-nucl-sars_cov_2-B.1.351-human_origin.fasta"
DELTA_PATH = "data/sarscov2/raw/complete-nucl-sars_cov_2-B.1.617.2-human_origin.fasta"
K_MERS = [3,4,5,6]
N_K_MER = 100 # How many k-mers are retrieved per sequence.
N_SAMPLES = 100 # How many sequences are retrieved.

for k in K_MERS:
    # Generate fine tuning data
    generate_data_to_predict(ALPHA_PATH, 'data/sarscov2/ft/train-alpha-{}.txt'.format(k), 0, 4*N_SAMPLES, k, N_K_MER, 0)
    generate_data_to_predict(BETA_PATH, 'data/sarscov2/ft/train-beta-{}.txt'.format(k), 0, 4*N_SAMPLES, k, N_K_MER, 1)
    generate_data_to_predict(DELTA_PATH, 'data/sarscov2/ft/train-delta-{}.txt'.format(k), 0, 4*N_SAMPLES, k, N_K_MER, 2)
    
    # Generate test data
    generate_data_to_predict(ALPHA_PATH, 'data/sarscov2/ft/dev-alpha-{}.txt'.format(k), 400, 451, k, N_K_MER, 0)
    generate_data_to_predict(BETA_PATH, 'data/sarscov2/ft/dev-beta-{}.txt'.format(k), 400, 451, k, N_K_MER, 1)
    generate_data_to_predict(DELTA_PATH, 'data/sarscov2/ft/dev-delta-{}.txt'.format(k), 400, 451, k, N_K_MER, 2)


reading source file data/sarscov2/raw/complete-nucl-sars_cov_2-B.1.1.7-human_origin.fasta
writing 400 records at target file data/sarscov2/ft/train-alpha-3.txt
reading source file data/sarscov2/raw/complete-nucl-sars_cov_2-B.1.351-human_origin.fasta
writing 400 records at target file data/sarscov2/ft/train-beta-3.txt
reading source file data/sarscov2/raw/complete-nucl-sars_cov_2-B.1.617.2-human_origin.fasta
writing 400 records at target file data/sarscov2/ft/train-delta-3.txt
reading source file data/sarscov2/raw/complete-nucl-sars_cov_2-B.1.1.7-human_origin.fasta
writing 51 records at target file data/sarscov2/ft/dev-alpha-3.txt
File exists at data/sarscov2/ft/dev-alpha-3.txt -> removing existing
reading source file data/sarscov2/raw/complete-nucl-sars_cov_2-B.1.351-human_origin.fasta
writing 51 records at target file data/sarscov2/ft/dev-beta-3.txt
File exists at data/sarscov2/ft/dev-beta-3.txt -> removing existing
reading source file data/sarscov2/raw/complete-nucl-sars_cov_2-B.1.61

In [14]:
records = list(SeqIO.parse(ALPHA_PATH, 'fasta'))
print(len(records))
records = list(SeqIO.parse(BETA_PATH, 'fasta'))
print(len(records))
records = list(SeqIO.parse(DELTA_PATH, 'fasta'))
print(len(records))

169317
477
3848


In [16]:
from files import merge_files

for k in K_MERS:
    merge_files(
        [
            "data/sarscov2/ft/train-alpha-{}.txt".format(k),
            "data/sarscov2/ft/train-beta-{}.txt".format(k),
            "data/sarscov2/ft/train-delta-{}.txt".format(k)
        ],
        'data/sarscov2/ft/train-{}.tsv'.format(k),
        ['sequence', 'label']
    )
    merge_files(
        [
            "data/sarscov2/ft/dev-alpha-{}.txt".format(k),
            "data/sarscov2/ft/dev-beta-{}.txt".format(k),
            "data/sarscov2/ft/dev-delta-{}.txt".format(k)
        ],
        'data/sarscov2/ft/dev-{}.tsv'.format(k),
        ['sequence', 'label']
    )

reading file data/sarscov2/ft/train-alpha-3.txt
reading file data/sarscov2/ft/train-beta-3.txt
reading file data/sarscov2/ft/train-delta-3.txt
reading file data/sarscov2/ft/dev-alpha-3.txt
reading file data/sarscov2/ft/dev-beta-3.txt
reading file data/sarscov2/ft/dev-delta-3.txt
reading file data/sarscov2/ft/train-alpha-4.txt
reading file data/sarscov2/ft/train-beta-4.txt
reading file data/sarscov2/ft/train-delta-4.txt
reading file data/sarscov2/ft/dev-alpha-4.txt
reading file data/sarscov2/ft/dev-beta-4.txt
reading file data/sarscov2/ft/dev-delta-4.txt
reading file data/sarscov2/ft/train-alpha-5.txt
reading file data/sarscov2/ft/train-beta-5.txt
reading file data/sarscov2/ft/train-delta-5.txt
reading file data/sarscov2/ft/dev-alpha-5.txt
reading file data/sarscov2/ft/dev-beta-5.txt
reading file data/sarscov2/ft/dev-delta-5.txt
reading file data/sarscov2/ft/train-alpha-6.txt
reading file data/sarscov2/ft/train-beta-6.txt
reading file data/sarscov2/ft/train-delta-6.txt
reading file data

In [22]:
from Bio import SeqIO

human_TATA_promotor_file = "data/homo-sapiens/human_TATA_hg38.fa"
human_nonTATA_promotor_file = "data/homo-sapiens/human_nonTATA_hg38.fa"

# Dataset Human Genome Promotor
human_TATA_records = list(SeqIO.parse(human_TATA_promotor_file, 'fasta'))
human_nonTATA_records = list(SeqIO.parse(human_nonTATA_promotor_file, 'fasta'))
print(len(human_TATA_records))
print(len(human_nonTATA_records))

3065
26533


In [25]:
N_SAMPLES = 3000
LABEL_NON_TATA = 0
LABEL_TATA = 1
K_MERS = [3,4,5,6]

for k in K_MERS:
    # Generate fine tuning data.
    generate_data_to_predict(human_TATA_promotor_file, 'data/homo-sapiens/ft/train-TATA-{}.txt'.format(k), 0, N_SAMPLES, k, N_K_MER, LABEL_TATA)
    generate_data_to_predict(human_nonTATA_promotor_file, 'data/homo-sapiens/ft/train-nonTATA-{}.txt'.format(k), 0, N_SAMPLES, k, N_K_MER, LABEL_NON_TATA)
    
    # Generate test data.
    generate_data_to_predict(human_TATA_promotor_file, 'data/homo-sapiens/ft/dev-TATA-{}.txt'.format(k), N_SAMPLES, N_SAMPLES+50, k, N_K_MER, LABEL_TATA)
    generate_data_to_predict(human_nonTATA_promotor_file, 'data/homo-sapiens/ft/dev-nonTATA-{}.txt'.format(k), N_SAMPLES, N_SAMPLES+50, k, N_K_MER, LABEL_NON_TATA)
    


reading source file data/homo-sapiens/ft/human_TATA_hg38.fa
writing 3000 records at target file data/homo-sapiens/ft/train-TATA-3.txt
File exists at data/homo-sapiens/ft/train-TATA-3.txt -> removing existing
reading source file data/homo-sapiens/ft/human_nonTATA_hg38.fa
writing 3000 records at target file data/homo-sapiens/ft/train-nonTATA-3.txt
File exists at data/homo-sapiens/ft/train-nonTATA-3.txt -> removing existing
reading source file data/homo-sapiens/ft/human_TATA_hg38.fa
writing 50 records at target file data/homo-sapiens/ft/dev-TATA-3.txt
File exists at data/homo-sapiens/ft/dev-TATA-3.txt -> removing existing
reading source file data/homo-sapiens/ft/human_nonTATA_hg38.fa
writing 50 records at target file data/homo-sapiens/ft/dev-nonTATA-3.txt
reading source file data/homo-sapiens/ft/human_TATA_hg38.fa
writing 3000 records at target file data/homo-sapiens/ft/train-TATA-4.txt
reading source file data/homo-sapiens/ft/human_nonTATA_hg38.fa
writing 3000 records at target file data

In [26]:
from files import merge_files

for k in K_MERS:
    merge_files(
        [
            "data/homo-sapiens/ft/train-TATA-{}.txt".format(k),
            "data/homo-sapiens/ft/train-nonTATA-{}.txt".format(k)
        ],
        'data/homo-sapiens/ft/train-{}.tsv'.format(k),
        ['sequence', 'label']
    )
    merge_files(
        [
            "data/homo-sapiens/ft/dev-TATA-{}.txt".format(k),
            "data/homo-sapiens/ft/dev-nonTATA-{}.txt".format(k)
        ],
        'data/homo-sapiens/ft/dev-{}.tsv'.format(k),
        ['sequence', 'label']
    )

reading file data/homo-sapiens/ft/train-TATA-3.txt
reading file data/homo-sapiens/ft/train-nonTATA-3.txt
reading file data/homo-sapiens/ft/dev-TATA-3.txt
reading file data/homo-sapiens/ft/dev-nonTATA-3.txt
reading file data/homo-sapiens/ft/train-TATA-4.txt
reading file data/homo-sapiens/ft/train-nonTATA-4.txt
reading file data/homo-sapiens/ft/dev-TATA-4.txt
reading file data/homo-sapiens/ft/dev-nonTATA-4.txt
reading file data/homo-sapiens/ft/train-TATA-5.txt
reading file data/homo-sapiens/ft/train-nonTATA-5.txt
reading file data/homo-sapiens/ft/dev-TATA-5.txt
reading file data/homo-sapiens/ft/dev-nonTATA-5.txt
reading file data/homo-sapiens/ft/train-TATA-6.txt
reading file data/homo-sapiens/ft/train-nonTATA-6.txt
reading file data/homo-sapiens/ft/dev-TATA-6.txt
reading file data/homo-sapiens/ft/dev-nonTATA-6.txt


In [30]:
from Bio import SeqIO

human_CCAAT_promotor_file = "data/homo-sapiens/human_CCAAT_hg38.fa"
human_nonCCAAT_promotor_file = "data/homo-sapiens/human_nonCCAAT_hg38.fa"

# Dataset Human Genome Promotor
human_CCAAT_records = list(SeqIO.parse(human_CCAAT_promotor_file, 'fasta'))
human_nonCCAAT_records = list(SeqIO.parse(human_nonCCAAT_promotor_file, 'fasta'))
print(len(human_CCAAT_records))
print(len(human_nonCCAAT_records))

4804
24794


In [31]:
N_SAMPLES = 4500
LABEL_NON_CCAAT = 0
LABEL_CCAAT = 1
K_MERS = [3,4,5,6]

for k in K_MERS:
    # Generate fine tuning data.
    generate_data_to_predict(human_CCAAT_promotor_file, 'data/homo-sapiens/ft/train-CCAAT-{}.txt'.format(k), 0, N_SAMPLES, k, N_K_MER, LABEL_TATA)
    generate_data_to_predict(human_nonCCAAT_promotor_file, 'data/homo-sapiens/ft/train-nonCCAAT-{}.txt'.format(k), 0, N_SAMPLES, k, N_K_MER, LABEL_NON_TATA)
    
    # Generate test data.
    generate_data_to_predict(human_CCAAT_promotor_file, 'data/homo-sapiens/ft/dev-CCAAT-{}.txt'.format(k), N_SAMPLES, N_SAMPLES+50, k, N_K_MER, LABEL_TATA)
    generate_data_to_predict(human_nonCCAAT_promotor_file, 'data/homo-sapiens/ft/dev-nonCCAAT-{}.txt'.format(k), N_SAMPLES, N_SAMPLES+50, k, N_K_MER, LABEL_NON_TATA)


reading source file data/homo-sapiens/human_CCAAT_hg38.fa
writing 4500 records at target file data/homo-sapiens/ft/train-CCAAT-3.txt
File exists at data/homo-sapiens/ft/train-CCAAT-3.txt -> removing existing
reading source file data/homo-sapiens/human_nonCCAAT_hg38.fa
writing 4500 records at target file data/homo-sapiens/ft/train-nonCCAAT-3.txt
File exists at data/homo-sapiens/ft/train-nonCCAAT-3.txt -> removing existing
reading source file data/homo-sapiens/human_CCAAT_hg38.fa
writing 50 records at target file data/homo-sapiens/ft/dev-CCAAT-3.txt
reading source file data/homo-sapiens/human_nonCCAAT_hg38.fa
writing 50 records at target file data/homo-sapiens/ft/dev-nonCCAAT-3.txt
reading source file data/homo-sapiens/human_CCAAT_hg38.fa
writing 4500 records at target file data/homo-sapiens/ft/train-CCAAT-4.txt
reading source file data/homo-sapiens/human_nonCCAAT_hg38.fa
writing 4500 records at target file data/homo-sapiens/ft/train-nonCCAAT-4.txt
reading source file data/homo-sapiens/h

In [32]:
from files import merge_files

for k in K_MERS:
    merge_files(
        [
            "data/homo-sapiens/ft/train-CCAAT-{}.txt".format(k),
            "data/homo-sapiens/ft/train-nonCCAAT-{}.txt".format(k)
        ],
        'data/homo-sapiens/ft/ccaat-promoter/train-{}.tsv'.format(k),
        ['sequence', 'label']
    )
    merge_files(
        [
            "data/homo-sapiens/ft/dev-CCAAT-{}.txt".format(k),
            "data/homo-sapiens/ft/dev-nonCCAAT-{}.txt".format(k)
        ],
        'data/homo-sapiens/ft/ccaat-promoter/dev-{}.tsv'.format(k),
        ['sequence', 'label']
    )

reading file data/homo-sapiens/ft/train-CCAAT-3.txt
reading file data/homo-sapiens/ft/train-nonCCAAT-3.txt
reading file data/homo-sapiens/ft/dev-CCAAT-3.txt
reading file data/homo-sapiens/ft/dev-nonCCAAT-3.txt
reading file data/homo-sapiens/ft/train-CCAAT-4.txt
reading file data/homo-sapiens/ft/train-nonCCAAT-4.txt
reading file data/homo-sapiens/ft/dev-CCAAT-4.txt
reading file data/homo-sapiens/ft/dev-nonCCAAT-4.txt
reading file data/homo-sapiens/ft/train-CCAAT-5.txt
reading file data/homo-sapiens/ft/train-nonCCAAT-5.txt
reading file data/homo-sapiens/ft/dev-CCAAT-5.txt
reading file data/homo-sapiens/ft/dev-nonCCAAT-5.txt
reading file data/homo-sapiens/ft/train-CCAAT-6.txt
reading file data/homo-sapiens/ft/train-nonCCAAT-6.txt
reading file data/homo-sapiens/ft/dev-CCAAT-6.txt
reading file data/homo-sapiens/ft/dev-nonCCAAT-6.txt


In [1]:
TATA_CCAAT_FILE = "data/homo-sapiens/human_TATA_CCAAT_hg38.fa"
TATA_nonCCAAT_FILE = "data/homo-sapiens/human_TATA_nonCCAAT_hg38.fa"
nonTATA_CCAAT_FILE = "data/homo-sapiens/human_nonTATA_CCAAT_hg38.fa"
nonTATA_nonCCAAT_FILE = "data/homo-sapiens/human_nonTATA_nonCCAAT_hg38.fa"

from Bio import SeqIO

tata_ccaat_records = list(SeqIO.parse(TATA_CCAAT_FILE, 'fasta'))
tata_nonccaat_records = list(SeqIO.parse(TATA_nonCCAAT_FILE, 'fasta'))
nontata_ccaat_records = list(SeqIO.parse(nonTATA_CCAAT_FILE, 'fasta'))
nontata_nonccaat_records = list(SeqIO.parse(nonTATA_nonCCAAT_FILE, 'fasta'))

print('len of tata ccaat records : {}'.format(len(tata_ccaat_records))) # 672
print('len of tata non-ccaat records : {}'.format(len(tata_nonccaat_records))) # 2393
print('len of non tata ccaat records : {}'.format(len(nontata_ccaat_records))) # 4132
print('len of non tata non ccaat records : {}'.format(len(nontata_nonccaat_records))) # 22401

N_SAMPLES = 4500
LABEL_NON_CCAAT = 0
LABEL_CCAAT = 1
K_MERS = [3,4,5,6]



len of tata ccaat records : 672
len of tata non-ccaat records : 2393
len of non tata ccaat records : 4132
len of non tata non ccaat records : 22401


In [4]:
# Generate TATA positive-dataset.
human_tata_ccaat = "data/homo-sapiens/human_TATA_CCAAT_hg38.fa"
human_tata_nonccaat = "data/homo-sapiens/human_TATA_nonCCAAT_hg38.fa"
human_tata = "data/homo-sapiens/human_TATA_hg38.fa"

human_nontata_ccaat = "data/homo-sapiens/human_nonTATA_CCAAT_hg38.fa"
human_nontata_nonccaat = "data/homo-sapiens/human_nonTATA_nonCCAAT_hg38.fa"
human_nontata = "data/homo-sapiens/human_nonTATA_hg38.fa"

positive_tata = [human_tata_ccaat, human_tata_nonccaat, human_tata]
negative_tata = [human_nontata_ccaat, human_nontata_nonccaat, human_nontata]

positive_label = "1"
negative_label = "0"

# This data is generated for DeePromoter so generate 300bp only for each sequence.
# Data is in FASTA format so use SeqIO from Bio
positive_dataset_path = "/deepromoter/human_tata_positive.txt"
negative_dataset_path = "/deepromoter/human_tata_negative.txt"

from pathlib import Path
Path(positive_dataset_path).mkdir(parents=True, exist_ok=True)
Path(negative_dataset_path).mkdir(parents=True, exist_ok=True)

from Bio import SeqIO

# Begin process positive tata.
file = open(positive_dataset_path, 'w+')
for p in positive_tata:
    records = list(SeqIO.parse(p, "fasta"))
    records = records[0:100] # Get only first 100 records.
    for r in records:
        sequence = str(r.seq)[0:300] # Retrieve only first 300 bp.
        file.write(sequence + '\n')
        
file.close()

# Begin process negative tata.
file = open(negative_dataset_path, 'w+')
for p in negative_tata:
    records = list(SeqIO.parse(p, 'fasta'))
    records = records[0:100]
    for r in records:
        sequence = str(r.seq)[0:300] # Retrieve only first 300 bp.
        file.write(sequence + '\n')

file.close()

PermissionError: [Errno 13] Permission denied: '/deepromoter/human_tata_positive.txt'