In [22]:
"""
Generate CSV from FASTAs.
"""

import os

# Directory holding the promoter FASTA files and the CSVs generated from them.
path = os.path.join("data", "promoter", "epd")
# Inputs converted with third argument 1 (presumably the positive class label).
positives = [
    os.path.join(path, "human_tata_min128_plus383.fasta"),
    os.path.join(path, "human_tata_min256_plus255.fasta"),
    os.path.join(path, "human_tata_min384_plus127.fasta"),
    #os.path.join(path, "human_tata_min499_plus100.fasta"),
]
# Inputs converted with third argument 0 (presumably the negative class label).
negatives = [
    os.path.join(path, "human_non_tata_min128_plus383.fasta"),
    os.path.join(path, "human_non_tata_min256_plus255.fasta"),
    os.path.join(path, "human_non_tata_min384_plus127.fasta"),
    #os.path.join(path, "human_non_tata_min499_plus100.fasta"),
]

from data_preparation import generate_csv_from_fasta
# Each FASTA gets a sibling CSV named after the part of its file name before
# the first dot.
for p in positives:
    bname = os.path.basename(p)
    target_path = os.path.join(path, f"{bname.split('.')[0]}.csv")
    generate_csv_from_fasta(p, target_path, 1)
for n in negatives:
    bname = os.path.basename(n)
    target_path = os.path.join(path, f"{bname.split('.')[0]}.csv")
    # Bug fix: convert the negative file `n` itself. The previous code passed
    # the leftover `p` from the loop above, so every "negative" CSV was built
    # from the last positive FASTA.
    generate_csv_from_fasta(n, target_path, 0)

In [23]:
"""
Generate kmer format.
"""
import os
from data_preparation import generate_kmer_csv

# Folder containing the per-window CSV files; the k-mer files are written here too.
path = os.path.join("data", "promoter", "epd")
positives = [
    os.path.join(path, "human_tata_min128_plus383.csv"),
    os.path.join(path, "human_tata_min256_plus255.csv"),
    os.path.join(path, "human_tata_min384_plus127.csv"),
    #os.path.join(path, "human_tata_min499_plus100.csv"),
]
negatives = [
    os.path.join(path, "human_non_tata_min128_plus383.csv"),
    os.path.join(path, "human_non_tata_min256_plus255.csv"),
    os.path.join(path, "human_non_tata_min384_plus127.csv"),
    #os.path.join(path, "human_non_tata_min499_plus100.csv"),
]

# Both groups are handled identically, so walk them in one pass
# (positives first, then negatives — the same order as two separate loops).
for source_csv in positives + negatives:
    # Text before the first dot of the file name becomes the output stem.
    stem = os.path.basename(source_csv).partition('.')[0]
    kmer_csv = os.path.join(path, f"{stem}.kmer.csv")
    generate_kmer_csv(source_csv, kmer_csv)


Generating kmer for <data\promoter\epd\human_non_tata_min384_plus127.csv>: 3065/3065                                                      

In [24]:
"""
Split into train and validation.
"""
import os
# Directory holding the k-mer CSVs; the train/validation files are written here too.
path = os.path.join("data", "promoter", "epd")
positives = [
    os.path.join(path, "human_tata_min128_plus383.kmer.csv"),
    os.path.join(path, "human_tata_min256_plus255.kmer.csv"),
    os.path.join(path, "human_tata_min384_plus127.kmer.csv"),
    #os.path.join(path, "human_tata_min499_plus100.kmer.csv"),
]
negatives = [
    os.path.join(path, "human_non_tata_min128_plus383.kmer.csv"),
    os.path.join(path, "human_non_tata_min256_plus255.kmer.csv"),
    os.path.join(path, "human_non_tata_min384_plus127.kmer.csv"),
    #os.path.join(path, "human_non_tata_min499_plus100.kmer.csv"),
]

import pandas as pd

# Fixed seed so re-running the cell reproduces exactly the same split.
# With one seed for every file, files that have the same number of rows get
# the same row positions sampled.
# NOTE(review): whether rows line up across the window variants is not
# visible here — confirm if that alignment matters.
SPLIT_SEED = 42
# Fraction of each file's rows that goes to the training set.
TRAIN_FRACTION = 0.8

positives_negatives = [positives, negatives]
for pn in positives_negatives:
    for p in pn:
        df = pd.read_csv(p)
        train_df = df.sample(frac=TRAIN_FRACTION, random_state=SPLIT_SEED)
        # Every row not sampled for training becomes validation data.
        validation_df = df.drop(train_df.index)
        # Keep the first two dot-separated parts of the file name,
        # e.g. "human_tata_min128_plus383.kmer".
        stem = '.'.join(os.path.basename(p).split('.')[0:2])
        train_df.to_csv(
            os.path.join(path, f"{stem}.train.csv"),
            index=False
        )
        validation_df.to_csv(
            os.path.join(path, f"{stem}.validation.csv"),
            index=False
        )

In [25]:
"""
Merge CSV which is already in kmer version.
"""
import os
from data_preparation import merge_csv
# Directory holding the per-file splits; the merged files are written here too.
path = os.path.join("data", "promoter", "epd")
# Training splits of all TATA and non-TATA files.
trains = [
    os.path.join(path, "human_tata_min128_plus383.kmer.train.csv"),
    os.path.join(path, "human_tata_min256_plus255.kmer.train.csv"),
    os.path.join(path, "human_tata_min384_plus127.kmer.train.csv"),
    #os.path.join(path, "human_tata_min499_plus100.kmer.train.csv"),
    os.path.join(path, "human_non_tata_min128_plus383.kmer.train.csv"),
    os.path.join(path, "human_non_tata_min256_plus255.kmer.train.csv"),
    os.path.join(path, "human_non_tata_min384_plus127.kmer.train.csv"),
    #os.path.join(path, "human_non_tata_min499_plus100.kmer.train.csv"),
]
merge_csv(trains, os.path.join(path, "human_tata.kmer.train.csv"))
# Validation splits of all TATA and non-TATA files.
validations = [
    os.path.join(path, "human_tata_min128_plus383.kmer.validation.csv"),
    os.path.join(path, "human_tata_min256_plus255.kmer.validation.csv"),
    os.path.join(path, "human_tata_min384_plus127.kmer.validation.csv"),
    #os.path.join(path, "human_tata_min499_plus100.kmer.validation.csv"),
    os.path.join(path, "human_non_tata_min128_plus383.kmer.validation.csv"),
    os.path.join(path, "human_non_tata_min256_plus255.kmer.validation.csv"),
    os.path.join(path, "human_non_tata_min384_plus127.kmer.validation.csv"),
    #os.path.join(path, "human_non_tata_min499_plus100.kmer.validation.csv"),
]
# Bug fix: merge the validation splits. The previous code passed `trains`
# here, so the validation file was a copy of the training data and
# `validations` was never used.
# Kept as the last expression so the notebook still displays its return value.
merge_csv(validations, os.path.join(path, "human_tata.kmer.validation.csv"))


100%|██████████| 6/6 [00:00<00:00, 10.98it/s]
100%|██████████| 6/6 [00:00<00:00, 11.37it/s]


True

In [16]:
s = "ATT TTG TGG GGG GGG GGC GCA CAA AAT ATG TGC GCT CTC TCC CCT CTG TGT GTT TTA TAT ATC TCC CCA CAC ACA CAC ACA CAG AGT GTG TGT GTC TCA CAT ATT TTT TTA TAA AAG AGG GGG GGT GTG TGA GAC ACA CAT ATT TTA TAG AGA GAG AGA GAT ATG TGG GGC GCA CAT ATG TGA GAA AAG AGT GTG TGC GCC CCC CCA CAG AGC GCA CAC ACA CAG AGA GAG AGC GCA CAG AGG GGT GTG TGC GCT CTC TCA CAC ACA CAG AGG GGC GCT CTG TGT GTT TTG TGG GGG GGT GTC TCC CCC CCC CCA CAC ACC CCC CCT CTA TAC ACT CTG TGA GAC ACA CAC ACA CAC ACC CCA CAA AAG AGC GCA CAC ACA CAC ACG CGT GTC TCT CTG TGC GCC CCC CCT CTG TGC GCC CCA CAT ATG TGG GGT GTG TGG GGG GGG GGA GAG AGC GCA CAA AAA AAC ACA CAA AAT ATG TGT GTT TTA TAT ATG TGG GGT GTT TTC TCC CCG CGA GAC ACT CTC TCC CCC CCC CCG CGG GGG GGA GAG AGG GGC GCC CCA CAG AGG GGC GCC CCG CGG GGG GGC GCA CAT ATT TTT TTC TCA CAC ACT CTG TGT GTA TAG AGG GGA GAT ATG TGT GTT TTG TGA GAA AAG AGC GCC CCG CGC GCA CAG AGG GGC GCA CAG AGT GTC TCG CGT GTC TCG CGC GCT CTG TGC GCG CGA GAG AGA GAA AAG AGA GAG AGA GAG AGC GCC CCT CTG TGG GGC GCT CTC TCT CTG TGA GAT ATG TGT GTC TCC CCA CAC ACT CTG TGT GTC TCT CTT TTC TCC CCA CAG AGG GGG GGA GAG AGC GCC CCT CTG TGG GGT GTG TGG GGG GGA GAA AAG AGG GGA GAA AAG AGG GGA GAG AGT GTG TGG GGG GGC GCA CAG AGC GCG CGG GGC GCC CCC CCC CCT CTC TCG CGC GCT CTC TCT CTG TGC GCG CGG GGG GGC GCC CCT CTC TCT CTC TCC CCT CTG TGC GCC CCC CCT CTT TTT TTG TGT GTA TAC ACT CTC TCC CCA CAC ACG CGA GAG AGG GGT GTG TGT GTG TGA GAG AGG GGA GAA AAG AGT GTT TTG TGC GCC CCG CGG GGG GGT GTC TCA CAC ACC CCC CCA CAG AGC GCA CAG AGA GAG AGG GGG GGA GAG AGA GAG AGG GGT GTG TGA GAT ATG TGT GTC TCC CCC CCC CCT CTC TCT CTG TGT GTT TTC TCT CTT TTC TCC CCT CTA TAA AAT ATT TTT TTG TGT GTA TAG AGC GCT CTA TAA AAT ATC TCA CAG AGA GAT ATT TTC TCT CTG TGC GCC CCT CTG TGC GCA CAG AGC GCC CCA CAG AGG GGC GCT CTC TCT CTC TCT CTG TGG GGG GGG GGA GAT ATG TGA GAA AAT ATC TCT CTA TAA AAT ATA TAA AAA AAG AGG GGA GAG AGG GGC GCG CGG GGC GCG CGA GAC ACC CCT CTG TGG GGA GAG AGA GAG AGG GGA GAG AGG GGA GAC ACA 
CAG AGA GAC ACA CAA AAC ACA CAA AAG AGT GTG TGA GAC ACC CCA CAG AGG GGG GGG GGG GGG GGA GAC ACA CAG AGC GCA CAC ACT CTT TTC TCT CTT TTT TTC TCA CAG AGT GTA TAC ACA CAT ATT TTT TTT TTC TCC CCT CTG TGT GTT TTA TAG AGA GAC ACT CTC TCG CGG GGG GGG GGT GTA TAT ATT TTA TAG AGA GAG AGA GAC ACC CCC CCA CAA AAA AAG AGA GAT ATA TAC ACC CCC CCA CAA AAG AGG GGA GAG AGG GGA GAG AGT GTG TGA GAG AGA GAG AGC GCG"
# Break the space-separated string into a list of tokens.
# Note: this rebinds `s` from a str to a list.
s = s.split(' ')
# Bare last expression: the notebook displays the number of tokens.
len(s)

598