In [37]:
"""
This notebook should only be used for processing FASTA or similar files.
"""
from Bio import SeqIO
import os

def kmer(seq, length, window_size=1):
    """Return the full-length windows of ``seq``.

    Windows are ``length`` items long and their start positions advance by
    ``window_size``. Only complete windows are produced, so (for a positive
    step) an input shorter than ``length`` yields an empty list.
    """
    stop = len(seq) + 1 - length
    windows = []
    for start in range(0, stop, window_size):
        windows.append(seq[start:start + length])
    return windows

# Convert the EPD TATA FASTA records into a CSV of positive samples (label 1).
epd_tata_path = './data/epd/tata.fasta'
tatas = SeqIO.parse(epd_tata_path, 'fasta')
epd_tata_csv = './data/epd/pos_tata.csv'
columns = ['sequence', 'label']
try:
    # Mode 'w' replaces any previous output file, and the with-block closes
    # the handle on every path. Previously a failed open() left a dict in
    # `fpos`, so the error handler itself crashed on `fpos.close()`.
    with open(epd_tata_csv, 'w') as fpos:
        fpos.write('{},{}\n'.format(*columns))
        for record in tatas:
            seq = str(record.seq)
            # Records longer than 512 are expanded into every 512-long window
            # (stride 1); records of 512 or fewer are written unchanged.
            windows = kmer(seq, 512) if len(seq) > 512 else [seq]
            for window in windows:
                fpos.write('{},{}\n'.format(window, 1))
except Exception as e:
    # Best-effort cell: report the problem instead of aborting the notebook.
    print('error {}'.format(e))


In [68]:
# Sample DNA string used to try out shuffle_sequence in this cell; its length
# is printed as a sanity check (the recorded output shows 512).
s = 'CTTTTCCCCATTCCATAGAATTGTTTTAGTATGGATGCTGTACAAAGAGTTCATGCCTGGGGGACTAAAGGAAAGTTCTCAAGTTCTTTTAATTTTGATTTGCATCAAAAAGTCCGCTTAATTTCAGGTTTCACCTCCAGGAGTCAATCATTATGGACTTAAAGCTATAGTAACTTTTGAAGATCGTTTGTAATTCGCTGTCTTTGCTAGGTTTTCTTTTCTAAGCAGGCTCATTTCTGTAGTTCAGAAACTTAGCACCCACTAGTCGGGTACCCCTGGAGAGTCCTGGTACCGCCCACCTGGGGCGCCTACGAATCCTAGGCCTTTGCGTGTTGATGGACGGATTCGCGCTGCAAGAAGCCTCGCCGCTGTCACCACTGGGCATTGGCAATTGGTTCAGGGGGAACGGCGCGGAACCAATGGGAGCGGTGGCTCGGGAGAGACCTTGGAGCGCGCGGGAAAGAGACCAATATAAACTGTGGCGGGATAGTTTTCGGGTCCTTGTCCAGTGA'
print(len(s))

from random import shuffle

def shuffle_sequence(seq, chunk_size, rng=None):
    """Shuffle every second chunk of ``seq`` while keeping the others in place.

    ``seq`` is cut into consecutive chunks of ``chunk_size`` characters.
    Chunks at even positions (0, 2, 4, ...) stay where they are; chunks at
    odd positions are randomly permuted among the odd slots. A trailing
    remainder shorter than ``chunk_size`` is appended unchanged, so the
    result always has the same length and composition as ``seq``.

    Args:
        seq: String to shuffle.
        chunk_size: Positive number of characters per chunk.
        rng: Optional object with a ``shuffle(list)`` method (for example
            ``random.Random(seed)``) for reproducible output. When omitted,
            the module-level ``shuffle`` imported from ``random`` is used.

    Returns:
        The partially shuffled string.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError('chunk_size must be a positive integer')

    # Only whole chunks take part in the shuffle; the remainder is kept as a
    # fixed tail instead of being dropped.
    usable = len(seq) - len(seq) % chunk_size
    chunks = [seq[i:i + chunk_size] for i in range(0, usable, chunk_size)]
    tail = seq[usable:]

    odd_chunks = chunks[1::2]
    if rng is None:
        shuffle(odd_chunks)
    else:
        rng.shuffle(odd_chunks)
    # Slice assignment puts the permuted chunks back into the odd slots
    # without repeated front-of-list pops.
    chunks[1::2] = odd_chunks

    return ''.join(chunks) + tail

# Shuffle the odd-positioned 16-character chunks of `s`; the bare name on the
# last line makes the notebook display the result as the cell output.
shuffled_s = shuffle_sequence(s, 16)
shuffled_s

512


'CTTTTCCCCATTCCATAGGTTTTCTTTTCTAAGGATGCTGTACAAAGAAGTTCAGAAACTTAGCCTAAAGGAAAGTTCTCGTTCATGCCTGGGGGAGATTTGCATCAAAAAGGCGCCTACGAATCCTATTTCACCTCCAGGAGTAGAATTGTTTTAGTATAAAGCTATAGTAACTTCTCGGGAGAGACCTTGATTCGCTGTCTTTGCTGGGGAACGGCGCGGAAGCAGGCTCATTTCTGTTTGAAGATCGTTTGTAACCCACTAGTCGGGTAGACCAATATAAACTGTGTACCGCCCACCTGGGAAGTTCTTTTAATTTTGGCCTTTGCGTGTTGACTGTCACCACTGGGCAGCAAGAAGCCTCGCCGTCCGCTTAATTTCAGGTTGGCAATTGGTTCAGGGTCCTTGTCCAGTGACCAATGGGAGCGGTGGCCCCTGGAGAGTCCTGGAGCGCGCGGGAAAGACAATCATTATGGACTTGGCGGGATAGTTTTCGTGGACGGATTCGCGCT'

In [71]:
"""
Create negative promoter sequence based on positive TATA sequence.
"""
from Bio import SeqIO

sample_tata_file = './data/epd/tata_sample.fasta'
epd_tata_path = './data/epd/tata.fasta'
# NOTE(review): the full EPD file is parsed here; `sample_tata_file` is defined
# but never used in this cell. Confirm which input is intended.
sample_tata = SeqIO.parse(epd_tata_path, 'fasta')
epd_nontata_csv = './data/epd/neg_tata.csv'
header = '{},{}\n'.format('sequence', 'label')
try:
    # Mode 'w' replaces any previous output file, and the with-block closes
    # the handle on every path. Previously a failed open() left a dict in `f`,
    # so the error handler itself crashed on `f.close()`.
    with open(epd_nontata_csv, 'w') as f:
        f.write(header)
        for tata in sample_tata:
            seq = str(tata.seq)
            # Every 512-long window with stride 1; records shorter than 512
            # produce no windows and therefore no rows.
            kmers = kmer(seq, 512, 1)

            for sub in kmers:
                # Generate negative sample from positive sample using DeePromoter method (Oubounyt et al., 2019).
                neg_kmers = shuffle_sequence(sub, 16)
                f.write('{},{}\n'.format(neg_kmers, 0))
except Exception as e:
    # Best-effort cell: report the problem instead of aborting the notebook.
    print('error {}'.format(e))

In [7]:
"""
Generate csv from fasta file.
"""
from Bio import SeqIO

def generate_csv_from_fasta(src_fasta, target_csv, label, length=512, window_size=1):
    """Write the sliding windows of every FASTA record to a labelled CSV.

    The output starts with a ``sequence,label`` header, followed by one row
    per window produced by ``kmer`` for each record in ``src_fasta``. An
    existing ``target_csv`` is overwritten. Records shorter than ``length``
    contribute no rows.

    Args:
        src_fasta: Path of the FASTA file to read.
        target_csv: Path of the CSV file to (re)create.
        label: Value written in the ``label`` column of every row.
        length: Window length passed to ``kmer`` (default 512).
        window_size: Stride between window starts passed to ``kmer``
            (default 1).
    """
    # Open the source first so a bad input path fails before the previous
    # output file is replaced.
    records = SeqIO.parse(src_fasta, 'fasta')
    # The with-block closes the CSV even if parsing or writing raises midway.
    with open(target_csv, 'w') as target:
        target.write('{},{}\n'.format('sequence', 'label'))
        for record in records:
            for sub in kmer(str(record.seq), length, window_size):
                target.write('{},{}\n'.format(sub, label))

In [1]:
# Source FASTA files for the splice-site data (acceptor / donor, positive / negative).
ss_src_pos_acc_fasta = './data/splice-sites/splice-deep/positive_DNA_seqs_acceptor_hs.fa'
ss_src_pos_don_fasta = './data/splice-sites/splice-deep/positive_DNA_seqs_donor_hs.fa'
# The negative sources previously pointed at the positive acceptor file
# (copy-paste error), which would have produced "negative" CSVs made of
# positive sequences.
# NOTE(review): the negative file names are inferred from the positive naming
# pattern; confirm them against the files on disk.
ss_src_neg_acc_fasta = './data/splice-sites/splice-deep/negative_DNA_seqs_acceptor_hs.fa'
ss_src_neg_don_fasta = './data/splice-sites/splice-deep/negative_DNA_seqs_donor_hs.fa'

# Output CSV paths (the `_fasta` suffix in these names is historical; the
# targets are CSV files).
target_pos_acc_fasta = './data/splice-sites/splice-deep/pos_ss_acc.csv'
target_pos_don_fasta = './data/splice-sites/splice-deep/pos_ss_don.csv'
target_neg_acc_fasta = './data/splice-sites/splice-deep/neg_ss_acc.csv'
target_neg_don_fasta = './data/splice-sites/splice-deep/neg_ss_don.csv'

In [8]:
# Positive splice-site sets are labelled 1 and negative sets 0, matching the
# label convention of the TATA CSVs written earlier in this notebook.
# Previously every set was written with label 0.
generate_csv_from_fasta(ss_src_pos_acc_fasta, target_pos_acc_fasta, 1)
generate_csv_from_fasta(ss_src_pos_don_fasta, target_pos_don_fasta, 1)
generate_csv_from_fasta(ss_src_neg_acc_fasta, target_neg_acc_fasta, 0)
generate_csv_from_fasta(ss_src_neg_don_fasta, target_neg_don_fasta, 0)