In [None]:
import os
import pandas as pd

from data_dir import genome_grch37, grch37_sample_dir, genome_grch37_dir
from data_dir import genome_grch38, grch38_sample_dir, genome_grch38_dir

In [None]:
def _parse_desc(desc):
    # desc_obj = {'gene': '', 'gene_id': '', 'genebank': '', 'ensembl': ''}
    desc_obj = {}
    arr = desc.split(';') # Split desc with semicolon as separator.
    for e in arr:
        det = e.split('=') # Split every parameter and its corresponding value.
        param = det[0].lower()
        val = det[1]
        desc_obj[param] = val
        if param == "dbxref":
            # Parse value of dbxref.
            # i.e. Dbxref=GeneID:653635,Genbank:NR_024540.1,HGNC:HGNC:38034
            # param = dbxref (in lowercase)
            # val = GeneID:653635,Genbank:NR_024540.1,HGNC:HGNC:38034
            dbxref_vals = val.split(',')
            for e in dbxref_vals:
                arr = e.split(':')
                dbxref_param = arr[0].lower()
                dbxref_val = arr[1]
                if dbxref_param == 'geneid':
                    desc_obj['gene_id'] = dbxref_val
                elif dbxref_param == 'genbank':
                    desc_obj['genbank'] = dbxref_val
                elif dbxref_param == 'ensembl':
                    desc_obj['ensembl'] = dbxref_val
                else:
                    break
    
    return desc_obj

def _gff_parseline(line, regions):
    """Parse one line of a GFF file into a flat record.

    Args:
        line: A raw line of the GFF file, with or without its line ending.
        regions: Collection of feature types (values of the third column,
            such as ``'exon'``) to keep, or ``None`` to keep every type.

    Returns:
        ``False`` for comment lines (starting with ``#``), blank lines, lines
        with fewer than nine tab-separated columns, and features whose type
        is not in ``regions``. Otherwise a dict with the keys ``sequence_id``,
        ``refseq``, ``region``, ``start``, ``start_index``, ``end``,
        ``end_index``, ``desc`` (the parsed attributes), ``gene``,
        ``gene_id``, ``genbank`` and ``ensembl``. Missing attributes are
        reported as empty strings.

    Raises:
        ValueError: If the start or end column of a kept feature is not an
            integer.
    """
    if not line.strip() or line[0] == '#':
        return False

    # Drop the line ending so it does not leak into the last attribute value.
    words = line.rstrip('\r\n').split('\t')
    if len(words) < 9:
        return False

    region = words[2]
    if regions is not None and region not in regions:
        return False

    start = int(words[3])  # One-based numbering.
    end = int(words[4])
    desc_obj = _parse_desc(words[8])
    return {
        'sequence_id': words[0],
        'refseq': words[1],
        'region': region,
        'start': start,
        'start_index': start - 1,  # Zero-based numbering.
        'end': end,
        'end_index': end - 1,
        'desc': desc_obj,
        'gene': desc_obj.get('gene', ''),
        'gene_id': desc_obj.get('gene_id', ''),
        'genbank': desc_obj.get('genbank', ''),
        'ensembl': desc_obj.get('ensembl', ''),
    }

def gff_to_csv(file, csv_output, regions):
    """Convert the feature records of a GFF file into a single CSV file.

    Does nothing when ``file`` does not exist. An existing ``csv_output`` is
    overwritten. Lines for which ``_gff_parseline`` returns ``False``
    (comments, filtered-out feature types) are skipped.

    The header column is spelled ``genbank`` so it matches the record key
    and the header used for the per-sequence CSV files; the header line no
    longer carries a trailing space.

    Args:
        file: Path of the GFF file to read.
        csv_output: Path of the CSV file to write.
        regions: Feature types to keep, or ``None`` to keep all of them.
    """
    if not os.path.exists(file):
        return

    colnames = ['sequence_id', 'refseq', 'region', 'start_index', 'end_index', 'start', 'end', 'gene', 'gene_id', 'genbank', 'ensembl']
    # 'w' truncates an existing output; both files are closed even on error.
    with open(file, 'r') as gff, open(csv_output, 'w') as out:
        out.write("{}\n".format(",".join(colnames)))
        for line in gff:
            d = _gff_parseline(line, regions)
            if not d:
                continue
            out.write("{}\n".format(",".join(str(d[c]) for c in colnames)))

# Show the four imported data locations, one per line.
print(genome_grch37, genome_grch38, grch37_sample_dir, grch38_sample_dir, sep='\n')


In [None]:
# Smoke-test the parser on a single exon record.
# The nine columns are joined with an explicit "\t" so the separators stay
# visible and cannot be turned into spaces by an editor.
s = "\t".join([
    "NC_000001.11",
    "BestRefSeq",
    "exon",
    "29321",
    "29370",
    ".",
    "-",
    ".",
    "ID=exon-NR_024540.1-1;Parent=rna-NR_024540.1;Dbxref=GeneID:653635,Genbank:NR_024540.1,HGNC:HGNC:38034;gbkey=misc_RNA;gene=WASH7P;product=WASP family homolog 7%2C pseudogene;pseudo=true;transcript_id=NR_024540.1",
])
d = _gff_parseline(s, ['exon'])
d


In [None]:
# Export the file at genome_grch37 to one CSV; regions=None keeps every feature type.
gff_to_csv(genome_grch37, grch37_sample_dir + "/grch37_all_07012022.csv", None)

In [None]:
# Export the file at genome_grch38 to one CSV; regions=None keeps every feature type.
gff_to_csv(genome_grch38, grch38_sample_dir + "/grch38_all.csv", None)

In [None]:
# Export only the records whose feature type is 'exon' from the file at genome_grch37.
gff_to_csv(genome_grch37, grch37_sample_dir + "/grch37_exon_only_06012022.csv", ['exon'])

In [None]:
# Export only the records whose feature type is 'exon' from the file at genome_grch38.
gff_to_csv(genome_grch38, grch38_sample_dir + "/grch38_exon_only_06012022.csv", ['exon'])

In [None]:
# Column names, in output order, of the per-sequence CSV files.
colnames = ['sequence_id', 'refseq', 'region', 'start_index', 'end_index', 'start', 'end', 'gene', 'gene_id', 'genbank', 'ensembl']
# Comma-joined header line; passed to gff_to_csvs by the cells that call it.
header = ",".join(colnames)
# Display the header string as the cell output.
header

In [None]:
def gff_to_csvs(gff_file, target_folder, regions, header):
    """Split a GFF file into one CSV file per sequence id.

    Every kept record is written to ``<target_folder>/<sequence_id>.csv``.
    The first time a sequence id is met in a run its file is created (an
    existing file is overwritten) and ``header`` is written first. If the
    same sequence id shows up again later in the input, its file is reopened
    in append mode. Only one output file is open at any time.

    Args:
        gff_file: Path of the GFF file to read.
        target_folder: Existing directory that receives the CSV files.
        regions: Feature types to keep, or ``None`` to keep all of them.
        header: Header line (without line ending) written at the top of
            every CSV file.
    """
    keys = ('sequence_id', 'refseq', 'region', 'start_index', 'end_index', 'start', 'end', 'gene', 'gene_id', 'genbank', 'ensembl')
    target_file = target_folder + '/'
    cur_seq = None            # Sequence id the open file belongs to.
    file_to_write = None      # Currently open output file, if any.
    started = set()           # Sequence ids whose file was created in this run.
    try:
        with open(gff_file) as f:
            for line in f:
                d = _gff_parseline(line, regions)
                if not d:
                    continue

                temp_seq = d['sequence_id']
                if temp_seq != cur_seq:
                    # Sequence changed: switch the output file before writing,
                    # so the first record of the new sequence is not lost.
                    if file_to_write is not None:
                        file_to_write.close()
                    output_file = target_file + temp_seq + '.csv'
                    if temp_seq in started:
                        file_to_write = open(output_file, 'a')
                    else:
                        file_to_write = open(output_file, 'w')
                        file_to_write.write("{}\n".format(header))
                        started.add(temp_seq)
                    cur_seq = temp_seq

                file_to_write.write("{}\n".format(",".join(str(d[k]) for k in keys)))
    finally:
        # Close the last output file even when parsing or writing failed.
        if file_to_write is not None:
            file_to_write.close()


In [None]:
# Write one CSV per sequence id of the file at genome_grch38; regions=None keeps every feature type.
gff_to_csvs(genome_grch38, './sample/grch38/genes', None, header)


In [None]:
# Write one CSV per sequence id of the file at genome_grch37; regions=None keeps every feature type.
gff_to_csvs(genome_grch37, './sample/grch37/genes', None, header)

In [None]:
# SeqIO is used below to read the FASTA file whose path is stored in hs_nc1.
from Bio import SeqIO
from data_dir import hs_nc1

# Show the configured path.
print(hs_nc1)

In [None]:
"""
    complete_sequence_file  : path of complete sequence file in FASTA format.
    label_location_file     : path of file containing exon region in CSV.
"""
def generate_labels(complete_sequence_file):
    """Announce and open a FASTA file.

    Note that this function only opens the file; it does not build labels.

    Args:
        complete_sequence_file: Path of the complete sequence file in FASTA
            format.

    Returns:
        Whatever ``SeqIO.parse(complete_sequence_file, "fasta")`` returns.
    """
    message = "reading complete sequence at {} \n".format(complete_sequence_file)
    print(message)
    return SeqIO.parse(complete_sequence_file, "fasta")

# Open the FASTA file at hs_nc1.
seq = generate_labels(hs_nc1)
# Materialise all records and keep the sequence of the first one.
complete_sequence = list(seq)[0].seq
# Initial per-base labels: 'N' where the base is 'N', '.' everywhere else.
complete_labels = ['.' if a != 'N' else 'N' for a in complete_sequence]

In [None]:
# chr1 comes from the project's data_dir module; print it to inspect its value.
from data_dir import chr1
print(chr1)

In [None]:
"""
Char 'N' represents any base. It indicates that sequence has no information regarding the base at that position.
Char '.' represents any feature other than Exon. Char 'E' is exon.
"""
# Load the per-sequence feature table of NC_000001.11 with pandas.
import pandas as pd

df = pd.read_csv('./sample/grch38/genes/NC_000001.11.csv')
# Not the last expression of the cell, so this preview is not displayed.
df.head(3)
# end_index of the row labelled 0 among the rows whose feature type is 'region'.
df[df['region'] == 'region'].loc[0]['end_index']

In [None]:
# Collect the distinct gene names of the dataframe.
genes = df['gene'].unique()
genes = list(genes)
# Drop the missing-value entry (rows without a gene name).
genes = [a for a in genes if str(a) != 'nan']
print('how many gene? {}'.format(len(genes)))
# print(genes)
genes[0]
# Rows of the first gene only.
ndf = df[df['gene'] == genes[0]]
#ndf = ndf.loc[ndf['region'].isin(['gene', 'pseudogene'])]
ndf.iloc[0]['region']
# Keep the exon rows of that gene and display them.
ndf = ndf[ndf['region'] == 'exon']
ndf

In [None]:
# Write one label file per gene: line 1 holds the gene's sequence, line 2 the
# per-base labels ('N' unknown base, 'E' exon, '.' anything else).
for g in genes:
    # All annotation rows that belong to this gene.
    ndf = df[df['gene'] == g]

    g_region_df = ndf.loc[ndf['region'].isin(['gene', 'pseudogene'])]
    if g_region_df.empty:
        # Without a gene/pseudogene row the span of the gene is unknown.
        print('gene {} has no gene or pseudogene row, skipped'.format(g))
        continue
    g_start_index = int(g_region_df.iloc[0]['start_index'])
    g_end_index = int(g_region_df.iloc[0]['end_index'])

    # Prepare sequence and its label.
    g_sequence = complete_sequence[g_start_index:g_end_index+1]
    g_label = ['N' if a == 'N' else '.' for a in g_sequence]

    # Mark every exon of the gene, using positions relative to the gene start.
    exons = ndf[ndf['region'] == 'exon']
    exons_in_range = True
    for exon_start, exon_end in zip(exons['start_index'], exons['end_index']):
        rel_start = int(exon_start) - g_start_index
        rel_end = int(exon_end) - g_start_index
        # A negative index would silently wrap around to the end of the list,
        # so both ends are checked explicitly.
        if rel_start < 0 or rel_end >= len(g_label):
            exons_in_range = False
            break
        for rel_index in range(rel_start, rel_end + 1):
            if g_label[rel_index] != 'N':
                g_label[rel_index] = 'E'

    if not exons_in_range:
        print('gene {}, length {}'.format(g, len(g_sequence)))
        print('gene region {}-{}'.format(g_start_index, g_end_index))
        continue

    fname = './sample/grch38/labels/{}.txt'.format(g)
    # 'w' lets the cell be re-run; the file is closed even if the write fails.
    with open(fname, 'w') as g_file:
        g_file.write('{}\n{}\n'.format(g_sequence, "".join(g_label)))




In [None]:
# Chromosome-wide labels: start from the 'N' / '.' baseline and mark exons as 'E'.
label_seq1 = list(complete_labels)

# Only exon rows are marked. The table also holds other feature types (for
# example the 'region' row read in an earlier cell), which must stay '.'.
exon_rows = df[df['region'] == 'exon']
for start_index, end_index in zip(exon_rows['start_index'], exon_rows['end_index']):
    for j in range(int(start_index), int(end_index) + 1):
        # Unknown bases keep their 'N' label, as in the per-gene labelling.
        if label_seq1[j] != 'N':
            label_seq1[j] = 'E'

In [None]:
def split_string(s, length):
    """Lazily cut ``s`` into consecutive pieces of at most ``length`` items.

    Args:
        s: Any sliceable sequence with a length (string, list, ...).
        length: Size of each piece; the last piece may be shorter.

    Returns:
        A generator yielding the slices of ``s`` in order.
    """
    offsets = range(0, len(s), length)
    return (s[offset:offset + length] for offset in offsets)

# Write the chromosome labels as lines of 50 characters each.
arr_label = split_string(label_seq1, 50)
# 'w' lets the cell be re-run; the file is closed even if a write fails.
with open('chr1.label.txt', 'w') as f:
    for label in arr_label:
        f.write("{}\n".format("".join(label)))

In [None]:
# Scratch cell: prints the integers 0 through 9.
for i in range(10):
    print(i)

In [None]:
# Short example sequence used to try out the k-mer helpers below.
s = "ATCGATGCAGCAGACGACAGCATCAGCATCGACTCGACGATCGACTGACTGACTGACTGAC"
print('len s {}'.format(len(s)))
def _get_kmer(sequence, k):
    lenseq = len(sequence)
    if (lenseq > 0 and k > 0):
        arr = [sequence[i:i+k] for i in range(lenseq+1-k)]
        return arr
    else:
        return []

# 3-mers of the example sequence, displayed as the cell output.
kmers = _get_kmer(s, 3)
kmers

In [None]:
"""
Process sequence into kmers.
Create read by window for certain window size.
@param sequence : a sequence.
@param k_size : size of kmer.
@param t_size : size of substring to be read from sequence.
@window_size : size of sliding.
"""
def _read_by_window(sequence, k_size, t_size, window_size):
    """Cut a sequence into overlapping reads of k-mers.

    The sequence is first turned into k-mers; reads of up to ``t_size``
    consecutive k-mers are then taken every ``window_size`` k-mers. Reads
    near the end may hold fewer than ``t_size`` k-mers.

    Args:
        sequence: The sequence to process.
        k_size: Size of each k-mer.
        t_size: Maximum number of k-mers per read.
        window_size: Step, in k-mers, between the starts of consecutive reads.

    Returns:
        List of reads, each a list of k-mers. Empty when the sequence yields
        no k-mers.
    """
    kmers = _get_kmer(sequence, k_size)
    # Stop at len(kmers), not len(kmers) + 1: a start equal to the k-mer count
    # would only add an empty read.
    return [kmers[i:i+t_size] for i in range(0, len(kmers), window_size)]

# Reads of up to four 3-mers, taken every two 3-mers, from the example sequence.
reads = _read_by_window(s, 3, 4, 2)
reads

In [None]:
from os import listdir
from os.path import isfile, basename
import logging
import traceback

dirpath = './sample/grch38/labels'
# Paths of the label files (*.txt). The k-mer CSV files that a later cell
# writes into this same directory are excluded, so a re-run does not treat
# them as label files. Sorting makes the processing order deterministic.
files = [
    dirpath + '/' + a
    for a in sorted(listdir(dirpath))
    if a.endswith('.txt') and isfile(dirpath + '/' + a)
]


In [None]:
# For every label file and every step size, write a CSV whose rows pair a
# read of sequence k-mers with the matching read of label k-mers.
kmer_size = 3
t_size = 512
window_sizes = [64, 128, 256]
for window_size in window_sizes:
    for fpath in files:
        try:
            print('opening file {}'.format(fpath))
            with open(fpath) as f:
                # splitlines() drops the line endings, which would otherwise
                # end up inside the last k-mers.
                lines = f.read().splitlines()
            if len(lines) < 2:
                logging.error('skipping {}: expected a sequence line and a label line'.format(fpath))
                continue
            sequence = lines[0]
            labels = lines[1]
            # Sequence and labels must have the same length; a mismatch means
            # the labelling step produced an inconsistent file.
            if len(sequence) != len(labels):
                logging.error('skipping {}: sequence length {} differs from label length {}'.format(fpath, len(sequence), len(labels)))
                continue

            seq_reads = _read_by_window(sequence, kmer_size, t_size, window_size)
            label_reads = _read_by_window(labels, kmer_size, t_size, window_size)

            # splitext removes only the final extension, so names that
            # contain dots are kept whole.
            filename = os.path.splitext(basename(fpath))[0]
            tpath = "{}/{}.k{}.t{}.w{}.csv".format(dirpath, filename, kmer_size, t_size, window_size)
            with open(tpath, 'w') as t:
                t_header = ','.join(['kmers', 'labels'])
                t.write('{}\n'.format(t_header))
                for seq_read, label_read in zip(seq_reads, label_reads):
                    entry = ','.join([';'.join(seq_read), ';'.join(label_read)])
                    t.write('{}\n'.format(entry))
        except Exception:
            # Best effort: log the failure and continue with the next file.
            logging.error(traceback.format_exc())

In [None]:
"""
Creates promoter dataset.
"""
hs_promoter_TATA = './data/promoter/deepromoter/hs_pos_TATA.txt'
mm_promoter_TATA = './data/promoter/deepromoter/mm_pos_TATA.txt'
hs_promoter_non_TATA = './data/promoter/deepromoter/hs_pos_nonTATA.txt'
mm_promoter_non_TATA = './data/promoter/deepromoter/mm_pos_nonTATA.txt'

# NOTE(review): the "negative" source is the non-TATA file, whose name also
# starts with hs_pos_ - confirm that this is the intended negative class.
positive_promoter = [hs_promoter_TATA]
negative_promoter = [hs_promoter_non_TATA]

# Create new dataset file.
pos_promoter_dataset_file = './dataset/promoter/pos_prom_dataset.csv'
neg_promoter_dataset_file = './dataset/promoter/neg_prom_dataset.csv'

header = ','.join(['sequence', 'label'])


def _write_promoter_dataset(source_paths, dataset_path, label, header):
    """Write every non-blank line of the source files as a ``line,label`` row.

    An existing ``dataset_path`` is overwritten. A source file that cannot be
    read is reported and skipped; the remaining sources are still processed.

    Args:
        source_paths: Paths of text files holding one sequence per line.
        dataset_path: Path of the CSV file to create.
        label: Value written in the second column of every row.
        header: Header line (without line ending) of the CSV file.
    """
    with open(dataset_path, 'w') as target:
        target.write('{}\n'.format(header))
        for source_path in source_paths:
            try:
                with open(source_path, 'r') as source:
                    for line in source:
                        line = line.strip()
                        if line:
                            target.write('{},{}\n'.format(line, label))
            except OSError as e:
                print('Error {}'.format(e))


# Generating promoter positive dataset.
_write_promoter_dataset(positive_promoter, pos_promoter_dataset_file, 1, header)

# Generating promoter negative dataset.
_write_promoter_dataset(negative_promoter, neg_promoter_dataset_file, 0, header)

In [None]:
"""
Rebalance promoter dataset by selecting smallest count between positive and negative dataset.
"""
import pandas as pd

pos_df = pd.read_csv(pos_promoter_dataset_file)
neg_df = pd.read_csv(neg_promoter_dataset_file)

pos_count = len(pos_df)
neg_count = len(neg_df)

print('positive count {}, negative count {}'.format(pos_count, neg_count))
# Both classes are down-sampled to the size of the smaller one.
count = min(pos_count, neg_count)
print('select count = {}'.format(count))

sample_prom_pos_df = pos_df.sample(n=count, random_state=1)
sample_prom_neg_df = neg_df.sample(n=count, random_state=1)

# DataFrame.size counts cells (rows x columns), not rows.
print('dataset size pos {} neg {}'.format(sample_prom_pos_df.size, sample_prom_neg_df.size))

# NOTE(review): these two files are written with the index column, unlike the
# merged file below - confirm whether that is intended.
sample_prom_pos_df.to_csv('./dataset/promoter/sample_pos_dataset.csv')
sample_prom_neg_df.to_csv('./dataset/promoter/sample_neg_dataset.csv')

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
sample_prom_df = pd.concat([sample_prom_pos_df, sample_prom_neg_df])
sample_prom_df.to_csv('./dataset/promoter/sample_prom_dataset.csv', index=False)

In [1]:
"""
Split promoter data to train, validation, and test set.
Set fraction 0.8, 0.1, and 0.1 for train, validation, and test set.
"""
# This cell only builds the paths and loads the two promoter CSV files.
prom_dataset_path = './dataset/promoter'
pos_prom_path = '{}/pos_prom_dataset.csv'.format(prom_dataset_path)
neg_prom_path = '{}/neg_prom_dataset.csv'.format(prom_dataset_path)

import pandas as pd

pos_prom_df = pd.read_csv(pos_prom_path)
neg_prom_df = pd.read_csv(neg_prom_path)

In [2]:
# Preview the first three rows of the positive promoter dataset.
pos_prom_df.head(3)

Unnamed: 0,sequence,label
0,CTCCACTTTTTCTCACGTTTATCTGAGCGAAAACAAGCACGGTTCG...,1
1,CCAGCAGATGGAAAACAGGACAATGTAACACTGTTCTTATCATCAC...,1
2,TCAGAGAAACTGGTCTCTTGATAATAGCCATAGATTACATACTGTG...,1


In [3]:
# Preview the first three rows of the negative promoter dataset.
neg_prom_df.head(3)

Unnamed: 0,sequence,label
0,AGCCACGTGAGTCGCTGGGCTATGGGTGGTGGTGGGGGTGAGGGAG...,0
1,TACTTCCGGTTTCCACGGAGCTCCGCCCCTTAGGGGGGTTCTCGCT...,0
2,GGAGCCTGGTAGGGAGGACAAATCTCTCGAAATCTCAGTTGGCGCT...,0


In [6]:
"""
Generate promoter data sample for model training, validation, and testing.
"""
# Draw 10 training rows per class, remove them, then draw 5 test rows per
# class from what is left, so the two samples never share a row.
sample_pos_prom_train_df = pos_prom_df.sample(n=10, random_state=1337)
sample_neg_prom_train_df = neg_prom_df.sample(n=10, random_state=1337)
sample_pos_prom_df = pos_prom_df.drop(sample_pos_prom_train_df.index)
sample_neg_prom_df = neg_prom_df.drop(sample_neg_prom_train_df.index)
sample_pos_prom_test_df = sample_pos_prom_df.sample(n=5, random_state=1337)
sample_neg_prom_test_df = sample_neg_prom_df.sample(n=5, random_state=1337)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
sample_prom_train_df = pd.concat([sample_pos_prom_train_df, sample_neg_prom_train_df])
sample_prom_test_df = pd.concat([sample_pos_prom_test_df, sample_neg_prom_test_df])

print('train {} test {}'.format(len(sample_prom_train_df), len(sample_prom_test_df)))

sample_prom_train_df.to_csv('./dataset/promoter/sample_promoter_train.csv', index=False)
sample_prom_test_df.to_csv('./dataset/promoter/sample_promoter_test.csv', index=False)

train 20 test 10


In [None]:
"""
Generate poly-A dataset from human data only.
Poly-A data is concluded from DeeReCT-PolyA and the DeeReCT-PolyA model uses 5-fold cross validation.
Three of them for training, one for validation, and one for testing.
For this section, first, second, and third fold are used for training; fourth for validation; and fifth for testing.

DeeReCT-PolyA uses multiple datasets: dragon human (Kalkatawi et. al., 2012) and Omni human (Magana-Mora et. al., 2017).
Omni dataset is chosen because it's relatively new (2017 vs 2012) and contains more data (Xia et. al., 2018).
"""
dragon_human_pos_dir = './data/poly-a/deerectpolya/human/dragon_polyA_data/positive5fold'
dragon_human_neg_dir = './data/poly-a/deerectpolya/human/dragon_polyA_data/negative5fold'
omni_human_pos_dir = './data/poly-a/deerectpolya/human/omni_polyA_data/positive'
omni_human_neg_dir = './data/poly-a/deerectpolya/human/omni_polyA_data/negative'

from os import listdir
from os.path import isfile, basename

pos_dir = omni_human_pos_dir
neg_dir = omni_human_neg_dir
# Regular files of each directory, sorted so the row order of the datasets
# (and therefore any seeded sampling done on them) is reproducible.
pos_files = ['{}/{}'.format(pos_dir, a) for a in sorted(listdir(pos_dir)) if isfile('{}/{}'.format(pos_dir, a))]
neg_files = ['{}/{}'.format(neg_dir, a) for a in sorted(listdir(neg_dir)) if isfile('{}/{}'.format(neg_dir, a))]

dataset_dir = './dataset/poly-a'
pos_dataset_path = '{}/pos_polya.csv'.format(dataset_dir)
neg_dataset_path = '{}/neg_polya.csv'.format(dataset_dir)

# (source files, target CSV, label written for every row)
files = [(pos_files, pos_dataset_path, '1'), (neg_files, neg_dataset_path, '0')]
for fs, dataset_path, label in files:
    # 'w' overwrites a dataset left by a previous run.
    with open(dataset_path, 'w') as t:
        t.write('{}\n'.format(','.join(['sequence', 'label'])))
        for fpath in fs:
            try:
                with open(fpath, 'r') as f:
                    for line in f:
                        line = line.strip()
                        # Blank lines would otherwise become rows without a sequence.
                        if line:
                            t.write('{},{}\n'.format(line, label))
            except OSError as e:
                # Best effort: report the unreadable file and go on with the next.
                print('Error {}'.format(e))

In [None]:
"""
Split positive and negative data into three parts for training, validation, and test set.
Process both data using pandas.
"""
import pandas as pd


def _split_train_val_test(frame):
    """Split ``frame`` into train, validation and test parts (0.8 / 0.1 / 0.1).

    80 % of the rows are sampled for training; half of the remaining rows
    are sampled for testing and the rest is kept for validation. Both draws
    use ``random_state=1``.

    Returns:
        Tuple ``(train, validation, test)`` of dataframes.
    """
    train = frame.sample(frac=0.8, replace=False, random_state=1)
    remainder = frame.drop(train.index)
    test = remainder.sample(frac=0.5, replace=False, random_state=1)
    validation = remainder.drop(test.index)
    return train, validation, test


pos_df = pd.read_csv(pos_dataset_path)
neg_df = pd.read_csv(neg_dataset_path)
pos_df_size = len(pos_df)
neg_df_size = len(neg_df)

if pos_df_size == neg_df_size:
    print('both are {}. data balance.'.format(pos_df_size))
else:
    # The smaller size is only reported; the data is not down-sampled here.
    count = min(pos_df_size, neg_df_size)
    print('data imbalance at pos = {} and neg = {}.\nSelect count = {}.'.format(pos_df_size, neg_df_size, count))

# Split positive data into three parts.
pos_train_df, pos_val_df, pos_test_df = _split_train_val_test(pos_df)

pos_train_df.to_csv('{}/pos_polya_train.csv'.format(dataset_dir), index=False)
pos_val_df.to_csv('{}/pos_polya_val.csv'.format(dataset_dir), index=False)
pos_test_df.to_csv('{}/pos_polya_test.csv'.format(dataset_dir), index=False)

print('pos train set {}, pos validation set {}, pos test set {}'.format(len(pos_train_df), len(pos_val_df), len(pos_test_df)))

# Split negative data into three parts.
neg_train_df, neg_val_df, neg_test_df = _split_train_val_test(neg_df)

neg_train_df.to_csv('{}/neg_polya_train.csv'.format(dataset_dir), index=False)
neg_val_df.to_csv('{}/neg_polya_val.csv'.format(dataset_dir), index=False)
neg_test_df.to_csv('{}/neg_polya_test.csv'.format(dataset_dir), index=False)

print('neg train set {}, neg validation set {}, neg test set {}'.format(len(neg_train_df), len(neg_val_df), len(neg_test_df)))

# Merge each of train, validation, and test set.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
train_df = pd.concat([pos_train_df, neg_train_df])
val_df = pd.concat([pos_val_df, neg_val_df])
test_df = pd.concat([pos_test_df, neg_test_df])

train_df.to_csv('{}/polya_train.csv'.format(dataset_dir), index=False)
val_df.to_csv('{}/polya_val.csv'.format(dataset_dir), index=False)
test_df.to_csv('{}/polya_test.csv'.format(dataset_dir), index=False)


In [10]:
"""
Create sample of poly-a training and test data.
"""
import pandas as pd

# Training sample: 10 positive and 10 negative rows.
pos_polya_train_df = pd.read_csv('./dataset/poly-a/pos_polya_train.csv')
neg_polya_train_df = pd.read_csv('./dataset/poly-a/neg_polya_train.csv')
sample_pos_polya_train_df = pos_polya_train_df.sample(n=10, random_state=1337)
sample_neg_polya_train_df = neg_polya_train_df.sample(n=10, random_state=1337)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
pd.concat([sample_pos_polya_train_df, sample_neg_polya_train_df]).to_csv('./dataset/poly-a/sample_polya_train.csv', index=False)

# Test sample: 5 positive and 5 negative rows.
pos_polya_test_df = pd.read_csv('./dataset/poly-a/pos_polya_test.csv')
neg_polya_test_df = pd.read_csv('./dataset/poly-a/neg_polya_test.csv')
sample_pos_polya_test_df = pos_polya_test_df.sample(n=5, random_state=1337)
sample_neg_polya_test_df = neg_polya_test_df.sample(n=5, random_state=1337)
pd.concat([sample_pos_polya_test_df, sample_neg_polya_test_df]).to_csv('./dataset/poly-a/sample_polya_test.csv', index=False)

In [None]:
"""
Create splice-site dataset.
"""
from os.path import basename

# No trailing slash: the formats below add the separator themselves, so a
# trailing slash here would put '//' into every source path.
ss_dir = './data/splice-sites/splice-deep'
pos_acc_ss_hs = '{}/positive_DNA_seqs_acceptor_hs.fa'.format(ss_dir)
pos_don_ss_hs = '{}/positive_DNA_seqs_donor_hs.fa'.format(ss_dir)
neg_acc_ss_hs = '{}/negative_DNA_seqs_acceptor_hs.fa'.format(ss_dir)
neg_don_ss_hs = '{}/negative_DNA_seqs_donor_hs.fa'.format(ss_dir)

# Target CSV files, one per source file.
ss_dataset_dir = './dataset/splice-sites'
pos_ss_acc_dataset = '{}/pos_ss_acc_hs.csv'.format(ss_dataset_dir)
pos_ss_don_dataset = '{}/pos_ss_don_hs.csv'.format(ss_dataset_dir)
neg_ss_acc_dataset = '{}/neg_ss_acc_hs.csv'.format(ss_dataset_dir)
neg_ss_don_dataset = '{}/neg_ss_don_hs.csv'.format(ss_dataset_dir)


In [None]:
# (source file, label, site kind, target CSV)
files = [(pos_acc_ss_hs, 1, 'acc', pos_ss_acc_dataset),
         (pos_don_ss_hs, 1, 'don', pos_ss_don_dataset),
         (neg_acc_ss_hs, 0, 'acc', neg_ss_acc_dataset),
         (neg_don_ss_hs, 0, 'don', neg_ss_don_dataset)]
for fname, label, acc_don, dataset_path in files:
    # Remove a dataset left by a previous run, even if the source turns out
    # to be unreadable, so no stale file is mistaken for fresh output.
    if os.path.exists(dataset_path):
        os.remove(dataset_path)
    try:
        with open(fname, 'r') as f, open(dataset_path, 'w') as t:
            t.write('{}\n'.format(','.join(['sequence', 'label'])))
            # NOTE(review): every non-blank line is written as a sequence; if
            # the .fa sources contain '>' header lines they end up in the
            # dataset too - verify the source format.
            for line in f:
                line = line.strip()
                if line:
                    t.write('{},{}\n'.format(line, label))
    except OSError as e:
        # Best effort: report the failure and continue with the next source.
        print('Error {}'.format(e))


In [None]:
"""
Create train, validation, and test set for splice site. To do that, the data need to be balance.
If not the sampling based on smallest count is required. Processing is done using pandas.
"""
import pandas as pd

# Load the four splice-site CSV files (positive/negative x acceptor/donor).
pos_ss_acc_df = pd.read_csv(pos_ss_acc_dataset)
pos_ss_don_df = pd.read_csv(pos_ss_don_dataset)
neg_ss_acc_df = pd.read_csv(neg_ss_acc_dataset)
neg_ss_don_df = pd.read_csv(neg_ss_don_dataset)

# Show which files were read, one path per line.
print('{}\n{}\n{}\n{}'.format(pos_ss_acc_dataset, pos_ss_don_dataset, neg_ss_acc_dataset, neg_ss_don_dataset))


In [None]:
# Preview the first ten rows of the positive acceptor dataset.
pos_ss_acc_df.head(10)

In [None]:
# Preview the first five rows of the positive donor dataset.
pos_ss_don_df.head(5)

In [None]:
# Preview the first five rows of the negative acceptor dataset.
neg_ss_acc_df.head(5)

In [None]:
# Preview the first five rows of the negative donor dataset.
neg_ss_don_df.head(5)

In [None]:
# Because loading the dataframe is time consuming, leave the loading at cell above and do later processing here.
pos_ss_acc_size = len(pos_ss_acc_df)
pos_ss_don_size = len(pos_ss_don_df)
neg_ss_acc_size = len(neg_ss_acc_df)
neg_ss_don_size = len(neg_ss_don_df)

count = 0
if pos_ss_acc_size == pos_ss_don_size == neg_ss_acc_size == neg_ss_don_size:
    print('dataset balance')
    count = pos_ss_acc_size
else:
    print('dataset imbalance')
    print('pos acc {}\npos don {}\nneg acc {}\nneg don {}'.format(pos_ss_acc_size, pos_ss_don_size, neg_ss_acc_size, neg_ss_don_size))
    count = min([pos_ss_acc_size, pos_ss_don_size, neg_ss_acc_size, neg_ss_don_size])
    print('count = {}'.format(count))

pos_ss_acc_df_sample = pos_ss_acc_df.sample(n=count, replace=False, random_state=1337)
pos_ss_don_df_sample = pos_ss_don_df.sample(n=count, replace=False, random_state=1337)
neg_ss_acc_df_sample = neg_ss_acc_df.sample(n=count, replace=False, random_state=1337)
neg_ss_don_df_sample = neg_ss_don_df.sample(n=count, replace=False, random_state=1337)


In [None]:
# Split each balanced dataset into train, validation and test parts at an
# 8:1:1 ratio, write the parts per dataset, then merge them per split.
dfs = [(pos_ss_acc_df_sample, 'pos_ss_acc'), (pos_ss_don_df_sample, 'pos_ss_don'), (neg_ss_acc_df_sample, 'neg_ss_acc'), (neg_ss_don_df_sample, 'neg_ss_don')]
train_parts = []
val_parts = []
test_parts = []
# Loop names are distinct from the notebook-level `df`, which earlier cells use.
for ss_sample_df, fname in dfs:
    ss_train_part = ss_sample_df.sample(frac=0.8, random_state=1337)
    ss_holdout = ss_sample_df.drop(ss_train_part.index)
    ss_test_part = ss_holdout.sample(frac=0.5, random_state=37)
    ss_val_part = ss_holdout.drop(ss_test_part.index)

    ss_train_part.to_csv('{}/{}_train.csv'.format(ss_dataset_dir, fname), index=False)
    ss_val_part.to_csv('{}/{}_val.csv'.format(ss_dataset_dir, fname), index=False)
    ss_test_part.to_csv('{}/{}_test.csv'.format(ss_dataset_dir, fname), index=False)

    train_parts.append(ss_train_part)
    val_parts.append(ss_val_part)
    test_parts.append(ss_test_part)

# One pd.concat per split replaces repeated DataFrame.append calls, which
# were removed in pandas 2.0.
training_df = pd.concat(train_parts)
validation_df = pd.concat(val_parts)
testing_df = pd.concat(test_parts)

training_df.to_csv('{}/ss_train.csv'.format(ss_dataset_dir), index=False)
validation_df.to_csv('{}/ss_val.csv'.format(ss_dataset_dir), index=False)
testing_df.to_csv('{}/ss_test.csv'.format(ss_dataset_dir), index=False)

In [36]:
"""
Create sample splice sites training and test data.
"""

import pandas as pd

ss_train_path = './dataset/splice-sites/ss_train.csv'
ss_test_path = './dataset/splice-sites/ss_test.csv'

ss_train_df = pd.read_csv(ss_train_path)
ss_test_df = pd.read_csv(ss_test_path)
# 10 rows per label value for training, 5 per label value for testing.
pos_sample_ss_train_df = ss_train_df[ss_train_df['label'] == 1].sample(n=10, random_state=1337)
neg_sample_ss_train_df = ss_train_df[ss_train_df['label'] == 0].sample(n=10, random_state=1337)

pos_sample_ss_test_df = ss_test_df[ss_test_df['label'] == 1].sample(n=5, random_state=1337)
neg_sample_ss_test_df = ss_test_df[ss_test_df['label'] == 0].sample(n=5, random_state=1337)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
sample_ss_train_df = pd.concat([pos_sample_ss_train_df, neg_sample_ss_train_df])
sample_ss_test_df = pd.concat([pos_sample_ss_test_df, neg_sample_ss_test_df])
sample_ss_train_df.to_csv('./dataset/splice-sites/sample_ss_train.csv', index=False)
sample_ss_test_df.to_csv('./dataset/splice-sites/sample_ss_test.csv', index=False)


In [35]:
"""
Generate sample training and testing data from promoter, splice-sites, and poly-a.
"""
import pandas as pd

cols = ['sequence', 'label_prom', 'label_ss', 'label_polya']
header = ','.join(cols)

prom_train_sample = './dataset/promoter/sample_promoter_train.csv'
prom_train_sample_df = pd.read_csv(prom_train_sample)
prom_test_sample = './dataset/promoter/sample_promoter_test.csv'
prom_test_sample_df = pd.read_csv(prom_test_sample)
ss_train_sample = './dataset/splice-sites/sample_ss_train.csv'
ss_train_sample_df = pd.read_csv(ss_train_sample)
ss_test_sample = './dataset/splice-sites/sample_ss_test.csv'
ss_test_sample_df = pd.read_csv(ss_test_sample)
polya_train_sample = './dataset/poly-a/sample_polya_train.csv'
polya_train_sample_df = pd.read_csv(polya_train_sample)
polya_test_sample = './dataset/poly-a/sample_polya_test.csv'
polya_test_sample_df = pd.read_csv(polya_test_sample)


def _as_multilabel(task_df, label_col):
    """Re-express a single-task sample in the combined three-label layout.

    Args:
        task_df: Dataframe with a ``sequence`` and a ``label`` column.
        label_col: Name of the combined-layout column that receives the
            values of ``label``; the other two label columns are set to 0.

    Returns:
        New dataframe with the columns listed in ``cols``, in that order, and
        a fresh 0-based index.
    """
    combined = pd.DataFrame({'sequence': task_df['sequence'].to_numpy()})
    for col in cols[1:]:
        combined[col] = task_df['label'].to_numpy() if col == label_col else 0
    return combined[cols]


# Rows are stacked in the order promoter, splice site, poly-A. Building each
# part as a whole frame and concatenating once replaces the row-by-row
# DataFrame.append calls, which were removed in pandas 2.0.
training_sample = pd.concat(
    [
        _as_multilabel(prom_train_sample_df, 'label_prom'),
        _as_multilabel(ss_train_sample_df, 'label_ss'),
        _as_multilabel(polya_train_sample_df, 'label_polya'),
    ],
    ignore_index=True,
)

testing_sample = pd.concat(
    [
        _as_multilabel(prom_test_sample_df, 'label_prom'),
        _as_multilabel(ss_test_sample_df, 'label_ss'),
        _as_multilabel(polya_test_sample_df, 'label_polya'),
    ],
    ignore_index=True,
)

training_sample.to_csv('./sample/training_sample.csv', index=False)
testing_sample.to_csv('./sample/testing_sample.csv', index=False)