# Dataset Preparation

## Generate Rfam novel sequences with boundary noise

In [1]:
from seqEncoders import *
from ExpConfiguration import *
#%run seqEncoders

# Run datasets/Rfam-novel/dataset-preparation.R first to generate the FASTA files

# Extract the class labels of each split and store them with np.save; the
# label arrays are reused below when the noisy FASTA files are written.
ltrain = get_labels(fastaTrain)
lval = get_labels(fastaVal)
ltest = get_labels(fastaTest)
np.save('train_labels', ltrain)
np.save('val_labels', lval)
np.save('test_labels', ltest)


# For every boundary-noise level in `bnoise` (a percentage of the sequence
# length; the run recorded below used 0, 25, 50, 75 and 100), build the three
# splits at that level and write one FASTA file per split.
for bn in bnoise:
    print("Noise = ", bn)
    seqTrain = get_seqs_with_bnoise(fastaTrain, nperc=bn)
    seqVal = get_seqs_with_bnoise(fastaVal, nperc=bn)
    seqTest = get_seqs_with_bnoise(fastaTest, nperc=bn)
    write_seqs('train_seq_bn' + str(bn) + '.fasta', seqTrain, ltrain)
    write_seqs('val_seq_bn' + str(bn) + '.fasta', seqVal, lval)
    write_seqs('test_seq_bn' + str(bn) + '.fasta', seqTest, ltest)
        
        
        

Noise =  0
Noise =  25
Noise =  50
Noise =  75
Noise =  100


## Generate encoded Rfam novel sequences and apply the corresponding padding scheme

In [None]:
from seqEncoders import *
from ExpConfiguration import *
#%run seqEncoders

# Encode every (padding scheme, noise level, encoder) combination and save one
# array per split and combination.
for p in padds:
    for bn in bnoise:
        # Read back the FASTA files written for this noise level. nperc=0 is
        # passed here, presumably so that no further noise is added on top of
        # what the files already contain -- confirm in get_seqs_with_bnoise.
        seqTrain = get_seqs_with_bnoise('train_seq_bn' + str(bn) + '.fasta', nperc=0)
        seqVal = get_seqs_with_bnoise('val_seq_bn' + str(bn) + '.fasta', nperc=0)
        seqTest = get_seqs_with_bnoise('test_seq_bn' + str(bn) + '.fasta', nperc=0)
        for en in seqEncoders:
            print("Padding = ", p, ", Noise = ", bn, ", Encoder = ", en['filename'])

            # Each encoder carries one parameter set per noise level, stored
            # under the key 'param<bn>'.
            train_seq = encode_seqs(
                seqTrain, enc=en['enc'], encparam=en['param' + str(bn)], padding=p
            )
            np.save('train_' + en['filename'] + '_' + p + "_" + str(bn) + '_seq', train_seq)

            val_seq = encode_seqs(
                seqVal, enc=en['enc'], encparam=en['param' + str(bn)], padding=p
            )
            np.save('val_' + en['filename'] + '_' + p + "_" + str(bn) + '_seq', val_seq)

            test_seq = encode_seqs(
                seqTest, enc=en['enc'], encparam=en['param' + str(bn)], padding=p
            )
            np.save('test_' + en['filename'] + '_' + p + "_" + str(bn) + '_seq', test_seq)



## Generate encoded RNAGCN/nRC sequences and apply the corresponding padding scheme

In [None]:
# Generate sequences from RNAGCN nRC datasets
from ExpConfiguration import *
from seqEncoders import *
# Redefine the encoders so their parameters fit the nRC sequence lengths.
# Each entry holds the encoder function ('enc'), the tag used in output file
# names ('filename') and a single parameter set ('param0'); the nRC code below
# only ever looks up 'param0'.
seqEncoders = (
    # 2-D encoders: param0 = [[rows, cols], alphabet]
    # (presumably the output grid size -- confirm against the seq2* functions).
    {
        'enc': seq2Snake,
        'filename': 'Snake',
        'param0': [[28, 28], ['A', 'T', 'C', 'G']],
    },
    {
        'enc': seq2Hilbert,
        'filename': 'Hilbert',
        'param0': [[32, 32], ['A', 'T', 'C', 'G']],
    },
    {
        'enc': seq2Morton,
        'filename': 'Morton',
        'param0': [[32, 32], ['A', 'T', 'C', 'G']],
    },
    # k-mer encoders: param0 = [[length], k, alphabet]. The lengths 770, 385
    # and 257 equal 770 / k rounded up (presumably the padded length counted
    # in k-mers -- confirm against seq2Kmer).
    {
        'enc': seq2Kmer,
        'filename': '1mer',
        'param0': [[770], 1, ['A', 'T', 'C', 'G']],
    },
    {
        'enc': seq2Kmer,
        'filename': '2mer',
        'param0': [[385], 2, ['A', 'T', 'C', 'G']],
    },
    {
        'enc': seq2Kmer,
        'filename': '3mer',
        'param0': [[257], 3, ['A', 'T', 'C', 'G']],
    },
)

# Point the shared path variables at the public nRC train/test FASTA files
# (this dataset has no validation split here).
fastaTrain = 'datasets/nRC-public/dataset_nRC_train.fasta'
fastaTest = 'datasets/nRC-public/dataset_nRC_test.fasta'

# Extract and store the class labels of both splits.
ltrain = get_labels(fastaTrain)
ltest = get_labels(fastaTest)
np.save('dataset_nRC_train_labels', ltrain)
np.save('dataset_nRC_test_labels', ltest)

# Load the sequences with nperc=0 and write them back out together with
# their labels.
seqTrain = get_seqs_with_bnoise(fastaTrain, nperc=0)
seqTest = get_seqs_with_bnoise(fastaTest, nperc=0)
write_seqs('dataset_nRC_train_seq.fasta', seqTrain, ltrain)
write_seqs('dataset_nRC_test_seq.fasta', seqTest, ltest)
        

# Encode both nRC splits with every padding scheme in `padds` (new, constant
# and random) and every encoder in `seqEncoders`, saving one array per
# (split, encoder, padding) combination.
for p in padds:
    for en in seqEncoders:
        # One progress line per combination.
        print("Padding = ", p, ", Encoder = ", en['filename'])
        # The nRC encoders define a single parameter set, 'param0'.
        train_seq = encode_seqs(seqTrain, enc=en['enc'], encparam=en['param0'], padding=p)
        test_seq = encode_seqs(seqTest, enc=en['enc'], encparam=en['param0'], padding=p)
        np.save('dataset_nRC_train_' + en['filename'] + '_' + p + '_seq', train_seq)
        np.save('dataset_nRC_test_' + en['filename'] + '_' + p + '_seq', test_seq)


