In [1]:
import numpy as np
from tensorflow import keras
from sklearn import preprocessing

# Read the class labels of the three data splits from disk.
train_labels = np.load("train_labels.npy")
val_labels = np.load("val_labels.npy")
test_labels = np.load("test_labels.npy")

# The number of classes is defined by the distinct labels of the training split.
num_classes = np.unique(train_labels).size
print('Total classes: ',num_classes)

# Fit the label -> integer mapping on the training labels only, then apply
# that same mapping to every split.
le = preprocessing.LabelEncoder().fit(train_labels)
train_labels_num, val_labels_num, test_labels_num = (
    le.transform(split_labels)
    for split_labels in (train_labels, val_labels, test_labels)
)

# One-hot ("categorical") version of the integer labels, num_classes wide.
train_labels_bin, val_labels_bin, test_labels_bin = (
    keras.utils.to_categorical(encoded, num_classes)
    for encoded in (train_labels_num, val_labels_num, test_labels_num)
)



  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Total classes:  88


In [2]:
# Train a model on a training set without random class

import tensorflow.keras.backend as K
from utils.ExpConfiguration import *
from utils.modelUtils import *
import numpy as np
from sklearn.utils import shuffle
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import f1_score
# Seed NumPy's global random generator before any data or model code runs.
from numpy.random import seed
seed(1)
# Seed TensorFlow's random ops as well (a different seed value is used on purpose
# or by habit -- the two generators are independent).
from tensorflow import set_random_seed
set_random_seed(2)


# ---- Experiment configuration -------------------------------------------

# Padding scheme; it is part of the encoded-sequence file names and is also
# handed to the sequence encoder.
# CHANGE HERE to select other padding schemas (new, constant, random)
padd = 'new'

# Training hyper-parameters.
batch_size, epochs = 32, 10

nl = 3   # number of layers requested from the model builder
bn = 0   # noise tag used in file names and to pick the encoder parameter
mc = 50  # number of runs

# Results collected per encoder, keyed by the encoder's file name.
outdata = dict()

# ---- Random-sequence test set -------------------------------------------

testfile = 'test_rndseq_t1.fasta'
test_labels_rnd = get_labels(testfile)
seqTestRnd = get_seqs_with_bnoise(testfile,nperc=0)

# ---- Ground truth of the regular test set -------------------------------

# Map the one-hot test labels back to the original label values and count
# how many test samples each label has.
y_true = le.inverse_transform(np.argmax(test_labels_bin, axis=1))
labels,lcounts = np.unique(y_true,return_counts=True)

def _entropy_bits(probs):
    """Row-wise entropy, in bits, of an array of per-class probabilities.

    Parameters
    ----------
    probs : array whose axis 1 holds the class probabilities of one sample.

    Returns
    -------
    1-D array with one entropy value per row. A small epsilon inside the
    logarithm keeps zero probabilities from producing -inf/NaN.
    """
    return np.sum(-probs * np.log2(probs + 1e-10), 1)


# For every sequence encoder: train a CNN on the encoded training set, predict
# the regular test set, then run `mc` stochastic forward passes on both the
# regular test set ("nornd") and the random-sequence test set ("rnd") and
# derive per-sample uncertainty indicators from them.
for en in seqEncoders:
    print('n. layers=',nl,' Encoder=',en['filename'],' Noise=',str(bn), ' Padding=',padd)

    # All three splits share the same file-name suffix.
    seq_file_suffix = '_' + en['filename'] + '_' + padd + '_' + str(bn) + '_seq.npy'
    train_seq = np.load('train' + seq_file_suffix)
    val_seq = np.load('val' + seq_file_suffix)
    test_seq = np.load('test' + seq_file_suffix)

    # One-hot encode the symbol indices. Without an explicit depth,
    # to_categorical infers it from the largest value present in *each* array,
    # so a split that happens to miss the highest symbol would end up with
    # fewer channels than the model expects. The depth is therefore taken from
    # the training split (which defines the model input shape) and imposed on
    # every other array.
    train_seq = keras.utils.to_categorical(train_seq)
    n_symbols = train_seq.shape[-1]
    val_seq = keras.utils.to_categorical(val_seq, num_classes=n_symbols)
    test_seq = keras.utils.to_categorical(test_seq, num_classes=n_symbols)

    # Drop the graph/state left over from the previous encoder's model.
    tf.keras.backend.clear_session()

    # The k-mer encoders get a 1-D convolutional model, every other encoder a 2-D one.
    cnndim = 1 if en['filename'] in ('1mer','2mer','3mer') else 2
    m = buildCNNModel(inshape=train_seq.shape[1:],num_classes=num_classes,nlayers=nl,cnndim=cnndim)

    # summary() prints by itself and returns None; wrapping it in print()
    # only added a stray "None" line to the output.
    m.summary()

    m.compile(optimizer=keras.optimizers.Adam(lr=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

    m.fit(train_seq, train_labels_bin, verbose=1,
          batch_size=batch_size, shuffle=True,
          epochs=epochs,
          validation_data=(val_seq, val_labels_bin))

    # Deterministic prediction on the regular test set, mapped back to labels.
    pred = m.predict(test_seq, verbose=1)
    predicted = np.argmax(pred, axis=1)
    y_pred = le.inverse_transform(predicted)

    # Backend function that takes the learning-phase flag as an explicit input.
    # It is called below with flag 1 (training phase) so that repeated calls
    # give stochastic outputs.
    model_mc = K.function([m.input, K.learning_phase()], [m.output])

    # Encode the random test sequences with the same encoder and one-hot depth.
    test_seq_rnd = encode_seqs(seqTestRnd,enc=en['enc'],encparam=en['param'+str(bn)],padding=padd)
    test_seq_rnd = keras.utils.to_categorical(test_seq_rnd, num_classes=n_symbols)

    n_rnd = len(test_seq_rnd)
    n_nornd = len(test_seq)
    rows_rnd = np.arange(n_rnd)
    rows_nornd = np.arange(n_nornd)

    # Accumulators over the mc runs:
    #   avrp_*  : sum (later mean) of the class probabilities
    #   fp_*    : how often each class was the arg-max (later a frequency)
    #   avrhp_* : sum (later mean) of the per-run entropies
    #   p_*     : every run's probabilities, kept for the variance
    avrp_rnd = np.zeros((n_rnd,num_classes))
    avrp_nornd = np.zeros((n_nornd,num_classes))

    fp_rnd = np.zeros((n_rnd,num_classes))
    fp_nornd = np.zeros((n_nornd,num_classes))

    avrhp_rnd = np.zeros((n_rnd))
    avrhp_nornd = np.zeros((n_nornd))

    p_rnd = np.zeros((mc,n_rnd,num_classes))
    p_nornd = np.zeros((mc,n_nornd,num_classes))

    # The two sets are evaluated alternately inside each run, exactly as
    # before, so the order of stochastic passes is unchanged.
    for i in range(mc):
        preds_nornd = model_mc([test_seq,1])
        p_nornd[i,:,:] = preds_nornd[0]
        avrp_nornd += preds_nornd[0]
        avrhp_nornd += _entropy_bits(preds_nornd[0])
        midx = np.argmax(preds_nornd[0],1)
        # One vote per sample for its winning class. Each (row, class) pair is
        # unique within a run, so a fancy-indexed increment is safe and
        # replaces the per-sample Python loop.
        fp_nornd[rows_nornd, midx] += 1

        preds_rnd = model_mc([test_seq_rnd,1])
        p_rnd[i,:,:] = preds_rnd[0]
        avrp_rnd += preds_rnd[0]
        avrhp_rnd += _entropy_bits(preds_rnd[0])
        midx = np.argmax(preds_rnd[0],1)
        fp_rnd[rows_rnd, midx] += 1

    # Turn the sums into means / frequencies.
    avrp_rnd = avrp_rnd/mc
    avrp_nornd = avrp_nornd/mc
    fp_rnd = fp_rnd/mc
    fp_nornd = fp_nornd/mc
    avrhp_rnd = avrhp_rnd/mc
    avrhp_nornd = avrhp_nornd/mc

    # compute indicators entropy (hp) variance (var) max prob (maxp) and f max

    # Entropy of the mean prediction (as opposed to avrhp_*, the mean of the
    # per-run entropies).
    hp_nornd = _entropy_bits(avrp_nornd)
    hp_rnd = _entropy_bits(avrp_rnd)

    # Per-sample, per-class variance across the mc runs.
    var_rnd = np.var(p_rnd,0)
    var_nornd = np.var(p_nornd,0)

    # Class indices ordered by decreasing mean probability.
    orderp_rnd = np.argsort(-avrp_rnd,1)
    orderp_nornd = np.argsort(-avrp_nornd,1)

    # Highest mean probability and highest arg-max frequency per sample.
    maxp_nornd = np.max(avrp_nornd,1)
    maxp_rnd = np.max(avrp_rnd,1)
    maxfp_nornd = np.max(fp_nornd,1)
    maxfp_rnd = np.max(fp_rnd,1)

    indicators = {
        'hp_rnd' : hp_rnd,
        'hp_nornd' : hp_nornd,
        'avrhp_rnd' : avrhp_rnd,
        'avrhp_nornd' : avrhp_nornd,
        'orderp_rnd' : orderp_rnd,
        'orderp_nornd' : orderp_nornd,
        'maxp_rnd' : maxp_rnd,
        'maxp_nornd' : maxp_nornd,
        'maxfp_rnd' : maxfp_rnd,
        'maxfp_nornd' : maxfp_nornd,
        'avrp_rnd' : avrp_rnd,
        'avrp_nornd' : avrp_nornd,
        'var_rnd' : var_rnd,
        'var_nornd' : var_nornd,
        'y_pred' : y_pred,
    }

    # Store this encoder's results under its file name.
    outdata[en['filename']] = indicators


    



n. layers= 3  Encoder= 3mer  Noise= 0  Padding= new
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 67, 32)            6272      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 33, 32)            0         
_________________________________________________________________
dropout (Dropout)            (None, 33, 32)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 33, 64)            6208      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 16, 64)            0         
___________________________________

Epoch 10/10
 Done 25342 total records
n. layers= 3  Encoder= 1mer  Noise= 0  Padding= new
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 200, 32)           512       
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 100, 32)           0         
_________________________________________________________________
dropout (Dropout)            (None, 100, 32)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 100, 64)           6208      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 50, 64)            0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 50, 64)            0         
____________________________________________________

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
 Done 25342 total records
n. layers= 3  Encoder= Hilbert  Noise= 0  Padding= new
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 16, 16, 32)        1472      
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 8, 8, 32)          0         
_________________________________________________________________
dropout (Dropout)            (None, 8, 8, 32)          0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 8, 8, 64)          18496     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 4, 4, 64)          0         
_________________________________________________________________
dropout_1 (Dropo

In [3]:
import os
import pickle

# save results on file
# Make sure the output directory exists so a long experiment is not lost to a
# missing folder, and use a context manager so the file is closed even if
# pickling fails part-way.
os.makedirs('results', exist_ok=True)
with open('results/RejectionExperiments_' + padd + '.pckl', 'wb') as f:
    pickle.dump(outdata, f)