In [1]:
import numpy as np
import scipy as sc
import matplotlib.pyplot as plt
import pandas as pd
import keras
import tensorflow as tf
from keras.callbacks import Callback, ModelCheckpoint
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, LSTM,Bidirectional
from keras.optimizers import SGD, Adam
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score
from sklearn.metrics import confusion_matrix,mean_squared_error
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical,plot_model
from sklearn.utils import resample
from IPython.display import clear_output
import matplotlib.patches as mpatches
from sklearn.preprocessing import MinMaxScaler
from keras.wrappers.scikit_learn import KerasClassifier
np.random.seed(0)

Using TensorFlow backend.


In [2]:
# Load the labelled peptide table and encode it for the network.
# (Alternative input used previously: /Volumes/Maxtor/fakeproteins/fakeresmerge2.csv)
data = pd.read_csv("/Volumes/Maxtor/firstrain.csv", delimiter=",")
init = data['peptides'].values

# Character-level tokenizer: each distinct character is mapped to an integer id.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(init)

# `xo` keeps the integer-encoded sequences; `x` is their one-hot expansion.
xo = np.asarray(tokenizer.texts_to_sequences(init))
x = to_categorical(xo, num_classes=22)

# Binary label column 'NB', one-hot encoded into two columns.
y = to_categorical(np.asarray(data['NB']), num_classes=2)

# 80/20 split with a fixed seed so the partition is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

In [3]:
class CollectOutputAndTarget(Callback):
    """Callback that records, batch by batch, targets, predictions and inputs.

    Targets and predictions are read from the TensorFlow variables
    ``var_y_true`` / ``var_y_pred``; something outside this class must write
    into them (e.g. ``tf.assign`` ops run alongside the training step).
    Inputs are taken from ``self.validation_data[0]``.
    """

    def __init__(self):
        super().__init__()
        # One entry is appended to each list on every call to on_batch_end.
        self.targets, self.outputs, self.inputs = [], [], []

        # Scalar placeholders whose shape is allowed to change later:
        # validate_shape=False lets them take any batch shape, including a
        # smaller final batch.
        self.var_y_true = tf.Variable(0., validate_shape=False)
        self.var_y_pred = tf.Variable(0., validate_shape=False)
        self.var_x = tf.Variable(0., validate_shape=False)

    def on_batch_end(self, batch, logs=None):
        """Snapshot the current variable values and the validation inputs."""
        self.targets.append(K.eval(self.var_y_true))
        self.outputs.append(K.eval(self.var_y_pred))

        current_inputs = self.validation_data[0]
        # Debug aid: once at least two inputs are stored, report whether the
        # most recently stored one equals the current validation inputs.
        if len(self.inputs) > 1:
            print(np.array_equal(self.inputs[-1], current_inputs))
        self.inputs.append(current_inputs)
        
class AccLossPlotter(Callback):
    """Plot training/validation accuracy and loss on a Matplotlib figure.

    The figure is refreshed on the Keras ``on_epoch_end`` event.

    # Arguments
        graphs: iterable with some or all of ('acc', 'loss'); ``None``
            (the default) selects both.
        save_graph: when true, save the figure as 'training_acc_loss.png'
            on the Keras ``on_train_end`` event.
    """

    def __init__(self, graphs=None, save_graph=True):
        # Let the Keras base class set up its own attributes.
        super(AccLossPlotter, self).__init__()
        # A None sentinel instead of a list literal avoids sharing one mutable
        # default object between instances.
        self.graphs = ['acc', 'loss'] if graphs is None else list(graphs)
        self.num_subplots = len(self.graphs)
        self.save_graph = save_graph

    def on_train_begin(self, logs=None):
        """Reset the recorded history and switch Matplotlib to interactive mode."""
        self.acc = []
        self.val_acc = []
        self.loss = []
        self.val_loss = []
        self.epoch_count = 0
        plt.ion()
        plt.show()

    @staticmethod
    def _lookup(logs, *keys):
        """Return the first non-None value found in `logs` under `keys`, else None."""
        for key in keys:
            value = logs.get(key)
            if value is not None:
                return value
        return None

    def _draw_panel(self, position, title, ylabel, epochs, val_series, train_series):
        """Redraw one subplot with the validation (red) and training (blue) curves."""
        axes = plt.subplot(self.num_subplots, 1, position)
        # The full history is re-plotted each epoch, so clear the previous
        # epoch's lines instead of stacking new artists on top of them.
        axes.cla()
        plt.title(title)
        plt.ylabel(ylabel)

        handles = []
        for series, line_color, patch_color, label in (
                (val_series, 'r', 'red', 'Test'),
                (train_series, 'b', 'blue', 'Train')):
            # A series containing None means the metric was not reported
            # (e.g. training without validation data); skip it.
            if any(value is None for value in series):
                continue
            plt.plot(epochs, series, color=line_color)
            handles.append(mpatches.Patch(color=patch_color, label=label))

        if handles:
            plt.legend(handles=handles, loc=4)

    def on_epoch_end(self, epoch, logs=None):
        """Record this epoch's metrics and refresh the figure."""
        logs = logs or {}
        self.epoch_count += 1
        # Older Keras reports 'acc'/'val_acc'; newer releases report
        # 'accuracy'/'val_accuracy'. Accept either spelling.
        self.val_acc.append(self._lookup(logs, 'val_acc', 'val_accuracy'))
        self.acc.append(self._lookup(logs, 'acc', 'accuracy'))
        self.loss.append(logs.get('loss'))
        self.val_loss.append(logs.get('val_loss'))
        epochs = list(range(self.epoch_count))

        position = 0
        if 'acc' in self.graphs:
            position += 1
            self._draw_panel(position, 'Accuracy', 'accuracy',
                             epochs, self.val_acc, self.acc)
        if 'loss' in self.graphs:
            position += 1
            self._draw_panel(position, 'Loss', 'loss',
                             epochs, self.val_loss, self.loss)

        plt.draw()
        plt.pause(0.001)

    def on_train_end(self, logs=None):
        """Save the final figure to disk when `save_graph` is set."""
        if self.save_graph:
            plt.savefig('training_acc_loss.png')
plot_losses = AccLossPlotter()

In [26]:
def create_network(neurons=1, input_shape=(None, 22)):
    """Build and compile the bidirectional-LSTM peptide classifier.

    # Arguments
        neurons: width of the first fully connected layer. This is the
            hyper-parameter varied by the grid search over `neurons`.
        input_shape: (timesteps, features) of one sample. The default accepts
            any sequence length with 22 features, matching inputs one-hot
            encoded with ``num_classes=22``.

    # Returns
        A compiled Keras `Sequential` model with a 2-way softmax output.
    """
    model = Sequential()
    # The input shape belongs on the Bidirectional wrapper (the model's first
    # layer), not on the wrapped LSTM.
    model.add(Bidirectional(LSTM(10), input_shape=input_shape))
    # Use `neurons` here so that different grid-search candidates really
    # produce different networks.
    model.add(Dense(neurons, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))
    adam = Adam(lr=0.01, epsilon=None, decay=0.0, amsgrad=False)
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])
    return model
# Wrap the Keras model builder so scikit-learn can treat it as an estimator.
neural_network = KerasClassifier(
    build_fn=create_network,
    epochs=100,
    batch_size=10,
    verbose=0,
)

# Candidate values for create_network's `neurons` argument.
neurons = [25, 28, 30, 64, 70]
param_grid = {'neurons': neurons}

# Cross-validated search over the grid, scored by ROC AUC, using all cores.
grid = GridSearchCV(
    estimator=neural_network,
    param_grid=param_grid,
    n_jobs=-1,
    scoring='roc_auc',
)
grid_result = grid.fit(x_train, y_train)
#filepath="weights.best.hdf5"
#checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
#callbacks_list = [checkpoint,plot_losses]
#history=model.fit(x_train, y_train, batch_size=16, epochs=10,validation_split=0.1,callbacks=callbacks_list)
#score = model.evaluate(x_test, y_test, batch_size=16)
#y_pred=model.predict(x_test,batch_size=16)

  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until


In [92]:
# Per-epoch training loss recorded by the last `fit` call.
loss_per_epoch = history.history['loss']
hist = np.array(loss_per_epoch)
plt.plot(hist)
# Last expression of the cell: display the character -> index mapping.
tokenizer.word_index

{'m': 1,
 'n': 2,
 'd': 3,
 '_': 4,
 'l': 5,
 'i': 6,
 'e': 7,
 's': 8,
 'p': 9,
 'r': 10,
 'a': 11,
 'y': 12,
 'g': 13,
 'v': 14,
 'k': 15,
 'f': 16,
 'q': 17,
 't': 18,
 'h': 19,
 'w': 20,
 'c': 21}

In [27]:
# `score` is the [loss, accuracy] pair returned by model.evaluate.
print(score)
# Render the layer graph, including tensor shapes, to model.png.
plot_model(model, show_shapes=True, to_file='model.png')

[0.383246162864897, 0.84]


In [16]:
# Confusion matrix of the model on the held-out test split.
# argmax turns softmax outputs / one-hot labels back into class ids.
y_pred = np.argmax(model.predict(x_test, batch_size=16), axis=1)
y_testp = np.argmax(y_test, axis=1)
# labels=[0, 1] forces a 2x2 matrix even when one class is absent from both
# vectors; without it the 4-way unpacking below raises ValueError.
tn, fp, fn, tp = confusion_matrix(y_testp, y_pred, labels=[0, 1]).ravel()
print(tn, fp, fn, tp)

169 13 13 30


In [23]:
# Evaluate the trained model on the whole CSV.
# NOTE: this is the same file the training split was drawn from, so the
# figures below include training rows and are optimistic.
data2 = pd.read_csv("/Volumes/Maxtor/firstrain.csv", delimiter=",")
# Use the frame that was just loaded (the original read `data2` but then
# indexed the stale global `data`).
init = data2['peptides'].values
# Reuse the tokenizer fitted for training: the model's one-hot layout depends
# on that exact character -> index mapping, so it must not be re-fitted here.
x = np.asarray(tokenizer.texts_to_sequences(init))
xo = x
y = np.asarray(data2['NB'])
x = to_categorical(x, num_classes=22)
y = to_categorical(y, num_classes=2)
y_pred = np.argmax(model.predict(x, batch_size=16), axis=1)
y_testp = np.argmax(y, axis=1)
# labels=[0, 1] guarantees a 2x2 matrix so the unpacking cannot fail when a
# class is missing.
tn, fp, fn, tp = confusion_matrix(y_testp, y_pred, labels=[0, 1]).ravel()
print(tn, fp, fn, tp)
# Overall accuracy.
print((tp+tn)/(fp+fn+tp+tn))

1527 9 1 1535
0.9967447916666666


In [28]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.831571 using {'neurons': 30}


In [None]:
# Bidirectional-LSTM classifier over one-hot peptides (9 timesteps x 22 features).
model = Sequential()
#model.add(Embedding(20, output_dim=128,input_length=9))
# input_shape is declared on the Bidirectional wrapper, which is the model's
# first layer, rather than on the wrapped LSTM.
model.add(Bidirectional(LSTM(100), input_shape=(9, 22)))
# No input_dim here: this is not the first layer, and its input width is
# inferred from the previous layer (2 x 100 LSTM units, not 128).
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))
adam = Adam(lr=0.01, epsilon=None, decay=0.0, amsgrad=False)
BATCH_SIZE = 40
def datGen(features=None, labels=None, batch_size=None):
    """Yield ``(inputs, targets)`` batches forever, cycling over the data.

    Only full batches are produced; trailing samples that do not fill a batch
    are skipped on every pass, and each pass restarts from the first sample.

    # Arguments
        features: array of samples (first axis = sample). Defaults to the
            notebook-level `x_train`.
        labels: array of targets aligned with `features`. Defaults to the
            notebook-level `y_train`.
        batch_size: number of samples per batch. Defaults to the
            notebook-level `BATCH_SIZE`.

    # Yields
        Tuples of two freshly allocated float64 arrays, so a consumer may keep
        a batch while the generator advances.

    # Raises
        ValueError: if `batch_size` is not positive or there are fewer samples
            than one batch (the generator could otherwise never yield).
    """
    # Fall back to the notebook globals so the zero-argument call keeps working.
    features = x_train if features is None else features
    labels = y_train if labels is None else labels
    batch_size = BATCH_SIZE if batch_size is None else batch_size

    if batch_size <= 0:
        raise ValueError("batch_size must be positive, got %r" % (batch_size,))
    full_batches = features.shape[0] // batch_size
    if full_batches == 0:
        raise ValueError(
            "need at least %d samples to fill one batch, got %d"
            % (batch_size, features.shape[0]))

    while True:
        for index in range(full_batches):
            start = index * batch_size
            stop = start + batch_size
            # np.array(...) copies: each yielded batch owns its memory instead
            # of aliasing a buffer that is overwritten by the next batch.
            batchX = np.array(features[start:stop], dtype=float)
            batchY = np.array(labels[start:stop], dtype=float)
            yield (batchX, batchY)
# Take the first batch produced by datGen; it is used below both as the
# training data and as the validation data.
valdata=next(datGen())
model.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=['accuracy'])
cbk = CollectOutputAndTarget()
# Ops that copy the model's target and output tensors into the callback's
# variables. They are built after compile() because they read model.targets.
fetches = [tf.assign(cbk.var_y_true, model.targets[0], validate_shape=False),
           tf.assign(cbk.var_y_pred, model.outputs[0], validate_shape=False)]
# NOTE(review): `_function_kwargs` is a private Keras attribute; presumably it
# forwards `fetches` to the backend training function so the assigns run on
# every step -- confirm against the installed Keras version.
model._function_kwargs = {'fetches': fetches}
#history=model.fit_generator(datGen(), steps_per_epoch=1,
          #epochs=100,callbacks=[cbk],validation_data=valdata)
# NOTE(review): training and validation use the same single batch, so the
# validation metrics here do not measure generalisation.
history=model.fit(valdata[0],valdata[1], batch_size=BATCH_SIZE, epochs=100,validation_data=valdata,callbacks=[cbk])
# Loss and accuracy on the held-out test split.
score = model.evaluate(x_test, y_test, batch_size=BATCH_SIZE)

In [2]:
# Build a de-duplicated train/test split of the peptide table and write both
# parts to CSV.
data = pd.read_csv("/Volumes/Maxtor/firstrain.csv", delimiter=",")
# Keep one row per distinct peptide and renumber the index.
data = data.drop_duplicates(subset='peptides').reset_index(drop=True)
init = data['peptides'].values
# (The former loop that assigned every element of `init` to itself had no
# effect and has been removed.)
x = np.asarray(init)
y = np.asarray(data['NB'])
# 80/20 split with a fixed seed so the exported files are reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
trains = pd.DataFrame(data={'peptides': x_train, 'NB': y_train})
test = pd.DataFrame(data={'peptides': x_test, 'NB': y_test})
trains.to_csv("/Volumes/Maxtor/firsttraintrain5.csv", index=False)
test.to_csv("/Volumes/Maxtor/testproteins5.csv", index=False)

In [15]:
# Length, in characters, of the first peptide string.
first_peptide = init[0]
print(len(first_peptide))

12
