In [40]:
import numpy as np
import scipy as sc
import matplotlib.pyplot as plt
import pandas as pd
import keras
import tensorflow as tf
from keras.callbacks import Callback, ModelCheckpoint
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, LSTM, BatchNormalization
from keras.optimizers import SGD, Adam
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score
from sklearn.metrics import confusion_matrix
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical, Sequence
from sklearn.utils import resample,class_weight, shuffle
from keras.layers import Bidirectional
from IPython.display import clear_output
import matplotlib.patches as mpatches
from sklearn.metrics import roc_curve, auc, precision_recall_curve
from keras.wrappers.scikit_learn import KerasClassifier
from keras.constraints import max_norm
from scipy.sparse import issparse
import types
import copy

In [2]:
#Statistical analysis of the physical properties of the 20 naturally occurring amino acids
# Ten numeric factors per one-letter residue code. 'X' and '_' map to all-zero
# vectors. Built once at import time rather than on every call.
_AA_AS_10_FACTOR = {
    'A': [-1.56, -1.67, -0.97, -0.27, -0.93, -0.78, -0.20, -0.08, 0.21, -0.48],
    'R': [0.22, 1.27, 1.37, 1.87, -1.70, 0.46, 0.92, -0.39, 0.23, 0.93],
    'N': [1.14, -0.07, -0.12, 0.81, 0.18, 0.37, -0.09, 1.23, 1.10, -1.73],
    'D': [0.58, -0.22, -1.58, 0.81, -0.92, 0.15, -1.52, 0.47, 0.76, 0.70],
    'C': [0.12, -0.89, 0.45, -1.05, -0.71, 2.41, 1.52, -0.69, 1.13, 1.10],
    'Q': [-0.47, 0.24, 0.07, 1.10, 1.10, 0.59, 0.84, -0.71, -0.03, -2.33],
    'E': [-1.45, 0.19, -1.61, 1.17, -1.31, 0.40, 0.04, 0.38, -0.35, -0.12],
    'G': [1.46, -1.96, -0.23, -0.16, 0.10, -0.11, 1.32, 2.36, -1.66, 0.46],
    'H': [-0.41, 0.52, -0.28, 0.28, 1.61, 1.01, -1.85, 0.47, 1.13, 1.63],
    'I': [-0.73, -0.16, 1.79, -0.77, -0.54, 0.03, -0.83, 0.51, 0.66, -1.78],
    'L': [-1.04, 0.00, -0.24, -1.10, -0.55, -2.05, 0.96, -0.76, 0.45, 0.93],
    'K': [-0.34, 0.82, -0.23, 1.70, 1.54, -1.62, 1.15, -0.08, -0.48, 0.60],
    'M': [-1.40, 0.18, -0.42, -0.73, 2.00, 1.52, 0.26, 0.11, -1.27, 0.27],
    'F': [-0.21, 0.98, -0.36, -1.43, 0.22, -0.81, 0.67, 1.10, 1.71, -0.44],
    'P': [2.06, -0.33, -1.15, -0.75, 0.88, -0.45, 0.30, -2.30, 0.74, -0.28],
    'S': [0.81, -1.08, 0.16, 0.42, -0.21, -0.43, -1.89, -1.15, -0.97, -0.23],
    'T': [0.26, -0.70, 1.21, 0.63, -0.10, 0.21, 0.24, -1.15, -0.56, 0.19],
    'W': [0.30, 2.10, -0.72, -1.57, -1.16, 0.57, -0.48, -0.40, -2.30, -0.60],
    'Y': [1.38, 1.48, 0.80, -0.56, -0.00, -0.68, -0.31, 1.03, -0.05, 0.53],
    'V': [-0.74, -0.71, 2.04, -0.40, 0.50, -0.81, -1.07, 0.06, -0.46, 0.65],
    'X': [0.0] * 10,
    '_': [0.0] * 10,
}


def AminoAcidsEmb(string):
    """Embed a residue string as one 10-factor row per recognised character.

    Parameters
    ----------
    string : iterable of str
        One-letter residue codes. Characters that are not keys of the factor
        table are skipped, so the output can have fewer rows than the input
        has characters.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(number of recognised characters, 10)``;
        ``(0, 10)`` when nothing is recognised.
    """
    # Collect all rows in one pass instead of repeatedly concatenating lists.
    rows = [_AA_AS_10_FACTOR[residue]
            for residue in string
            if residue in _AA_AS_10_FACTOR]
    return np.array(rows, dtype=float).reshape(-1, 10)
#(size of protein, dimensions or 21)

def Strings2Embed(array):
    """Embed every sequence in ``array`` and stack the results into one array.

    Each entry ``array[i]`` is passed through ``AminoAcidsEmb``; the transposed
    embeddings are depth-stacked and the stack is transposed again, so the
    result is indexed as (sequence, residue position, factor).

    ``array`` must expose ``.shape[0]`` and integer indexing. All sequences
    must embed to the same number of rows, otherwise ``np.dstack`` raises.
    """
    transposed = [AminoAcidsEmb(array[idx]).T for idx in range(array.shape[0])]
    return np.dstack(transposed).T

# Default location of the substitution matrix (unchanged from the original code).
_BLOSUM62_PATH = "/Volumes/Maxtor/References/blosum62.txt"
# Only the 20 standard residues may be proposed as substitutes; ambiguity codes
# present in the matrix file (e.g. B, Z, X, *) are never introduced.
_STANDARD_RESIDUES = frozenset("ARNDCQEGHILKMFPSTWYV")
# Parsed matrices keyed by file path, so the file is read once per path.
_BLOSUM_CACHE = {}


def _load_blosum(matrix_path):
    """Parse a whitespace-separated substitution matrix, caching it per path."""
    cached = _BLOSUM_CACHE.get(matrix_path)
    if cached is not None:
        return cached
    with open(matrix_path) as matrix_file:
        # Blank lines and '#' comment lines are ignored.
        lines = [line for line in matrix_file.read().strip().split('\n')
                 if line.strip() and not line.lstrip().startswith('#')]
    if not lines:
        raise ValueError('Substitution matrix file is empty')
    columns = lines.pop(0).split()
    matrix = {}
    for row in lines:
        entries = row.split()
        row_name = entries.pop(0)
        if len(entries) != len(columns):
            raise ValueError('Improper entry number in row')
        matrix[row_name] = {col: int(val) for col, val in zip(columns, entries)}
    _BLOSUM_CACHE[matrix_path] = matrix
    return matrix


def BLOSUMSIM(string, matrix_path=_BLOSUM62_PATH):
    """Mutate one random position of ``string`` to a similar residue.

    A position is drawn uniformly over the whole sequence. The residue there
    is replaced by a uniformly chosen *different* standard residue whose
    substitution score against it is positive.

    Parameters
    ----------
    string : str
        Non-empty residue sequence.
    matrix_path : str, optional
        Path to a whitespace-separated substitution matrix whose first
        non-comment line is the column header. Defaults to the BLOSUM62 file
        used by the original implementation.

    Returns
    -------
    (str, float or int)
        The (possibly) mutated sequence and a weight
        ``exp(score) / sum(exp(positive scores in the row))``, where the sum
        covers the original residue and every positive-scoring standard
        residue. When no substitution is possible the sequence is returned
        unchanged with weight ``1``. This happens if the residue is missing
        from the matrix or has no positive-scoring standard alternative.

    Raises
    ------
    IndexError
        If ``string`` is empty.
    ValueError
        If the matrix file is empty or a row has the wrong number of entries.
    """
    residues = list(string)
    if not residues:
        raise IndexError('BLOSUMSIM requires a non-empty sequence')

    # randint's upper bound is exclusive, so use len() to reach the last index.
    position = np.random.randint(0, len(residues))
    original = residues[position]

    row = _load_blosum(matrix_path).get(original)
    if row is None:
        # Residue (e.g. a padding character) is not in the matrix: leave as is.
        return "".join(residues), 1

    # Positive-scoring residues, keyed by residue so that equal scores do not
    # overwrite each other; the original residue stays in for normalisation.
    pool = {res: score for res, score in row.items()
            if score > 0 and (res == original or res in _STANDARD_RESIDUES)}
    candidates = [res for res in pool if res != original]
    if not candidates:
        return "".join(residues), 1

    chosen = candidates[np.random.randint(0, len(candidates))]
    weights = np.exp(np.array(list(pool.values()), dtype=float))
    prob = float(np.exp(pool[chosen]) / weights.sum())

    residues[position] = chosen
    return "".join(residues), prob

In [10]:
class BalancedSequence(Sequence):
    """Batch generator that rebalances a binary-labelled set of sequences.

    Only the leading ``1 - val_split`` fraction of ``X`` and ``y`` is kept.
    Each epoch draws a fresh shuffled index set of ``n`` positives (label 1)
    plus enough negatives (label 0) to reach the target positive fraction,
    where ``n`` is the size of the smaller class. The target fraction is
    ``fracPos`` unless the data's own positive fraction is at least as large,
    in which case that one is used.

    With ``isaug`` set, every batch additionally receives ``naug`` peptides
    produced by ``BLOSUMSIM`` from randomly chosen positives. Such a peptide
    is labelled 1 when the weight returned by ``BLOSUMSIM`` is at least
    ``p_aug`` and 0 otherwise, and the batch is then reshuffled.
    """

    def __init__(self, X, y, batch_size, fracPos=0.5,val_split=0.1,isaug=False,naug=3,p_aug=0.5):
        keep = 1 - val_split
        self.X = X[:int(keep * X.shape[0])]
        self.y = y[:int(keep * y.shape[0])]
        self.batch_size = batch_size
        self.isaug = isaug
        self.naug = naug
        self.p_aug = p_aug

        self.pos_indices = np.where(self.y == 1)[0]
        self.neg_indices = np.where(self.y == 0)[0]
        self.X_Pos = self.X[self.pos_indices]

        n_pos, n_neg = len(self.pos_indices), len(self.neg_indices)
        self.n = min(n_pos, n_neg)
        # Never aim below the positive fraction the data already has.
        natural_frac = n_pos / (n_pos + n_neg)
        self.fracPos = fracPos if fracPos > natural_frac else natural_frac
        self._index_array = None

    def __len__(self):
        # Whole batches only; a trailing partial batch is dropped.
        samples_per_epoch = self.n * (1 / self.fracPos)
        return int(samples_per_epoch // self.batch_size)

    def on_epoch_end(self):
        # Forget the current draw so the next epoch resamples its indices.
        self._index_array = None

    def _draw_epoch_indices(self):
        """Return a shuffled index array with the epoch's positives and negatives."""
        pos = self.pos_indices.copy()
        neg = self.neg_indices.copy()
        np.random.shuffle(pos)
        np.random.shuffle(neg)
        neg_quota = int(np.floor(self.n * ((1 / self.fracPos) - 1)))
        epoch_indices = np.concatenate((pos[:self.n], neg[:neg_quota]))
        np.random.shuffle(epoch_indices)
        return epoch_indices

    def __getitem__(self, batch_idx):
        if self._index_array is None:
            self._index_array = self._draw_epoch_indices()

        start = batch_idx * self.batch_size
        batch = self._index_array[start:start + self.batch_size]
        Xf = self.X[batch]
        Yf = self.y[batch]

        if self.isaug:
            for _ in range(self.naug):
                seed_peptide = self.X_Pos[np.random.randint(0, self.X_Pos.shape[0])]
                new_pep, pp = BLOSUMSIM(seed_peptide)
                Xf = np.append(Xf, new_pep)
                Yf = np.append(Yf, 1 if pp >= self.p_aug else 0)
            # Mix the augmented peptides in with the sampled ones.
            order = np.arange(Xf.shape[0])
            np.random.shuffle(order)
            Xf = Xf[order]
            Yf = Yf[order]

        return Strings2Embed(Xf), Yf

class ValidationSet(Sequence):
    """Serve a whole set of sequences as a single embedded batch.

    ``__getitem__`` ignores its index and always returns the embedding of the
    full ``X``. The sequence therefore reports a length of one batch, not the
    number of samples, so default iteration does not return the set
    ``X.shape[0]`` times.

    Attributes
    ----------
    X : array of str
        The raw sequences.
    valToTake : int
        Number of samples in ``X``.
    """

    def __init__(self,X):
        self.X = X
        self.valToTake = int(X.shape[0])

    def __len__(self):
        # Number of batches, not samples: the only batch is the full set.
        return 1 if self.valToTake > 0 else 0

    def __getitem__(self,batch_idx):
        # Every index maps to the same batch: the embedding of all of X.
        return Strings2Embed(self.X)

    def on_epoch_end(self):
        # Nothing to reshuffle or reset between epochs.
        pass

class CollectOutputAndTarget(Callback):
    """Callback that records per-batch targets, predictions and validation inputs.

    After each batch the current values of ``var_y_true`` and ``var_y_pred``
    are evaluated and appended to ``targets`` and ``outputs``. The first
    element of ``validation_data`` is appended to ``inputs``.

    NOTE(review): nothing in this class assigns to the three variables, so
    they are presumably wired to the model's tensors by the caller. Confirm.
    ``var_x`` is created but not read by this class.
    """

    def __init__(self):
        super(CollectOutputAndTarget, self).__init__()
        # One entry per finished batch: y_true, y_pred and validation inputs.
        self.targets, self.outputs, self.inputs = [], [], []

        # Batch shapes can differ (e.g. a shorter last batch), hence
        # `validate_shape=False` on each placeholder variable.
        self.var_y_true = tf.Variable(0., validate_shape=False)
        self.var_y_pred = tf.Variable(0., validate_shape=False)
        self.var_x = tf.Variable(0., validate_shape=False)

    def on_batch_end(self, batch, logs=None):
        # Snapshot the current variable values.
        y_true_batch = K.eval(self.var_y_true)
        self.targets.append(y_true_batch)
        y_pred_batch = K.eval(self.var_y_pred)
        self.outputs.append(y_pred_batch)

        seen_inputs = self.inputs
        if len(seen_inputs) > 1:
            # Debug output: does the validation input equal the last stored one?
            print(np.array_equal(seen_inputs[-1], self.validation_data[0]))
        seen_inputs.append(self.validation_data[0])
        
class AccLossPlotter(Callback):
    """Plot training Accuracy and Loss values on a Matplotlib graph.

    The graph is updated by the 'on_epoch_end' event of the Keras Callback class.

    # Arguments
        graphs: iterable with some or all of ('acc', 'loss'); a single string
            is treated as a one-element list
        save_graph: Save graph as an image on Keras Callback 'on_train_end' event
    """

    def __init__(self, graphs=('acc', 'loss'), save_graph=True):
        # Initialise the base callback so its standard attributes exist.
        super(AccLossPlotter, self).__init__()
        if isinstance(graphs, str):
            graphs = [graphs]
        # Private copy: neither the default nor the caller's list is shared.
        self.graphs = list(graphs)
        self.num_subplots = len(self.graphs)
        self.save_graph = save_graph

    def on_train_begin(self, logs=None):
        self.acc = []
        self.val_acc = []
        self.loss = []
        self.val_loss = []
        self.epoch_count = 0
        plt.ion()
        plt.show()

    @staticmethod
    def _metric(logs, short_key, long_key):
        """Return logs[short_key], falling back to logs[long_key] when absent."""
        value = logs.get(short_key)
        return logs.get(long_key) if value is None else value

    def _draw_panel(self, position, title, ylabel, epochs, train_values, val_values):
        """Draw one train/validation curve pair into subplot number `position`."""
        plt.subplot(self.num_subplots, 1, position)
        plt.title(title)
        plt.plot(epochs, val_values, color='r')
        plt.plot(epochs, train_values, color='b')
        plt.ylabel(ylabel)

        red_patch = mpatches.Patch(color='red', label='Test')
        blue_patch = mpatches.Patch(color='blue', label='Train')
        plt.legend(handles=[red_patch, blue_patch], loc=4)

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        self.epoch_count += 1
        # Accept both the short ('acc') and long ('accuracy') metric names.
        self.val_acc.append(self._metric(logs, 'val_acc', 'val_accuracy'))
        self.acc.append(self._metric(logs, 'acc', 'accuracy'))
        self.loss.append(logs.get('loss'))
        self.val_loss.append(logs.get('val_loss'))
        epochs = list(range(self.epoch_count))

        panels = (
            ('acc', 'Accuracy', 'accuracy', self.acc, self.val_acc),
            ('loss', 'Loss', 'loss', self.loss, self.val_loss),
        )
        position = 0
        for key, title, ylabel, train_values, val_values in panels:
            if key not in self.graphs:
                continue
            position += 1
            self._draw_panel(position, title, ylabel, epochs, train_values, val_values)

        plt.draw()
        plt.pause(0.001)

    def on_train_end(self, logs=None):
        if self.save_graph:
            plt.savefig('training_acc_loss.png')
# Module-level plotter callback built with the default arguments.
plot_losses = AccLossPlotter()

def create_network(neurons=10):
    """Build and compile the bidirectional-LSTM binary classifier.

    The network takes inputs of shape (9, 10) and consists of a bidirectional
    LSTM with ``neurons`` units, two ReLU dense layers (30 and 64 units) each
    followed by 50% dropout, and a single sigmoid output unit. It is compiled
    with binary cross-entropy, Adam (lr=0.01) and the accuracy metric.

    Returns the compiled ``Sequential`` model.
    """
    model = Sequential()
    stack = [
        Bidirectional(LSTM(neurons), input_shape=(9, 10)),
        Dense(30, activation='relu'),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ]
    for layer in stack:
        model.add(layer)

    optimizer = Adam(lr=0.01, epsilon=None, decay=0.0, amsgrad=False)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [58]:
class KerasClassifier(KerasClassifier):
    """scikit-learn wrapper that trains on raw peptide strings.

    Subclasses (and shadows) the imported ``KerasClassifier``. ``fit`` feeds
    the model from a ``BalancedSequence`` built on the raw strings, and
    ``predict_proba`` embeds the strings with ``Strings2Embed`` before
    predicting.
    """

    def fit(self, x, y, **kwargs):
        """Build the model and train it on class-balanced batches.

        Parameters
        ----------
        x : array of str
            Raw peptide sequences.
        y : array
            Labels. They are one-hot encoded only when the model's loss is
            ``categorical_crossentropy`` and ``y`` is not already 2-D.
        **kwargs
            Extra training arguments; they override the wrapper's stored
            ``sk_params``. A ``batch_size`` given here or in ``sk_params``
            sets the generator's batch size (32 if neither provides one).

        Returns
        -------
        The history object returned by ``fit_generator``.
        """
        if self.build_fn is None:
            self.model = self.__call__(**self.filter_sk_params(self.__call__))
        elif (not isinstance(self.build_fn, types.FunctionType) and
              not isinstance(self.build_fn, types.MethodType)):
            self.model = self.build_fn(
                **self.filter_sk_params(self.build_fn.__call__))
        else:
            self.model = self.build_fn(**self.filter_sk_params(self.build_fn))

        # Record the label set. This override never calls the parent fit, so
        # nothing else would set these attributes for scikit-learn tooling.
        if len(y.shape) == 2 and y.shape[1] > 1:
            self.classes_ = np.arange(y.shape[1])
        else:
            self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)

        loss_name = self.model.loss
        if hasattr(loss_name, '__name__'):
            loss_name = loss_name.__name__
        if loss_name == 'categorical_crossentropy' and len(y.shape) != 2:
            y = to_categorical(y)

        fit_args = copy.deepcopy(self.filter_sk_params(Sequential.fit))
        fit_args.update(kwargs)
        # The generator does the batching, so batch_size must not be passed
        # on to fit_generator. pop() tolerates a missing entry.
        batch_size = fit_args.pop('batch_size', None)
        if batch_size is None:
            batch_size = self.sk_params.get('batch_size', 32)

        train_batches = BalancedSequence(x, y, batch_size, fracPos=0.4, val_split=0,
                                         isaug=False, naug=3, p_aug=0.5)
        # No explicit step count: one epoch is one pass over the sequence
        # (len(train_batches) batches). The former `samples_per_epoch`
        # argument was read as a number of batches and inflated every epoch.
        self.__history = self.model.fit_generator(train_batches, **fit_args)
        return self.__history

    def predict_proba(self, x,**kwargs):
        """Return class probabilities for raw peptide strings.

        The strings are embedded with ``Strings2Embed`` before prediction.
        For a single-output model the result has two columns: probability of
        class 0, then probability of class 1.
        """
        x = Strings2Embed(x)
        kwargs = self.filter_sk_params(Sequential.predict_proba, kwargs)
        probs = self.model.predict(x, **kwargs)
        # check if binary classification
        if probs.shape[1] == 1:
            # first column is probability of class 0 and second is of class 1
            probs = np.hstack([1 - probs, probs])
        return probs

In [59]:
# Load the labelled peptides.
data=pd.read_csv("/Volumes/Maxtor/firstrain.csv",delimiter=",")

# Keep the first 9 residues of every peptide (the network input is 9 x 10).
# The vectorised slice builds a new array, so `data` is not modified in place.
init = data['peptides'].str[0:9].values
x=np.asarray(init)
y=np.asarray(data['NB'])

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)
# The held-out peptides are embedded right away; the training peptides stay
# as strings because the classifier embeds them itself.
x_test=Strings2Embed(x_test)

neural_network = KerasClassifier(build_fn=create_network,
                                 epochs=100,
                                 batch_size=10,
                                 verbose=0)

# Grid-search the LSTM width, scored by ROC AUC.
neurons = [25, 28, 30, 64, 70]
param_grid = dict(neurons=neurons)
grid=GridSearchCV(estimator=neural_network, param_grid=param_grid, n_jobs=1,scoring='roc_auc')
grid_result = grid.fit(x_train, y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))



KeyboardInterrupt: 

In [47]:
# Sanity check: shape of the training split returned by train_test_split.
x_train.shape

(899,)