In [97]:
# Data : https://github.com/ditekunov/mhc-peptides-dataset


import numpy as np
import pandas as pd

In [129]:
# Read the train and test CSV files (in that order) from the working directory.
# df1 holds the training rows, df2 the test rows.
df1, df2 = (pd.read_csv(csv_name) for csv_name in ('mhc_train.csv', 'mhc_test.csv'))

In [104]:
import numpy as np
import re
def string_to_array(my_string):
    """Turn a peptide string into a numpy array of single lower-case characters.

    The string is lower-cased first; every character that is not one of the
    21 letters in 'arndcqeghilkmfpstwyvx' is then replaced by the placeholder
    'z'.

    Parameters
    ----------
    my_string : str
        Sequence to convert.

    Returns
    -------
    numpy.ndarray
        One array element per character of the cleaned string.
    """
    cleaned = re.sub('[^arndcqeghilkmfpstwyvx]', 'z', my_string.lower())
    return np.array(list(cleaned))

# Label encoder over the alphabet produced by string_to_array: the 21 letters
# it keeps plus the placeholder 'z' used for every other character.
# LabelEncoder stores its classes sorted, so the integer codes follow
# alphabetical order (a=0, c=1, ..., z=21), not the order the letters are
# listed in below.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder().fit(np.array(list('arndcqeghilkmfpstwyvxz')))

# Ordinal value for each integer code produced by `label_encoder`.
# LabelEncoder sorts its classes, so the codes follow the alphabetical order
# of the fitted letters; the letter next to each value reflects that order.
_ORDINAL_VALUES = np.array([
    1.25,   # 0  a
    2.50,   # 1  c
    3.75,   # 2  d
    4.30,   # 3  e
    5.45,   # 4  f
    6.35,   # 5  g
    7.85,   # 6  h
    8.65,   # 7  i
    9.95,   # 8  k
    11.25,  # 9  l
    12.55,  # 10 m
    13.15,  # 11 n
    14.11,  # 12 p
    15.29,  # 13 q
    16.39,  # 14 r
    17.05,  # 15 s
    18.09,  # 16 t
    19.49,  # 17 v
    20.79,  # 18 w
    21.19,  # 19 x
    22.55,  # 20 y
    0.0,    # 21 z -- placeholder for "anything else"
])


def ordinal_encoder(my_array):
    """Map an array of single-letter residues to their ordinal float values.

    The letters are first converted to integer codes with the module-level
    `label_encoder`, then each code is looked up in `_ORDINAL_VALUES`.

    The previous implementation overwrote the codes one value at a time and
    tried to zero the placeholder with a final ``== 10`` mask. By then code 10
    had already been replaced, and 'z' is code 21, so the placeholder ended up
    as 23.95 instead of 0.0. Values for all other letters are unchanged.

    Parameters
    ----------
    my_array : array-like of str
        Lower-case letters as produced by `string_to_array`.

    Returns
    -------
    numpy.ndarray
        Float array with one value per input letter; 'z' maps to 0.0.

    Raises
    ------
    Whatever `label_encoder.transform` raises for letters it was not fitted on.
    """
    integer_encoded = label_encoder.transform(my_array)
    # Cast defensively: the codes are used as indices, and an empty result is
    # not guaranteed to be integer-typed.
    codes = np.asarray(integer_encoded, dtype=np.intp)
    return _ORDINAL_VALUES[codes]

In [118]:
def _encode_peptides(sequences, n_rows, peptide_len=9):
    """Encode the first `n_rows` sequences into an array of shape (n_rows, peptide_len, 1).

    Each sequence goes through string_to_array and ordinal_encoder; the
    reshape fails if the encoded data does not hold exactly
    n_rows * peptide_len values.
    """
    encoded = [ordinal_encoder(string_to_array(seq)) for seq in sequences[0:n_rows]]
    return np.array(encoded).reshape(n_rows, peptide_len, 1)


# First 72852 training sequences and first 20425 test sequences, 9 residues each,
# with a trailing channel axis of size 1.
X_train = _encode_peptides(df1['sequence'], 72852)
X_test = _encode_peptides(df2['sequence'], 20425)

In [119]:
import math
# keras is only imported in a later cell; import it here so this cell does not
# raise NameError when the notebook is run top to bottom on a fresh kernel.
import keras

# Integer class labels for the same leading rows that are encoded into
# X_train (72852 rows) and X_test (20425 rows).
y_train = [int(label) for label in df1['pep_class'][0:72852]]
# Use the test-set row count here; the original sliced with the train size.
y_test = [int(label) for label in df2['pep_class'][0:20425]]

# One-hot encode the labels for use with categorical cross-entropy.
y_train2 = keras.utils.to_categorical(y_train)
y_test2 = keras.utils.to_categorical(y_test)

In [120]:
import tensorflow
import keras

In [121]:
import keras

NUM_CLASSES = 2

# import necessary building blocks
from keras.models import Sequential
from keras.layers import Conv1D,  Flatten, Dense, Activation, Dropout,BatchNormalization,LSTM, MaxPool1D
from keras.layers.advanced_activations import LeakyReLU

In [122]:
def _add_conv_block(model, filters, pool, **conv_kwargs):
    """Append Conv1D(kernel 3, 'same') -> [MaxPool1D] -> LeakyReLU(0.1) -> BatchNormalization."""
    model.add(Conv1D(padding="same", kernel_size=3, filters=filters, **conv_kwargs))
    if pool:
        model.add(MaxPool1D())
    model.add(LeakyReLU(0.1))
    model.add(BatchNormalization())


def make_model(input_shape=None, num_classes=None):
    """
    Build the (uncompiled) 1D-convolutional classifier.

    Architecture: four convolution blocks (16, 32, 32, 64 filters) with max
    pooling after the first, third and fourth, dropout between stages, then a
    256-unit dense layer and a softmax output layer.

    Parameters
    ----------
    input_shape : tuple, optional
        Shape of one sample. Defaults to ``X_train[0].shape``, as before.
    num_classes : int, optional
        Number of output units. Defaults to the module-level ``NUM_CLASSES``
        (previously hard-coded as 2).

    Returns `Sequential` model.
    """
    if input_shape is None:
        input_shape = X_train[0].shape
    if num_classes is None:
        num_classes = NUM_CLASSES

    model = Sequential()

    _add_conv_block(model, 16, pool=True, input_shape=input_shape)
    _add_conv_block(model, 32, pool=False)

    model.add(Dropout(0.25))
    _add_conv_block(model, 32, pool=True)
    _add_conv_block(model, 64, pool=True)

    model.add(Dropout(0.25))
    model.add(Flatten())

    model.add(Dense(256))
    model.add(LeakyReLU(0.1))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes))
    # NOTE(review): this LeakyReLU sits between the output logits and the
    # softmax. It is kept to preserve the existing architecture, but it scales
    # negative logits by 0.1 -- confirm this is intended.
    model.add(LeakyReLU(0.1))

    model.add(Activation("softmax"))

    return model


In [123]:
# Build a model instance and print its layer summary; the training cell
# further down calls make_model() again to start from a fresh model.
model = make_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_46 (Conv1D)           (None, 9, 16)             64        
_________________________________________________________________
max_pooling1d_34 (MaxPooling (None, 4, 16)             0         
_________________________________________________________________
leaky_re_lu_67 (LeakyReLU)   (None, 4, 16)             0         
_________________________________________________________________
batch_normalization_45 (Batc (None, 4, 16)             64        
_________________________________________________________________
conv1d_47 (Conv1D)           (None, 4, 32)             1568      
_________________________________________________________________
leaky_re_lu_68 (LeakyReLU)   (None, 4, 32)             0         
_________________________________________________________________
batch_normalization_46 (Batc (None, 4, 32)             128       
__________

In [124]:
from keras import backend as K
INIT_LR = 5e-3  # initial learning rate
BATCH_SIZE = 32
EPOCHS = 100


# don't call K.set_learning_phase() !!! (otherwise will enable dropout in train/test simultaneously)
model = make_model()  # fresh, untrained model

# prepare model for fitting (loss, optimizer, etc)
model.compile(
    loss='categorical_crossentropy',  # one-hot targets, 2-way classification
    # Use the Adamax class directly instead of the lowercase alias.
    optimizer=keras.optimizers.Adamax(lr=INIT_LR),
    metrics=['accuracy']  # report accuracy during training
)


def lr_scheduler(epoch):
    """Return the learning rate for `epoch`: INIT_LR decayed by a factor 0.9 per epoch."""
    return INIT_LR * 0.9 ** epoch


class LrHistory(keras.callbacks.Callback):
    """Callback that prints the optimizer's current learning rate at the start of each epoch."""

    def on_epoch_begin(self, epoch, logs=None):
        # Read the rate from the model this callback is attached to rather
        # than from the global `model`, so the callback is reusable.
        print("Learning rate:", K.get_value(self.model.optimizer.lr))


# fit model; the scheduler is listed before LrHistory so the printed value is
# the rate actually used for the epoch
history = model.fit(
    X_train, y_train2,  # prepared data
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[keras.callbacks.LearningRateScheduler(lr_scheduler), LrHistory()],
    validation_data=(X_test, y_test2),
    shuffle=True,
    verbose=0
)

Learning rate: 0.005
Learning rate: 0.0045
Learning rate: 0.00405
Learning rate: 0.003645
Learning rate: 0.0032805
Learning rate: 0.00295245
Learning rate: 0.002657205
Learning rate: 0.0023914846
Learning rate: 0.002152336
Learning rate: 0.0019371024
Learning rate: 0.0017433922
Learning rate: 0.0015690529
Learning rate: 0.0014121477
Learning rate: 0.001270933
Learning rate: 0.0011438397
Learning rate: 0.0010294557
Learning rate: 0.0009265101
Learning rate: 0.0008338591
Learning rate: 0.0007504732
Learning rate: 0.00067542586
Learning rate: 0.00060788327
Learning rate: 0.00054709497
Learning rate: 0.0004923855
Learning rate: 0.0004431469
Learning rate: 0.00039883223
Learning rate: 0.000358949
Learning rate: 0.0003230541
Learning rate: 0.0002907487
Learning rate: 0.00026167382
Learning rate: 0.00023550644
Learning rate: 0.00021195579
Learning rate: 0.00019076021
Learning rate: 0.0001716842
Learning rate: 0.00015451577
Learning rate: 0.0001390642
Learning rate: 0.00012515778
Learning rate

KeyboardInterrupt: 

In [125]:
# Predicted class index per test sample: argmax over the model's softmax outputs.
# predict() is used instead of Sequential.predict_proba, a deprecated alias
# that is no longer available in later Keras/TensorFlow releases.
test_predictions = model.predict(X_test).argmax(axis=-1)




In [126]:
# Recover integer class labels from the one-hot test targets.
test_answers = np.argmax(y_test2, axis=-1)


In [127]:
# Fraction of test samples whose predicted class equals the true class.
test_accuracy = (test_predictions == test_answers).mean()


In [128]:
# Report the test accuracy as a percentage.
print(str(test_accuracy * 100), "%", sep="")


71.10403916768666%
