In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [3]:
# Integer code for every supported sequence symbol.
# The 20 amino-acid letters are numbered 1..20 in the order of the string below;
# the placeholder symbols 'X', '-', '*' and '?' all share code 0.
dic_aa2int = {residue : code
              for code, residue in enumerate('ARNDCQEGHILKMFPSTWYV', start = 1)}
dic_aa2int.update(dict.fromkeys('X-*?', 0))

def aa2int(seq : str) -> list:
    """Translate a sequence into its list of integer codes.

    Each symbol of ``seq`` is looked up in ``dic_aa2int``; the codes are
    returned in sequence order.

    Raises:
        KeyError: if ``seq`` contains a symbol that is not a key of
            ``dic_aa2int``.
    """
    codes = []
    for symbol in seq:
        codes.append(dic_aa2int[symbol])
    return codes

def aa2onehot(list_of_sequences, chain_type = None, dtype = np.float64):
    """One-hot encode amino-acid sequences into a flat 2-D matrix.

    Every sequence is laid out on a fixed grid of ``seq_len`` positions by 20
    amino-acid channels and then flattened. A symbol with integer code ``k``
    (1..20, as returned by ``aa2int``) sets channel ``k - 1`` at its position;
    symbols with code 0 leave their position all-zero. Positions past the end
    of a sequence stay zero, so shorter sequences are right-padded with zeros.

    Args:
        list_of_sequences: sized iterable of sequence strings.
        chain_type: ``'heavyChain'`` (150 positions) or ``'lightChain'``
            (130 positions).
        dtype: NumPy dtype of the returned matrix. Defaults to ``np.float64``
            (the dtype ``np.zeros`` uses when none is given); a narrower dtype
            such as ``np.float32`` reduces memory for large datasets.

    Returns:
        Array of shape ``(len(list_of_sequences), seq_len * 20)``, or ``None``
        (after printing a message) when ``chain_type`` is not recognised.

    Raises:
        IndexError: if a sequence is longer than ``seq_len``.
        KeyError: propagated from ``aa2int`` for unknown symbols.
    """
    if chain_type == 'heavyChain':
        seq_len = 150
    elif chain_type == 'lightChain':
        seq_len = 130
    else:
        # Kept as print-and-return-None so existing callers see the same behaviour.
        print('Problem with chain type...')
        return

    n_amino = 20
    n_sequences = len(list_of_sequences)
    onehot_data = np.zeros((n_sequences, seq_len, n_amino), dtype = dtype)
    for index, seq in enumerate(list_of_sequences):
        codes = np.asarray(aa2int(seq), dtype = np.intp)
        if codes.size > seq_len:
            raise IndexError('sequence %d has %d residues but chain type %r allows at most %d'
                             % (index, codes.size, chain_type, seq_len))
        # Write straight into the preallocated tensor: only positions with a
        # non-zero code get a 1, at channel (code - 1).
        positions = np.flatnonzero(codes)
        onehot_data[index, positions, codes[positions] - 1] = 1

    return onehot_data.reshape(n_sequences, seq_len * n_amino)

In [4]:
# Read the dataset and discard the saved index column ('Unnamed: 0'),
# then preview the first two rows.
df = pd.read_csv('HighLow_dataset.csv').drop(columns = ['Unnamed: 0'])
df.head(2)

Unnamed: 0,sequence_alignment_aa_heavy,sequence_alignment_aa_light,target
0,QIQLVQSGPELKKPGETVKISCKASGYTFTTYGMSWVKQAPGKGLK...,DVLMTQTPLSLPVSLGDQASISCRSSQSIVHSNGNTYLEWYLQKPG...,1
1,QVQLQQSGAELARPGASVKLSCKASGYTFTSYGISWVKQRTGQGLE...,DIVMTQSHKFMSTSVGDRVSITCKASQDVGTAVAWYQQKPGQSPKL...,1


In [5]:
# Positional columns 0, 1, 2 of df; the preview output names them
# sequence_alignment_aa_heavy, sequence_alignment_aa_light and target.
heavy_chain, light_chain, labels = (df.iloc[:, position] for position in range(3))

In [6]:
# Build "fake" heavy chains by overwriting the first 10 residues of every real
# heavy chain with the placeholder symbol 'X'; all fakes are labelled 0.
sim = 'X'*10
fake_heavy_chain = [sim + sequence[10:] for sequence in heavy_chain]
fake_labels = np.zeros(len(labels))

In [7]:
# One-hot encode the real heavy chains, the light chains and the fake heavy chains.
# The matrices only hold 0/1, so they are stored as float32 (the dtype the model
# trains in) rather than NumPy's float64 default. This halves the memory held by
# these arrays and by every concatenated / split copy made from them downstream.
onehot_heavy = aa2onehot(heavy_chain, chain_type = 'heavyChain').astype(np.float32, copy = False)
onehot_light = aa2onehot(light_chain, chain_type = 'lightChain').astype(np.float32, copy = False)
onehot_heavy_fake = aa2onehot(fake_heavy_chain, chain_type = 'heavyChain').astype(np.float32, copy = False)

In [8]:
# Stack real examples on top of fake ones. The fake heavy chains are paired
# with the same light chains as the real ones, so the light matrix is simply
# repeated; labels follow the same real-then-fake order.
onehot_heavy_total = np.concatenate([onehot_heavy, onehot_heavy_fake], axis = 0)
onehot_light_total = np.concatenate([onehot_light, onehot_light], axis = 0)
Y_data = np.concatenate([labels, fake_labels], axis = 0)

print(onehot_heavy_total.shape, onehot_light_total.shape, Y_data.shape, sep = '\n')

(243676, 3000)
(243676, 2600)
(243676, 2)


In [9]:
# Split heavy-chain features, light-chain features and labels with one call so
# the three arrays stay row-aligned; 20% is held out, with a fixed shuffle seed.
train_heavy, test_heavy, train_light, test_light, Y_train, Y_test = train_test_split(
    onehot_heavy_total,
    onehot_light_total,
    Y_data,
    test_size = 0.2,
    shuffle = True,
    random_state = 11,
)

# One-hot encode the two-class labels for the softmax output layer.
Y_hot_train, Y_hot_test = (to_categorical(y, num_classes = 2) for y in (Y_train, Y_test))

print(train_heavy.shape, test_heavy.shape,
      train_light.shape, test_light.shape,
      Y_hot_train.shape, Y_hot_test.shape,
      sep = '\n')

(194940, 3000)
(48736, 3000)
(194940, 2600)
(48736, 2600)
(194940,)
(48736,)


In [13]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Concatenate

def aminoAcid_model(heavy_chain, light_chain, num_classes, label_smoothing = 0.05):
    """Build and compile a two-branch dense classifier for paired chains.

    Each chain encoding goes through its own 256-unit ReLU layer; the two
    branch outputs are concatenated, passed through another 256-unit ReLU
    layer and classified by a softmax layer.

    Args:
        heavy_chain: input shape (without the batch axis) of the heavy-chain
            features, e.g. ``train_heavy.shape[1:]``.
        light_chain: input shape (without the batch axis) of the light-chain
            features.
        num_classes: number of softmax output units; targets are expected to
            be one-hot encoded with this many columns.
        label_smoothing: label-smoothing factor passed to the loss.

    Returns:
        A compiled ``tf.keras`` ``Model`` taking ``[heavy, light]`` inputs.
    """
    X_input1 = tf.keras.Input(shape = heavy_chain, name = 'Heavy_chain_data')
    dense1 = Dense(256, activation = 'relu')(X_input1)

    X_input2 = tf.keras.Input(shape = light_chain, name = 'Light_chain_data')
    dense2 = Dense(256, activation = 'relu')(X_input2)

    # Merge the per-chain hidden representations (not the raw inputs), so both
    # branch layers are part of the graph that reaches the output and get trained.
    merge = Concatenate()([dense1, dense2])
    dense3 = Dense(256, activation = 'relu')(merge)
    X_output = Dense(num_classes, activation = 'softmax', name = 'Softmax_layer')(dense3)

    model = Model(inputs = [X_input1, X_input2], outputs = X_output, name = 'CNN_aminoAcid_model')
    # Softmax over one-hot targets pairs with categorical cross-entropy. For
    # num_classes == 2 this yields the same loss value as binary cross-entropy
    # (including label smoothing) and stays correct for more classes.
    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-3),
                  loss = tf.keras.losses.CategoricalCrossentropy(label_smoothing = label_smoothing),
                  metrics = ['accuracy'])
    return model

# Build the model from the per-sample feature shapes, then train for 50 epochs
# with mini-batches of 16, holding out 20% of the training rows for validation.
model = aminoAcid_model(heavy_chain = train_heavy.shape[1:],
                        light_chain = train_light.shape[1:],
                        num_classes = 2)
training = model.fit([train_heavy, train_light], Y_hot_train,
                     batch_size = 16,
                     epochs = 50,
                     validation_split = 0.2,
                     shuffle = True,
                     verbose = True)

Epoch 1/50

MemoryError: Unable to allocate 387. MiB for an array with shape (38988, 2600) and data type float32

In [None]:
# Score the trained model on the held-out test split.
testing = model.evaluate([test_heavy, test_light], Y_hot_test, verbose = 1)