In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [2]:
# Lookup table from one-letter amino-acid symbols to integer codes.
# The 20 residues listed in the string below receive codes 1..20 in that order;
# the placeholder symbols 'X', '-', '*' and '?' all share code 0.
dic_aa2int = {
    residue: code
    for code, residue in enumerate('ARNDCQEGHILKMFPSTWYV', start = 1)
}
dic_aa2int.update(dict.fromkeys('X-*?', 0))

def aa2int(seq : str) -> list:
    """Translate a sequence of amino-acid symbols into integer codes.

    Each character of ``seq`` is looked up in ``dic_aa2int``; the codes are
    returned as a list in the same order as the input characters.

    Raises
    ------
    KeyError
        If ``seq`` contains a character that is not a key of ``dic_aa2int``.
    """
    codes = []
    for residue in seq:
        codes.append(dic_aa2int[residue])
    return codes

def aa2onehot(list_of_sequences, chain_type = None):
    """One-hot encode amino-acid sequences into a fixed-length array.

    Parameters
    ----------
    list_of_sequences : sized iterable of str
        Sequences to encode; each one is converted to integer codes with
        ``aa2int``.
    chain_type : str
        ``'heavyChain'`` (150 positions per sequence) or ``'lightChain'``
        (130 positions per sequence).

    Returns
    -------
    numpy.ndarray or None
        Array of shape ``(len(list_of_sequences), seq_len, 20)``. A residue
        with code ``k`` (1..20) sets channel ``k - 1`` to 1 at its position;
        code 0 and positions past the end of a sequence stay all-zero.
        When ``chain_type`` is not one of the two supported values a message
        is printed and ``None`` is returned.

    Raises
    ------
    IndexError
        If a sequence has more residues than ``seq_len`` positions.
    """
    # Resolve the fixed sequence length for the requested chain type.
    seq_len = None
    for known_type, known_length in (('heavyChain', 150), ('lightChain', 130)):
        if chain_type == known_type:
            seq_len = known_length
            break
    if seq_len is None:
        print('Problem with chain type...')
        return

    n_amino = 20
    onehot_data = np.zeros((len(list_of_sequences), seq_len, n_amino))
    for index, seq in enumerate(list_of_sequences):
        encoded = onehot_data[index]
        for position, code in enumerate(aa2int(seq)):
            # Fetch the row first so an over-long sequence fails here even
            # when the residue itself has code 0.
            position_row = encoded[position]
            if code != 0:
                position_row[code - 1] = 1
    return onehot_data

In [3]:
# Load the dataset and discard the 'Unnamed: 0' column that read_csv produces
# for the file's first column, then preview the first two rows.
df = pd.read_csv('HighLow_dataset.csv').drop(columns = 'Unnamed: 0')
df.head(2)

Unnamed: 0,sequence_alignment_aa_heavy,sequence_alignment_aa_light,target
0,QIQLVQSGPELKKPGETVKISCKASGYTFTTYGMSWVKQAPGKGLK...,DVLMTQTPLSLPVSLGDQASISCRSSQSIVHSNGNTYLEWYLQKPG...,1
1,QVQLQQSGAELARPGASVKLSCKASGYTFTSYGISWVKQRTGQGLE...,DIVMTQSHKFMSTSVGDRVSITCKASQDVGTAVAWYQQKPGQSPKL...,1


In [4]:
# Pull the first three columns out by position. According to the preview
# printed above these are the heavy-chain sequence, the light-chain sequence
# and the target label, in that order.
heavy_chain, light_chain, labels = (df.iloc[:, position] for position in range(3))

In [5]:
# Build "fake" heavy chains: every heavy chain with its first 10 residues
# replaced by the placeholder 'X' (which aa2int maps to code 0).
# Each fake sequence gets label 0.
sim = 'X'*10
fake_labels = np.zeros(len(labels))

fake_heavy_chain = []
for sequence in heavy_chain:
    handler = sim + sequence[10:]
    fake_heavy_chain.append(handler)

In [6]:
# One-hot encode the real and the fake heavy chains (both use the heavy-chain
# length), then the light chains.
onehot_heavy, onehot_heavy_fake = (
    aa2onehot(chains, chain_type = 'heavyChain')
    for chains in (heavy_chain, fake_heavy_chain)
)
onehot_light = aa2onehot(light_chain, chain_type = 'lightChain')

In [None]:
# Assemble the full dataset: the real heavy/light pairs followed by the fake
# pairs. The labels were already built as real + fake (2N entries), so the
# feature arrays must be stacked the same way; passing only the N real samples
# makes train_test_split raise ValueError for inconsistent sample counts.
X_heavy = np.concatenate((onehot_heavy, onehot_heavy_fake))
# NOTE(review): each fake heavy chain is paired with the light chain of the
# sequence it was derived from, so the light-chain block is repeated — confirm
# that this is the intended pairing.
X_light = np.concatenate((onehot_light, onehot_light))
Y_data = np.concatenate((labels, fake_labels))
assert len(X_heavy) == len(X_light) == len(Y_data), 'features and labels must have the same number of samples'

# Stratified 80/20 split with a fixed seed so the split is reproducible.
Hchain_train, Hchain_test, Lchain_train, Lchain_test, Y_train, Y_test = train_test_split(
    X_heavy,
    X_light,
    Y_data,
    test_size = 0.2,
    shuffle = True,
    stratify = Y_data,
    random_state = 11,
)