### Importing files

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from transformers import AutoTokenizer

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
df_train = pd.read_csv("/content/drive/MyDrive/mlba/train.csv")
df_train.head()

Unnamed: 0,Sequence,Label
0,MTAELRNLPHIASMAFNEPLMLEPAYARVFFCALAGQLGISSLTDA...,1
1,MTDITANVVVSNPRPIFTESRSFKAVANGKIYIGQIDTDPVNPANQ...,1
2,MEMISNNLNWFVGVVEDRMDPLKLGRVRVRVVGLHPPQRAQGDVMG...,1
3,MIINLADVEQLSIKAESVDFQYDMYKKVCEKFTDFEQSVLWQCMEA...,1
4,MRAFSTLDRENETFVPSVRVYADGETEDNSFSLKYRSNWTPGRFNS...,1


### Pre-processing Data

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(df_train["Sequence"], df_train["Label"].values, shuffle = True)

In [None]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

integer_encoder = LabelEncoder()
integer_encoder.fit(list('ACDEFGHIKLMNPQRSTVWY'))
X_integer_encoded = [integer_encoder.transform(list(seq)) for seq in x_train]
X_val_encoded = [integer_encoder.transform(list(seq)) for seq in x_val]

x_integer_encoded_pad = tf.keras.preprocessing.sequence.pad_sequences(X_integer_encoded)
x_val_encoded_pad = tf.keras.preprocessing.sequence.pad_sequences(X_val_encoded)

In [None]:
def pad_enc(x):
  integer_encoder = LabelEncoder()
  integer_encoder.fit(list('ACDEFGHIKLMNPQRSTVWY'))
  x = [integer_encoder.transform(list(seq)) for seq in x]
  x = tf.keras.preprocessing.sequence.pad_sequences(x)
  return x

### Making CNN

In [None]:
vocab_size = 21

In [None]:
# Variable-length int sequences.
Input = tf.keras.Input(shape=(None, ), dtype='float')
# Embedding lookup.
token_embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=16)
# Query embeddings of shape [batch_size, Tq, dimension].
query_embeddings = token_embedding(Input)
# Value embeddings of shape [batch_size, Tv, dimension].
value_embeddings = token_embedding(Input)

# CNN layer.
cnn_layer = tf.keras.layers.Conv1D(
    filters=100,
    kernel_size=2,
    # Use 'same' padding so outputs have the same shape as inputs.
    padding='same')
# Query encoding of shape [batch_size, Tq, filters].
query_seq_encoding = cnn_layer(query_embeddings)
# Value encoding of shape [batch_size, Tv, filters].
value_seq_encoding = cnn_layer(value_embeddings)

# Query-value attention of shape [batch_size, Tq, filters].
query_value_attention_seq = tf.keras.layers.Attention()(
    [query_seq_encoding, value_seq_encoding])

# Reduce over the sequence axis to produce encodings of shape
# [batch_size, filters].
query_encoding = tf.keras.layers.GlobalAveragePooling1D()(
    query_seq_encoding)
query_value_attention = tf.keras.layers.GlobalAveragePooling1D()(
    query_value_attention_seq)

# Concatenate query and document encodings to produce a DNN input layer.
input_layer = tf.keras.layers.Concatenate()(
    [query_encoding, query_value_attention])

x = tf.keras.layers.Dense(128, activation = "relu")(input_layer)
x = tf.keras.layers.Dense(64, activation = "relu")(x)
x = tf.keras.layers.Dense(32, activation = "relu")(x)
x = tf.keras.layers.Dense(16, activation = "relu")(x)
output = tf.keras.layers.Dense(1, activation = "sigmoid") (x)

model = tf.keras.Model(inputs= Input, outputs = output)
model.compile(optimizer = "adam", metrics = ["accuracy"], loss = "binary_crossentropy")
model.summary()


Model: "model_11"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_13 (InputLayer)       [(None, None)]               0         []                            
                                                                                                  
 embedding_13 (Embedding)    (None, None, 16)             320       ['input_13[0][0]',            
                                                                     'input_13[0][0]']            
                                                                                                  
 conv1d_39 (Conv1D)          (None, None, 100)            3300      ['embedding_13[0][0]',        
                                                                     'embedding_13[1][0]']        
                                                                                           

### Training the model

In [None]:
early_stopping = tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True, monitor = "val_loss")
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)

In [None]:
model.fit(x_integer_encoded_pad, y_train, validation_data= (x_val_encoded_pad, y_val), callbacks = [early_stopping, reduce_lr], epochs = 60)

### Refitting the model on the entire dataset after reinitializing

In [None]:
x = pad_enc(df_train["Sequence"])

In [None]:
x

array([[ 0,  0,  0, ..., 16, 12, 17],
       [ 0,  0,  0, ...,  0, 16,  9],
       [ 0,  0,  0, ...,  2,  7,  5],
       ...,
       [ 0,  0,  0, ...,  5,  0,  3],
       [ 0,  0,  0, ...,  2,  2,  5],
       [ 0,  0,  0, ...,  2,  2,  3]], dtype=int32)

In [None]:
model.fit(x, df_train["Label"].values, epochs = 40)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.src.callbacks.History at 0x7ba716e25900>

### Getting test predictions

In [None]:
df_test = pd.read_csv("/content/drive/MyDrive/mlba/test.csv")
df_test.head()

Unnamed: 0,ID,Sequence
0,501,ASNFTQFVLVNDGGTGNVTVAPSNFANGVAEWISSNSRSQAYKVTC...
1,502,ASNFTQFVLVNDGGTGNVTVAPSNFANGVAEWISSNSRSQAYKVTC...
2,503,MAATHTLPLASPGMARICLYGDLQRFGRRIDLRVKTGAEAIRALAT...
3,504,MADLKVGSTTGGSVIWHQGNFPLNPAGDDVLYKSFKIYSEYNKPQA...
4,505,MADLKVGSTTGGSVIWHQGNFPLNPAGDDVLYKSFKIYSEYNKPQA...


In [None]:
x_test = df_test["Sequence"]
integer_encoder = LabelEncoder()
integer_encoder.fit(list('ACDEFGHIKLMNPQRSTVWY'))
X_test_encoded = [integer_encoder.transform(list(seq)) for seq in x_test]

x_test_encoded_pad = tf.keras.preprocessing.sequence.pad_sequences(X_test_encoded)

In [None]:
x_test_encoded_pad.shape

(310, 1289)

In [None]:
preds = model.predict(x_test_encoded_pad).flatten()



In [None]:
preds = [i[1] for i in y_pred]
preds = np.array(preds)

### Saving as CSV

In [None]:
final = {"ID":df_test["ID"].values, "Label": preds}
final_df = pd.DataFrame(final)
final_df.head()

In [None]:
final_df.to_csv("/content/drive/MyDrive/mlba/gruop80_predictions_output.csv", index = False)