In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Bidirectional, BatchNormalization
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler, Callback
from sklearn.model_selection import train_test_split
from scipy.stats import spearmanr

# Read the training and test tables from CSV files in the working directory.
train_data, test_data = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Encode amino-acid sequences as integers so the categorical residues become
# numerical input that ML models can process.
def encode_sequences(sequences, maxlen=1000):
    """Integer-encode amino-acid sequences and pad them to a fixed length.

    Each of the 21 letters in 'ARNDCQEGHILKMFPSTWYVX' is mapped to a unique
    integer in 1..21. Any other character is mapped to 0, which is also the
    value ``pad_sequences`` uses for padding.

    Args:
        sequences: Iterable of strings, one amino-acid sequence per element.
        maxlen: Length every encoded sequence is padded or truncated to.
            Defaults to 1000, the value previously hard-coded here.

    Returns:
        The result of ``pad_sequences(encoded, maxlen=maxlen)``. Padding and
        truncation use the ``pad_sequences`` defaults, which Keras documents
        as 'pre' for both (zeros are added, and excess residues dropped, at
        the start of a sequence).
    """
    amino_acids = 'ARNDCQEGHILKMFPSTWYVX'
    # Start numbering at 1 so that 0 stays reserved for padding/unknown.
    aa_dict = {aa: idx for idx, aa in enumerate(amino_acids, start=1)}
    encoded = [[aa_dict.get(aa, 0) for aa in sequence] for sequence in sequences]
    return pad_sequences(encoded, maxlen=maxlen)

# Encode the held-out test sequences.
X_test = encode_sequences(test_data['sequence'])

# Encode the training sequences and split them 80/20 into training and
# validation sets (fixed seed so the split is reproducible).
X_train, X_val, y_train, y_val = train_test_split(
    encode_sequences(train_data['sequence']),
    train_data['target'].values,
    test_size=0.2,
    random_state=42,
)

# Model architecture: Bidirectional LSTMs with a custom attention mechanism.
# Inspired from https://github.com/kjs-design/CNN-BiLSTM-Attention-model/blob/master/Model/BiLSTM_Attention.py
# Changes made: uses TensorFlow instead of PyTorch and simplifies the attention mechanism by using a
# Dense layer to turn the LSTM outputs into attention weights, which are then applied to those same
# LSTM outputs.
def LSTM_model(seq_len=1000, vocab_size=21 + 1, embedding_dim=128, lstm_units=256,
               dense_units=128, dropout_rate=0.5):
    """Build and compile the BiLSTM + attention regression model.

    The attention block is expressed with Keras layers (Softmax, Dot, Flatten)
    rather than raw ``tf.nn`` / ``tf.reduce_sum`` calls on symbolic tensors,
    so the graph contains only regular, serializable layers and also builds
    under Keras 3, which rejects raw TensorFlow ops on KerasTensors.

    Args:
        seq_len: Length of each integer-encoded input sequence.
        vocab_size: ``input_dim`` of the embedding. The default of 22 covers
            the 21 residue codes plus index 0.
        embedding_dim: Size of each embedding vector.
        lstm_units: Units per direction in the first BiLSTM; the second
            BiLSTM uses half as many.
        dense_units: Width of the fully connected layer after attention.
        dropout_rate: Dropout rate applied before the output layer.

    Returns:
        A compiled Keras ``Model`` (Adam optimizer, mean-squared-error loss,
        MAE metric) with a single linear output unit.
    """
    inputs = Input(shape=(seq_len,))
    x = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(inputs)  # Embedding layer
    x = Bidirectional(LSTM(lstm_units, return_sequences=True))(x)  # First LSTM layer
    x = Bidirectional(LSTM(lstm_units // 2, return_sequences=True))(x)  # Second LSTM layer

    # Simplified custom attention layer.
    # One tanh score per time step: (batch, seq_len, 1).
    attention_scores = Dense(1, activation='tanh')(x)
    # Normalize the scores across the time axis so they sum to 1 per sequence.
    attention_weights = tf.keras.layers.Softmax(axis=1)(attention_scores)
    # Weighted sum of the LSTM outputs over time: contracting axis 1 of both
    # tensors gives (batch, 1, features), the same as sum(weights * x, axis=1).
    context_vector = tf.keras.layers.Dot(axes=1)([attention_weights, x])
    context_vector = tf.keras.layers.Flatten()(context_vector)  # (batch, features)

    x = BatchNormalization()(context_vector)  # Normalizing the batch data
    x = Dense(dense_units, activation='relu')(x)  # Dense layer for further processing
    x = Dropout(dropout_rate)(x)  # Dropout to prevent overfitting
    outputs = Dense(1)(x)  # Output layer

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
    return model

# Build the compiled network and print its layer-by-layer summary.
model = LSTM_model()
model.summary()

# Callback that reports the Spearman correlation on the validation set after each epoch.
# Inspired from ChatGPT: https://chat.openai.com/share/ffb9e194-45f4-46b7-ba39-1d9bfec04caf
class SpearmanCorrelation(Callback):
    """Print (and log) the validation Spearman correlation after every epoch.

    Args:
        validation_data: Optional ``(inputs, targets)`` pair to evaluate on.
            When omitted, the module-level ``X_val`` / ``y_val`` arrays are
            used, so ``SpearmanCorrelation()`` keeps working as before.
    """

    def __init__(self, validation_data=None):
        super().__init__()
        self._validation_data = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Score the model being trained and report the correlation.

        The score is printed and, when Keras supplies a ``logs`` dict, also
        stored under ``'val_spearman'`` so that callbacks listed after this
        one can read it.
        """
        if self._validation_data is None:
            inputs, targets = X_val, y_val
        else:
            inputs, targets = self._validation_data

        # Use the model Keras attached to this callback rather than whatever
        # the global name `model` happens to be bound to.
        preds = self.model.predict(inputs)
        # The model output has a trailing unit axis; flatten both sides so
        # spearmanr compares two plain 1-D vectors.
        spearman_score, _ = spearmanr(np.ravel(targets), np.ravel(preds))
        if logs is not None:
            logs['val_spearman'] = float(spearman_score)
        print(f"\nSpearman Correlation at epoch {epoch + 1}: {spearman_score:.4f}")
# Learning rate schedule: keep the rate fixed at first, then decay it as training progresses.
def adjust_learning_rate(epoch, lr, hold_epochs=10, decay=0.1):
    """Return the learning rate to use for ``epoch``.

    The rate is left unchanged for the first ``hold_epochs`` epochs and is
    multiplied by ``exp(-decay)`` on every epoch after that.

    Args:
        epoch: Zero-based epoch index.
        lr: Current learning rate.
        hold_epochs: Number of initial epochs during which the rate is kept.
        decay: Exponent of the per-epoch decay factor ``exp(-decay)``.

    Returns:
        The new learning rate as a plain Python ``float``. A ``tf.Tensor``
        (what ``lr * tf.math.exp(...)`` produces) is not accepted by every
        Keras version's ``LearningRateScheduler``.
    """
    if epoch < hold_epochs:
        return float(lr)
    return float(lr * np.exp(-decay))

# Checkpoint: keep only the model from the epoch with the lowest validation loss.
checkpoint = ModelCheckpoint(
    'best_model.h5',
    monitor='val_loss',
    save_best_only=True,
)
# Scheduler: applies adjust_learning_rate at the start of every epoch.
lr_scheduler = LearningRateScheduler(adjust_learning_rate)

# Train the model, evaluating on the held-out validation split each epoch.
model.fit(
    x=X_train,
    y=y_train,
    epochs=100,
    batch_size=16,
    validation_data=(X_val, y_val),
    callbacks=[SpearmanCorrelation(), checkpoint, lr_scheduler],
)

# Reload the checkpoint file 'best_model.h5' written during training.
# This rebinds the global `model` to the reloaded network.
model = load_model('best_model.h5')

# Predict targets for the encoded test sequences with the reloaded model.
predictions = model.predict(X_test)

# Saving the predictions to a CSV file
def save_predictions(test_ids, predictions, output_path='prediction.csv'):
    """Write ids and predicted targets to a two-column CSV file.

    Args:
        test_ids: Sequence of identifiers, one per prediction.
        predictions: Predicted values as an array or nested sequence of any
            shape; it is flattened to 1-D before writing.
        output_path: Destination CSV path. Defaults to 'prediction.csv'.

    The file has the columns 'id' and 'target' and no index column.
    """
    # np.ravel accepts plain lists as well as ndarrays, unlike .flatten().
    submission = pd.DataFrame({'id': test_ids, 'target': np.ravel(predictions)})
    submission.to_csv(output_path, index=False)

# Write the test-set predictions, keyed by the 'id' column of the test table, to CSV.
save_predictions(test_data['id'], predictions)


2024-04-12 10:37:28.754283: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-12 10:37:43.108565: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-12 10:37:44.252292: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 21324 MB memory:  -> device: 0, name: Quadro RTX 6000, pci bus id: 0000:3b:00.0, compute capability: 7.5
2024-04-12 10:37:44.253012: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Crea

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1000)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 1000, 128)    2816        ['input_1[0][0]']                
                                                                                                  
 bidirectional (Bidirectional)  (None, 1000, 512)    788480      ['embedding[0][0]']              
                                                                                                  
 bidirectional_1 (Bidirectional  (None, 1000, 256)   656384      ['bidirectional[0][0]']          
 )                                                                                            

2024-04-12 10:37:51.868008: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100



Spearman Correlation at epoch 1: 0.3242
Epoch 2/100

Spearman Correlation at epoch 2: 0.0119
Epoch 3/100

Spearman Correlation at epoch 3: -0.0873
Epoch 4/100

Spearman Correlation at epoch 4: 0.3644
Epoch 5/100

Spearman Correlation at epoch 5: 0.3839
Epoch 6/100

Spearman Correlation at epoch 6: 0.4869
Epoch 7/100

Spearman Correlation at epoch 7: 0.4711
Epoch 8/100

Spearman Correlation at epoch 8: 0.4882
Epoch 9/100

Spearman Correlation at epoch 9: 0.4081
Epoch 10/100

Spearman Correlation at epoch 10: 0.4731
Epoch 11/100

Spearman Correlation at epoch 11: 0.0967
Epoch 12/100

Spearman Correlation at epoch 12: 0.4767
Epoch 13/100

Spearman Correlation at epoch 13: 0.5021
Epoch 14/100

Spearman Correlation at epoch 14: 0.5078
Epoch 15/100

Spearman Correlation at epoch 15: 0.5239
Epoch 16/100

Spearman Correlation at epoch 16: -0.0774
Epoch 17/100

Spearman Correlation at epoch 17: 0.3920
Epoch 18/100

Spearman Correlation at epoch 18: 0.4903
Epoch 19/100

Spearman Correlation at 