In [1]:
import pandas as pd
import pickle
import keras
from keras.models import Sequential
from keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
import keras_tuner as kt
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

2025-09-26 09:18:18.776210: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758878299.019586      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758878299.082073      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Enable mixed precision globally: layers created after this point compute in
# float16 while keeping their variables in float32.
# NOTE(review): the device log further down reports a GPU with compute
# capability 6.0; TensorFlow's mixed-precision guide only promises float16
# speed-ups on compute capability 7.0+ -- confirm this policy is worth keeping.
from tensorflow.keras import mixed_precision

DTYPE_POLICY = "mixed_float16"
mixed_precision.set_global_policy(DTYPE_POLICY)

In [3]:
# Training hyper-parameters shared by every fit / EarlyStopping call below.
EPOCHS = 20      # passed to fit() as the maximum number of epochs
BATCH_SIZE = 64  # passed to fit() as the batch size
PATIENCE = 5     # passed to EarlyStopping as the patience (in epochs)

In [4]:
# Root directories for reading input artifacts and writing saved models.
# Both keep their trailing slash because paths are built by plain string
# concatenation (BASE_IN + 'relative/path').
BASE_IN, BASE_OUT = '/kaggle/input/', '/kaggle/working/'

# Load Embeddings & Inputs

In [5]:
# Load the pickled bundle holding the embedding matrix, the padded
# train/val/test sequences and their labels.
# NOTE: pickle.load can execute arbitrary code -- only load artifacts that
# come from a trusted source.
inputs_path = BASE_IN + 'artifacts/embeddings_inputs.pkl'
with open(inputs_path, 'rb') as pkl_file:
    loaded_input_items = pickle.load(pkl_file)

In [6]:
# Pull every entry of the loaded bundle into its own module-level name.
# The key tuple and the target tuple are listed in the same order.
(
    embedding_matrix,
    X_train_pad, X_val_pad, X_test_pad,
    y_train, y_val, y_test,
) = (
    loaded_input_items[key]
    for key in (
        'embedding_matrix',
        'X_train_pad', 'X_val_pad', 'X_test_pad',
        'y_train', 'y_val', 'y_test',
    )
)

In [7]:
# Derive the model dimensions from the loaded data instead of hard-coding them:
# rows/columns of the embedding matrix and the length of one padded sequence.
VOCAB_SIZE, EMBEDDING_DIM = embedding_matrix.shape[:2]
MAX_LEN = len(X_train_pad[0])
print(
    f"Embedding dimension: {EMBEDDING_DIM}",
    f"Vocab size: {VOCAB_SIZE}",
    f"Maximum input length: {MAX_LEN}",
    sep="\n",
)

Embedding dimension: 500
Vocab size: 35756
Maximum input length: 588


# LSTM

* Each LSTM layer (which repeats the same operations at every time step) has 4 distinct, interacting computational components. Each of these components acts like a fully-connected neural network layer.
* We have `forget gate` ($f_{t}$), `input gate` ($i_{t}$), and `output gate` ($o_{t}$), all of which are aggregations of the previous time step's hidden state and current time step's input with different weight matrices (and associated bias vectors). All these gates usually use sigmoid activation to output values between 0 and 1 indicating which information should be retained and which discarded.
* Then, we have a layer that applies the tanh activation function to another such aggregation (with its own weight matrix and bias vector), called the `candidate cell state` ($\tilde{C}_{t}$), which outputs a vector of candidate values to add to the current time step's cell state.
* Then, we find the new cell state ($C_{t}$) as the pointwise sum of two pointwise products, $f_{t}$ with $C_{t-1}$ and $i_{t}$ with $\tilde{C}_{t}$: $C_{t} = f_{t} \odot C_{t-1} + i_{t} \odot \tilde{C}_{t}$.
* Finally, we find the current time step's hidden state by pointwise multiplication of $o_{t}$ and tanh($C_{t}$): $h_{t} = o_{t} \odot \tanh(C_{t})$.

## Trainable = False

In [8]:
# Baseline classifier: frozen pre-trained embeddings -> one LSTM -> dense head
# with a single sigmoid output.
# NOTE(review): recurrent_dropout > 0 makes the LSTM ineligible for the fused
# cuDNN kernel according to the Keras LSTM docs, which may explain the
# ~1 s/step in the training log -- confirm the regularisation is worth the cost.
model = Sequential([
    Input(shape=(MAX_LEN,)),
    # Embedding table initialised from embedding_matrix and kept frozen.
    # params => 35756*500 = 17878000 (non-trainable)
    Embedding(input_dim=VOCAB_SIZE,
              output_dim=EMBEDDING_DIM,
              weights=[embedding_matrix],
              trainable=False),
    # params => ((500+128)*128+128)*4 = 322048
    LSTM(128, dropout=0.3, recurrent_dropout=0.3),
    # params => 128*64 + 64 = 8256
    Dense(64, activation='relu'),
    Dropout(0.3),
    # The global dtype policy is mixed_float16; the output layer is pinned to
    # float32 so the sigmoid probabilities are not computed in half precision
    # (recommended by the TensorFlow mixed-precision guide).
    # params => 64*1 + 1 = 65
    Dense(1, activation='sigmoid', dtype='float32'),
])
model.summary()

I0000 00:00:1758878315.108981      36 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


In [9]:
# Train the frozen-embedding model; stop once val_loss has not improved by at
# least min_delta for PATIENCE epochs and roll back to the best weights.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
estop = EarlyStopping(monitor='val_loss', mode='min',
                      min_delta=1e-5, patience=PATIENCE,
                      restore_best_weights=True, verbose=1)
# Keep the History object so the per-epoch curves stay available after the
# cell finishes; assigning it also stops the bare History repr from being
# echoed as the cell output.
history_frozen = model.fit(X_train_pad, y_train,
                           validation_data=(X_val_pad, y_val),
                           epochs=EPOCHS, batch_size=BATCH_SIZE,
                           callbacks=[estop], verbose=1)

Epoch 1/20
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m293s[0m 1s/step - accuracy: 0.5226 - loss: 0.6876 - val_accuracy: 0.5234 - val_loss: 0.6749
Epoch 2/20
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m282s[0m 1s/step - accuracy: 0.5274 - loss: 0.6653 - val_accuracy: 0.5514 - val_loss: 0.6521
Epoch 3/20
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m284s[0m 1s/step - accuracy: 0.6041 - loss: 0.6368 - val_accuracy: 0.7460 - val_loss: 0.5318
Epoch 4/20
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m283s[0m 1s/step - accuracy: 0.7070 - loss: 0.5646 - val_accuracy: 0.5195 - val_loss: 0.6806
Epoch 5/20
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m282s[0m 1s/step - accuracy: 0.5381 - loss: 0.6687 - val_accuracy: 0.7348 - val_loss: 0.5534
Epoch 6/20
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m285s[0m 1s/step - accuracy: 0.6883 - loss: 0.5998 - val_accuracy: 0.6588 - val_loss: 0.6317
Epoch 7/20
[1m206/206

<keras.src.callbacks.history.History at 0x7e4b2bfcbf90>

In [10]:
# Score the frozen-embedding model on the held-out test split.
# evaluate() returns [loss, accuracy] in the order given to compile().
test_scores = model.evaluate(X_test_pad, y_test, verbose=0)
lstm_loss, lstm_accuracy = test_scores
print("LSTM Test accuracy:", lstm_accuracy)
print("lSTM Test loss:", lstm_loss)

LSTM Test accuracy: 0.9552311301231384
lSTM Test loss: 0.11885205656290054


In [11]:
# Persist the frozen-embedding model under BASE_OUT.
frozen_model_path = BASE_OUT + 'simple_lstm.keras'
model.save(frozen_model_path)

In [12]:
# NOTE: the keras-tuner random search defined in this cell and in the four
# cells after it is entirely commented out, so none of it runs.
# Presumably kept for reference -- TODO: re-enable it or delete these cells.
# def build_model(hp):
#     model = Sequential()
#     model.add(Input(shape=(MAX_LEN,)))
#     model.add(Embedding(input_dim=VOCAB_SIZE,
#                         output_dim=EMBEDDING_DIM,
#                         weights=[embedding_matrix],
#                         trainable=False))
#     model.add(LSTM(units=hp.Choice('units', values=[64, 128, 256]), 
#                    dropout=hp.Float('dropout', min_value=0.2, max_value=0.5, step=0.1), 
#                    recurrent_dropout=hp.Float('recurrent_dropout', min_value=0.2, max_value=0.5, step=0.1)))
#
#     model.add(Dense(1, activation='sigmoid'))
#
#     model.compile(optimizer=Adam(learning_rate=hp.Choice('learning_rate', values=[1e-4, 1e-3, 1e-2])),
#                                  loss='binary_crossentropy', metrics=['accuracy'])
#     return model

In [13]:
# tuner = kt.RandomSearch(
#     build_model,
#     objective='val_accuracy',
#     max_trials=25,
#     executions_per_trial=1,
#     directory=BASE_OUT+'tuner',
#     project_name='lstm_randomsearch'
# )

In [14]:
# tuner.search(X_train_pad, y_train,
#             validation_data=(X_val_pad, y_val),
#             epochs=EPOCHS, batch_size=BATCH_SIZE,
#             verbose=1)

In [15]:
# best_model1 = tuner.get_best_models(num_models=1)[0]
# best_hps1 = tuner.get_best_hyperparameters(num_trials=1)[0]
# print(best_hps1.values)

In [16]:
# best_loss1, best_accuracy1 = best_model1.evaluate(X_test_pad, y_test, verbose=0)
# print("Tuned LSTM (trainable=False) test accuracy:",best_accuracy1)
# print("Tuned LSTM (trainable=False) test loss:", best_loss1)

## Trainable = True

In [19]:
# Same architecture as the frozen baseline, but the embedding table is
# fine-tuned together with the rest of the network.
# NOTE(review): recurrent_dropout > 0 makes the LSTM ineligible for the fused
# cuDNN kernel according to the Keras LSTM docs, which may explain the
# ~2 s/step in the training log -- confirm the regularisation is worth the cost.
model = Sequential([
    Input(shape=(MAX_LEN,)),
    # Embedding table initialised from embedding_matrix and updated in training.
    # params => 35756*500 = 17878000 (trainable)
    Embedding(input_dim=VOCAB_SIZE,
              output_dim=EMBEDDING_DIM,
              weights=[embedding_matrix],
              trainable=True),
    # params => ((500+128)*128+128)*4 = 322048
    LSTM(128, dropout=0.3, recurrent_dropout=0.3),
    # params => 128*64 + 64 = 8256
    Dense(64, activation='relu'),
    Dropout(0.3),
    # The global dtype policy is mixed_float16; the output layer is pinned to
    # float32 so the sigmoid probabilities are not computed in half precision
    # (recommended by the TensorFlow mixed-precision guide).
    # params => 64*1 + 1 = 65 (its input is the 64-unit Dense layer)
    Dense(1, activation='sigmoid', dtype='float32'),
])
model.summary()

In [20]:
# Train the trainable-embedding model; stop once val_loss has not improved by
# at least min_delta for PATIENCE epochs and roll back to the best weights.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
estop = EarlyStopping(monitor='val_loss', mode='min',
                      min_delta=1e-5, patience=PATIENCE,
                      restore_best_weights=True, verbose=1)
# Keep the History object so the per-epoch curves stay available after the
# cell finishes; assigning it also stops the bare History repr from being
# echoed as the cell output.
history_trainable = model.fit(X_train_pad, y_train,
                              validation_data=(X_val_pad, y_val),
                              epochs=EPOCHS, batch_size=BATCH_SIZE,
                              callbacks=[estop], verbose=1)

Epoch 1/20
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m337s[0m 2s/step - accuracy: 0.5125 - loss: 0.6811 - val_accuracy: 0.5325 - val_loss: 0.6606
Epoch 2/20
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m328s[0m 2s/step - accuracy: 0.5431 - loss: 0.6451 - val_accuracy: 0.5420 - val_loss: 0.6519
Epoch 3/20
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m329s[0m 2s/step - accuracy: 0.5501 - loss: 0.6327 - val_accuracy: 0.5471 - val_loss: 0.6447
Epoch 4/20
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m329s[0m 2s/step - accuracy: 0.5618 - loss: 0.6224 - val_accuracy: 0.5459 - val_loss: 0.5923
Epoch 5/20
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m325s[0m 2s/step - accuracy: 0.7803 - loss: 0.4607 - val_accuracy: 0.9036 - val_loss: 0.2691
Epoch 6/20
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m323s[0m 2s/step - accuracy: 0.9112 - loss: 0.2344 - val_accuracy: 0.9188 - val_loss: 0.2392
Epoch 7/20
[1m206/206

<keras.src.callbacks.history.History at 0x7e4a3edf1490>

In [23]:
# Score the trainable-embedding model on the held-out test split.
# Note: this rebinds lstm_loss / lstm_accuracy, replacing the values from the
# frozen-embedding evaluation earlier in the notebook.
test_scores = model.evaluate(X_test_pad, y_test, verbose=0)
lstm_loss, lstm_accuracy = test_scores
print("LSTM Test accuracy:", lstm_accuracy)
print("lSTM Test loss:", lstm_loss)

LSTM Test accuracy: 0.9350364804267883
lSTM Test loss: 0.2032657414674759


In [24]:
# Persist the trainable-embedding model under BASE_OUT.
trainable_model_path = BASE_OUT + 'simple_lstm_trainable.keras'
model.save(trainable_model_path)