In [1]:
import pickle

import keras
import keras_tuner as kt
import pandas as pd
from keras import mixed_precision
from keras.callbacks import EarlyStopping
from keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, Dropout
from keras.models import Sequential
from keras.optimizers import Adam
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

In [2]:
# Run layer computations in float16 while variables stay in float32.
# `mixed_precision` is imported with the other imports from the same `keras`
# package the layers and models come from, so the policy applies to every
# layer built below.
# NOTE(review): float16 math is mainly accelerated on GPUs with compute
# capability >= 7.0 -- confirm it actually helps on the GPU in use.
mixed_precision.set_global_policy("mixed_float16")

In [3]:
# Training budget shared by the baseline fit and the Hyperband search.
EPOCHS = 20       # epochs for the baseline fit; also Hyperband's max_epochs
BATCH_SIZE = 64   # samples per gradient step
PATIENCE = 5      # epochs without val_loss improvement before early stopping

# Seed the Python, NumPy and backend random generators so every
# Restart & Run All starts from the same random state.
SEED = 42
keras.utils.set_random_seed(SEED)

In [4]:
# Path prefixes used for every read (BASE_IN) and write (BASE_OUT) below.
#
# Kaggle layout -- swap in when running there:
#   BASE_IN = '/kaggle/input/'
#   BASE_OUT = '/kaggle/working/'
#
# Local layout: inputs are resolved relative to the working directory and
# outputs are written under artifacts/.
BASE_IN, BASE_OUT = '', 'artifacts/'

# Load Embeddings & Inputs

In [5]:
# Read the pickled bundle of embedding matrix and padded splits.
# NOTE: pickle.load can execute arbitrary code -- only load artifacts
# produced by a trusted pipeline.
inputs_pickle_path = BASE_IN + 'artifacts/embeddings_inputs.pkl'
with open(inputs_pickle_path, 'rb') as pickle_file:
    loaded_input_items = pickle.load(pickle_file)

In [6]:
# Pull the embedding matrix and the train/val/test arrays out of the
# loaded dictionary in one pass; a missing key raises KeyError.
(embedding_matrix,
 X_train_pad, X_val_pad, X_test_pad,
 y_train, y_val, y_test) = (
    loaded_input_items[item_key]
    for item_key in (
        'embedding_matrix',
        'X_train_pad', 'X_val_pad', 'X_test_pad',
        'y_train', 'y_val', 'y_test',
    )
)

In [7]:
# Derive the embedding layer's dimensions from the matrix itself (rows, columns).
VOCAB_SIZE, EMBEDDING_DIM = embedding_matrix.shape[:2]
# Input length is read off the first padded training sequence.
MAX_LEN = len(X_train_pad[0])
print(f"Embedding dimension: {EMBEDDING_DIM}\nVocab size: {VOCAB_SIZE}\nMaximum input length: {MAX_LEN}")

Embedding dimension: 500
Vocab size: 35756
Maximum input length: 588


# LSTM

* Each LSTM unit (that repeats for each time step) has 4 neural network layers interacting within.
* We have forget gate ($f_{t}$), input gate ($i_{t}$), and output gate ($o_{t}$), all of which are aggregations of the previous time step's hidden state and current time step's input with different weight matrices (and associated bias vectors). All these gates usually use sigmoid activation to output values between 0 and 1 indicating which information should be retained and which discarded.
* Then we have a layer that applies the tanh activation function to another such aggregation (with its own weight matrix and bias vector), which outputs a vector of candidate values ($\tilde{C}_{t}$) that can be added to the current time step's cell state.
* Then we find the new cell state ($C_{t}$) as the pointwise sum of two pointwise products, $f_{t}$ with $C_{t-1}$ and $i_{t}$ with $\tilde{C}_{t}$: $C_{t} = f_{t} \odot C_{t-1} + i_{t} \odot \tilde{C}_{t}$.
* Finally we find the current time step's hidden state by pointwise multiplication of $o_{t}$ and tanh($C_{t}$): $h_{t} = o_{t} \odot \tanh(C_{t})$.

## Trainable = False

In [8]:
# Baseline classifier: frozen pre-trained embeddings -> one LSTM -> sigmoid.
model = Sequential()
model.add(Input(shape=(MAX_LEN,)))
model.add(Embedding(input_dim=VOCAB_SIZE,
                    output_dim=EMBEDDING_DIM,
                    weights=[embedding_matrix],
                    trainable=False))
# params => 35756*500 = 17878000 (frozen, trainable=False)
# NOTE: recurrent_dropout > 0 rules out the fused cuDNN LSTM kernel, so this
# layer runs on the slower generic implementation.
model.add(LSTM(128, dropout=0.3, recurrent_dropout=0.3))
# params => ((500+128)*128+128)*4 = 322048
# The global policy is mixed_float16; the output layer is pinned to float32
# so the sigmoid probabilities (and the cross-entropy computed from them)
# are not produced in float16, where they can become numerically unstable.
model.add(Dense(1, activation='sigmoid', dtype='float32'))
# params => 128*1 + 1 = 129
model.summary()

I0000 00:00:1758466421.575205      74 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


In [9]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Stop when val_loss has not improved by at least min_delta for PATIENCE
# consecutive epochs, then restore the weights of the best epoch.
estop = EarlyStopping(monitor='val_loss', mode='min',
                      min_delta=1e-5, patience=PATIENCE,
                      restore_best_weights=True, verbose=1)
# Keep the History object so the learning curves can be inspected later,
# instead of letting the cell echo its repr and discard it.
lstm_history = model.fit(X_train_pad, y_train,
                         validation_data=(X_val_pad, y_val),
                         epochs=EPOCHS, batch_size=BATCH_SIZE,
                         callbacks=[estop], verbose=1)

Epoch 1/20
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m288s[0m 1s/step - accuracy: 0.5829 - loss: 0.6599 - val_accuracy: 0.7768 - val_loss: 0.5401
Epoch 2/20
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m275s[0m 1s/step - accuracy: 0.7404 - loss: 0.5650 - val_accuracy: 0.5484 - val_loss: 0.6680
Epoch 3/20
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m275s[0m 1s/step - accuracy: 0.6609 - loss: 0.6122 - val_accuracy: 0.8002 - val_loss: 0.4964
Epoch 4/20
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m276s[0m 1s/step - accuracy: 0.6948 - loss: 0.5930 - val_accuracy: 0.5642 - val_loss: 0.6398
Epoch 5/20
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m278s[0m 1s/step - accuracy: 0.5872 - loss: 0.6437 - val_accuracy: 0.7515 - val_loss: 0.5657
Epoch 6/20
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m279s[0m 1s/step - accuracy: 0.7079 - loss: 0.5558 - val_accuracy: 0.6682 - val_loss: 0.6622
Epoch 7/20
[1m206/206

<keras.src.callbacks.history.History at 0x7902284e9950>

In [10]:
# evaluate() returns [loss, accuracy], in the order set by compile(metrics=['accuracy']).
lstm_loss, lstm_accuracy = model.evaluate(X_test_pad, y_test, verbose=0)
print("LSTM Test accuracy:", lstm_accuracy)
# Label fixed from "lSTM" so both lines read consistently.
print("LSTM Test loss:", lstm_loss)

LSTM Test accuracy: 0.9515815377235413
lSTM Test loss: 0.12178731709718704


In [12]:
# Persist the trained baseline model under the output prefix.
lstm_model_path = BASE_OUT + 'lstm.keras'
model.save(lstm_model_path)

In [18]:
def build_model(hp):
    """Build and compile the frozen-embedding LSTM classifier for one tuner trial.

    The search space is:
      * ``units``: 64, 128 or 256 LSTM units
      * ``dropout`` / ``recurrent_dropout``: 0.2 to 0.5 in steps of 0.1
      * ``learning_rate``: 1e-4, 1e-3 or 1e-2 for Adam

    Args:
        hp: hyperparameter container supplied by the tuner; its ``Choice`` and
            ``Float`` methods are used to sample the values above.

    Returns:
        A compiled ``Sequential`` model with binary cross-entropy loss and an
        accuracy metric.
    """
    model = Sequential()
    model.add(Input(shape=(MAX_LEN,)))
    model.add(Embedding(input_dim=VOCAB_SIZE,
                        output_dim=EMBEDDING_DIM,
                        weights=[embedding_matrix],
                        trainable=False))
    model.add(LSTM(units=hp.Choice('units', values=[64, 128, 256]),
                   dropout=hp.Float('dropout', min_value=0.2, max_value=0.5, step=0.1),
                   recurrent_dropout=hp.Float('recurrent_dropout', min_value=0.2, max_value=0.5, step=0.1)))

    # The global policy is mixed_float16; pin the output layer to float32 so
    # the sigmoid probabilities fed to the loss are not computed in float16.
    model.add(Dense(1, activation='sigmoid', dtype='float32'))

    model.compile(optimizer=Adam(learning_rate=hp.Choice('learning_rate', values=[1e-4, 1e-3, 1e-2])),
                  loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [19]:
# Hyperband search over build_model's space, ranking trials by validation
# accuracy. Each trial trains for at most EPOCHS epochs; factor=3 is the
# reduction factor between successive Hyperband rounds.
hyperband_options = dict(
    objective='val_accuracy',
    max_epochs=EPOCHS,
    factor=3,
    directory=BASE_OUT+'tuner',
    project_name='lstm_hyperband',
)
tuner = kt.Hyperband(build_model, **hyperband_options)

In [20]:
# Hyperband sets the number of epochs for every trial itself (bounded by the
# tuner's max_epochs), so an `epochs` argument here would be overridden and
# is omitted. Early stopping cuts off long trials whose val_loss has
# stopped improving, which keeps the search time manageable.
tuner.search(X_train_pad, y_train,
             validation_data=(X_val_pad, y_val),
             batch_size=BATCH_SIZE,
             callbacks=[EarlyStopping(monitor='val_loss', patience=PATIENCE)],
             verbose=1)

Trial 5 Complete [00h 14m 45s]
val_accuracy: 0.5580900311470032

Best val_accuracy So Far: 0.8990267515182495
Total elapsed time: 01h 12m 41s

Search: Running Trial #6

Value             |Best Value So Far |Hyperparameter
128               |256               |units
0.2               |0.4               |dropout
0.4               |0.3               |recurrent_dropout
0.001             |0.01              |learning_rate
3                 |3                 |tuner/epochs
0                 |0                 |tuner/initial_epoch
2                 |2                 |tuner/bracket
0                 |0                 |tuner/round

Epoch 1/3
[1m 14/206[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m4:17[0m 1s/step - accuracy: 0.5010 - loss: 0.6961

KeyboardInterrupt: 

In [None]:
# The tuner method is `get_best_models` (plural) and returns a list of
# models, best first; `get_best_model` does not exist and would raise
# AttributeError.
best_model1 = tuner.get_best_models(num_models=1)[0]
best_hps1 = tuner.get_best_hyperparameters(num_trials=1)[0]
print(best_hps1.values)

In [None]:
# Score the best tuned model on the test arrays; evaluate() yields
# [loss, accuracy] for the metrics the model was compiled with.
tuned_test_metrics = best_model1.evaluate(X_test_pad, y_test, verbose=0)
best_loss1, best_accuracy1 = tuned_test_metrics
print("Tuned LSTM (trainable=False) test accuracy:", best_accuracy1)
print("Tuned LSTM (trainable=False) test loss:", best_loss1)