In [1]:
import os

import keras_nlp
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
import pandas as pd
from utils import *
import numpy as np

policy = keras.mixed_precision.Policy("mixed_float16")
keras.mixed_precision.set_global_policy(policy)

--ip=127.0.0.1
The dtype policy mixed_float16 may run slowly because this machine does not have a GPU. Only Nvidia GPUs with compute capability of at least 7.0 run quickly with mixed_float16.


In [2]:
train_labels, dev_labels, _ = load_labels()

In [3]:
label_replacement = {
    'OFF': 0,
    'NOT': 1,
}

In [4]:
# Preprocessing params.
BATCH_SIZE = 32
SEQ_LENGTH = 128

# Model params.
LSTM_UNITS = 64
MODEL_DIM = 256
INTERMEDIATE_DIM = 512
NUM_HEADS = 4
DROPOUT = 0.2
NORM_EPSILON = 1e-5

# Training params.
LEARNING_RATE = 5e-5
EPOCHS = 3

In [5]:
# Replace labels with numbers
train_labels = [label_replacement[label] for label in train_labels]
dev_labels = [label_replacement[label] for label in dev_labels]

## Using Embedding Layer

In [6]:
df_train = pd.read_csv("../Data/PreprocessedData/train_preprocessed.csv")
df_test = pd.read_csv("../Data/PreprocessedData/test_preprocessed.csv")
df_val = pd.read_csv("../Data/PreprocessedData/val_preprocessed.csv")

df_train = df_train[['preprocessed_text', 'label']]
df_test = df_test[['preprocessed_text', 'label']]
df_val = df_val[['preprocessed_text', 'label']]

In [7]:
df_train = df_train[df_train.preprocessed_text.notna()]
df_val = df_val[df_train.preprocessed_text.notna()]
df_test = df_test[df_train.preprocessed_text.notna()]

  df_val = df_val[df_train.preprocessed_text.notna()]
  df_test = df_test[df_train.preprocessed_text.notna()]


In [8]:
vocab = df_train["preprocessed_text"].values.flatten().tolist()
vocab = ' '.join(vocab).split()
vocab_size = len(set(vocab))

In [9]:
# # Batch and shuffle the dataset
# train_ds = train_ds.batch(BATCH_SIZE).shuffle(10000)
# test_ds = test_ds.batch(BATCH_SIZE)
# val_ds = val_ds.batch(BATCH_SIZE).shuffle(10000)

In [10]:
X_train = df_train['preprocessed_text'].values
X_dev = df_val['preprocessed_text'].values
X_test = df_test['preprocessed_text'].values

In [11]:
tok = Tokenizer(num_words = vocab_size)
tok.fit_on_texts(X_train)

In [12]:
tokenized_train_text = tok.texts_to_sequences(X_train)
tokenized_dev_text = tok.texts_to_sequences(X_dev)

tokenized_train_text = pad_sequences(tokenized_train_text, SEQ_LENGTH)
tokenized_dev_text = pad_sequences(tokenized_dev_text, SEQ_LENGTH)

In [13]:
X_train = tf.convert_to_tensor(tokenized_train_text, dtype=tf.int64)
y_train = tf.convert_to_tensor(train_labels)

X_dev = tf.convert_to_tensor(tokenized_dev_text, dtype=tf.int64)
y_dev = tf.convert_to_tensor(dev_labels)

In [14]:
y_train

<tf.Tensor: shape=(10592,), dtype=int32, numpy=array([1, 1, 1, ..., 1, 1, 1])>

In [16]:
model = keras.Sequential()

# Take as input the tokenized input.
model.add(keras.Input(shape=(SEQ_LENGTH), dtype=tf.int32))

# Add an embedding layer
model.add(keras.layers.Embedding(vocab_size + 1, 128))

# Add 2 LSTM layers alongwith some Dropout
model.add(keras.layers.LSTM(128, activation = "relu", return_sequences=True))
model.add(keras.layers.Dropout(DROPOUT))
model.add(keras.layers.LSTM(LSTM_UNITS, activation = "relu"))
model.add(keras.layers.Dropout(DROPOUT))

# Predict an output label.
model.add(keras.layers.Dense(1, activation="sigmoid"))

model.compile(
    loss="binary_crossentropy",
    optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE),
    metrics=["accuracy"],
)

In [17]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 128, 128)          2297088   
                                                                 
 lstm_2 (LSTM)               (None, 128, 128)          131584    
                                                                 
 dropout_1 (Dropout)         (None, 128, 128)          0         
                                                                 
 lstm_3 (LSTM)               (None, 64)                49408     
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 2,478,145
Trainable params: 2,478,145
No

In [18]:
# Finetune the model for the SST-2 task.
model.fit(
    X_train, y_train, validation_data=(X_dev, y_dev), epochs=EPOCHS, batch_size = BATCH_SIZE
)

Epoch 1/3

In [None]:
tokenized_test_text = tok.texts_to_sequences(X_test)
tokenized_test_text = pad_sequences(tokenized_test_text, SEQ_LENGTH)
X_test = tf.convert_to_tensor(tokenized_test_text, dtype=tf.int64)

In [None]:
# Predictions
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)
val_pred = model.predict(X_dev)

# Convert predictions to labels
train_pred = np.where(train_pred > 0.5, 1, 0)
test_pred = np.where(test_pred > 0.5, 1, 0)
val_pred = np.where(val_pred > 0.5, 1, 0)



In [None]:
# Convert labels to numpy arrays
computeAllScores(train_pred, val_pred, test_pred)

Accuracy Train:  0.6709780966767371
Accuracy Dev:  0.6544561933534743
Accuracy Test:  0.7209302325581395
Weighted F1 Train:  0.5388599732280435
Weighted F1 Dev:  0.5177688121805848
Weighted F1 Test:  0.6040226272784412
Macro F1 Train:  0.4015481100627154
Macro F1 Dev:  0.39557178726318193
Macro F1 Test:  0.4189189189189189
Micro F1 Train:  0.6709780966767371
Micro F1 Dev:  0.6544561933534743
Micro F1 Test:  0.7209302325581395
Weighted Recall Train:  0.6709780966767371
Weighted Recall Dev:  0.6544561933534743
Weighted Recall Test:  0.7209302325581395
Macro Recall Train:  0.5
Macro Recall Dev:  0.5
Macro Recall Test:  0.5
Micro Recall Train:  0.6709780966767371
Micro Recall Dev:  0.6544561933534743
Micro Recall Test:  0.7209302325581395
Confusion Matrix Train: 
[[   0 3485]
 [   0 7107]]
Confusion Matrix Dev: 
[[   0  915]
 [   0 1733]]
Confusion Matrix Test: 
[[  0 240]
 [  0 620]]
