In [211]:
import numpy as np
import seaborn as sns
import pickle
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, GlobalAveragePooling1D, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.regularizers import l2

In [21]:
from utils import helpers as hp

### Create dataset

In [None]:
with open('../model_data/dataset.pkl', 'rb') as file:
    data_set_dict = pickle.load(file)

In [93]:
sentences = data_set_dict['sentences']
labels = data_set_dict['labels']

In [148]:
dataset = tf.data.Dataset.from_tensor_slices((sentences, labels))
shuffled_dataset = dataset.shuffle(buffer_size=20000)

#### Split dataset

In [149]:
train_size = int(0.8 * len(dataset))
train_dataset = shuffled_dataset.take(train_size)
test_dataset = shuffled_dataset.skip(train_size)

In [150]:
len(train_dataset), len(test_dataset)

(3967, 992)

#### Read in GLoVE

In [22]:
glove_path = '/projects/elopez22/AAW/glove/glove.840B.300d.txt'
glove_embeddings = hp.load_glove_embeddings(glove_path)

#### Prep data for the model

In [24]:
embedding_dim = 300

In [151]:
train_text_data = [x.numpy().decode('utf-8') for x, _ in train_dataset]
test_text_data = [x.numpy().decode('utf-8') for x, _ in test_dataset]

In [152]:
train_label_data = [label.numpy() for _, label in train_dataset]
test_label_data = [label.numpy() for _, label in test_dataset]

2024-12-02 10:21:08.567144: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [153]:
np.unique(train_label_data, return_counts = True)

(array([0, 1, 2], dtype=int32), array([2511,  485,  971]))

In [154]:
np.unique(test_label_data, return_counts = True)

(array([0, 1, 2], dtype=int32), array([638, 116, 238]))

In [155]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(train_text_data)
vocab_size = len(tokenizer.word_index) + 1

In [156]:
vocab_size

14146

Convert text to sequences

In [157]:
x_train = tokenizer.texts_to_sequences(train_text_data)
x_test = tokenizer.texts_to_sequences(test_text_data)

pad sequences

In [158]:
max_len = 478
x_train = pad_sequences(x_train, maxlen=max_len)
x_test = pad_sequences(x_test, maxlen=max_len)

Create embedding matrix

In [160]:
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, index in tokenizer.word_index.items():
    if index < vocab_size:
        embedding_vector = glove_embeddings.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector

In [161]:
train_labels_cat = to_categorical(train_label_data, num_classes=4) 

In [166]:
test_labels_cat = to_categorical(test_label_data, num_classes=4)

### Model

In [164]:
model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=False 
    ),
    Bidirectional(LSTM(64)),
    Dense(64, activation='relu'),
    Dense(4, activation="sigmoid")
])

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [165]:
history = model.fit(
    x_train, train_labels_cat,
    validation_split=0.2,
    epochs=10,
    batch_size=32
)

Epoch 1/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 32ms/step - accuracy: 0.5990 - loss: 0.4733 - val_accuracy: 0.6436 - val_loss: 0.3891
Epoch 2/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 30ms/step - accuracy: 0.6251 - loss: 0.4002 - val_accuracy: 0.6436 - val_loss: 0.3926
Epoch 3/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 30ms/step - accuracy: 0.6354 - loss: 0.3928 - val_accuracy: 0.6436 - val_loss: 0.3934
Epoch 4/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 30ms/step - accuracy: 0.6167 - loss: 0.3987 - val_accuracy: 0.6436 - val_loss: 0.3917
Epoch 5/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 30ms/step - accuracy: 0.6330 - loss: 0.3877 - val_accuracy: 0.6373 - val_loss: 0.4003
Epoch 6/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 30ms/step - accuracy: 0.6252 - loss: 0.3860 - val_accuracy: 0.6373 - val_loss: 0.4001
Epoch 7/10
[1m100/100

In [None]:
model_1_cm, model_1_cmn,score = hp.determine_model_performance(model, x_test, test_label_data)

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step


In [214]:
model_1_cm

array([[495,  42, 101],
       [ 92,   4,  20],
       [174,  18,  46]])

In [215]:
model_1_cmn

array([[0.77586207, 0.06583072, 0.15830721],
       [0.79310345, 0.03448276, 0.17241379],
       [0.73109244, 0.07563025, 0.19327731]])

In [216]:
score

0.5493951612903226

In [193]:
model_2 = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=False  # Freeze embeddings, or set to True to fine-tune
    ),
    Dropout(0.2),  # Add dropout after the embedding layer
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.2),  # Add dropout after the LSTM layer
    Dense(64, activation='relu'),
    Dropout(0.2),  # Add dropout after the Dense layer
    Dense(4, activation="sigmoid")
])
model_2.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])


In [195]:
history_2 = model_2.fit(
    x_train, train_labels_cat,
    validation_split=0.2,
    epochs=10,
    batch_size=32
)

Epoch 1/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 31ms/step - accuracy: 0.6714 - loss: 0.3550 - val_accuracy: 0.6322 - val_loss: 0.4151
Epoch 2/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 31ms/step - accuracy: 0.6892 - loss: 0.3393 - val_accuracy: 0.6121 - val_loss: 0.4186
Epoch 3/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 31ms/step - accuracy: 0.6856 - loss: 0.3354 - val_accuracy: 0.5919 - val_loss: 0.4320
Epoch 4/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 31ms/step - accuracy: 0.7079 - loss: 0.3171 - val_accuracy: 0.5982 - val_loss: 0.4467
Epoch 5/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 34ms/step - accuracy: 0.7467 - loss: 0.2900 - val_accuracy: 0.5680 - val_loss: 0.4513
Epoch 6/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 31ms/step - accuracy: 0.7802 - loss: 0.2579 - val_accuracy: 0.5378 - val_loss: 0.5047
Epoch 7/10
[1m100/100

In [None]:
model_2_cm, model_2_cmn, score = hp.determine_model_performance(model_2, x_test, test_label_data)

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step


In [218]:
model_2_cm

array([[495,  42, 101],
       [ 92,   4,  20],
       [174,  18,  46]])

In [219]:
model_2_cmn

array([[0.77586207, 0.06583072, 0.15830721],
       [0.79310345, 0.03448276, 0.17241379],
       [0.73109244, 0.07563025, 0.19327731]])

In [220]:
score

0.5493951612903226

In [202]:
model_3 = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=False  # Freeze embeddings, or set to True to fine-tune
    ),
    Dropout(0.2),  # Dropout after embedding layer
    Bidirectional(LSTM(64, return_sequences=False, 
                       kernel_regularizer=l2(0.01))),  # L2 regularization
    Dropout(0.2),  # Dropout after LSTM
    Dense(64, activation='relu', kernel_regularizer=l2(0.01)),  # L2 on Dense layer
    Dropout(0.2),  # Dropout after Dense layer
    Dense(4, activation="sigmoid", kernel_regularizer=l2(0.01))  # L2 on final Dense
])
model_3.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [203]:
history_3 = model_3.fit(
    x_train, train_labels_cat,
    validation_split=0.2,
    epochs=10,
    batch_size=32
)

Epoch 1/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 36ms/step - accuracy: 0.5960 - loss: 4.2476 - val_accuracy: 0.6436 - val_loss: 0.7435
Epoch 2/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 31ms/step - accuracy: 0.6244 - loss: 0.6381 - val_accuracy: 0.6436 - val_loss: 0.4591
Epoch 3/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 31ms/step - accuracy: 0.6455 - loss: 0.4552 - val_accuracy: 0.6436 - val_loss: 0.4261
Epoch 4/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 31ms/step - accuracy: 0.6291 - loss: 0.4372 - val_accuracy: 0.6436 - val_loss: 0.4149
Epoch 5/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 31ms/step - accuracy: 0.6156 - loss: 0.4337 - val_accuracy: 0.6436 - val_loss: 0.4081
Epoch 6/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 31ms/step - accuracy: 0.6264 - loss: 0.4252 - val_accuracy: 0.6436 - val_loss: 0.4064
Epoch 7/10
[1m100/100

In [None]:
model_3_cm, model_3_cmn,score = hp.determine_model_performance(model_3, x_test, test_label_data)

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step


In [222]:
model_3_cm

array([[495,  42, 101],
       [ 92,   4,  20],
       [174,  18,  46]])

In [223]:
score

0.5493951612903226