#### **NLP Final Project**

Wilson Neira

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, Concatenate
from tensorflow.keras.utils import set_random_seed


1. Load and Preprocess Data

In [3]:
# Load the pre-clustered train/test splits (paths are relative to the
# notebook's working directory).
train_df = pd.read_csv("train_data_with_clusters.csv")
test_df = pd.read_csv("test_data_with_clusters.csv")

# Coerce the text column to plain strings. A missing email is read by pandas
# as a float NaN, which the tokenizer cannot lower-case; map it to "" instead.
train_texts = train_df['email'].fillna("").astype(str)
test_texts = test_df['email'].fillna("").astype(str)

# Encode labels as integers. LabelEncoder numbers classes in sorted order, so
# with 'ham'/'spam' label strings this yields ham:0, spam:1 (check le.classes_).
le = LabelEncoder()
train_labels = le.fit_transform(train_df['label'])
test_labels = le.transform(test_df['label'])  # raises ValueError on a label unseen in train

# Prepare tokenizer (fit on the training text only, so no test vocabulary leaks in).
# num_words=N keeps word ids 1..N-1; id 0 is never assigned to a word and is
# what pad_sequences uses as filler.
vocab_size = 5000
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_texts)

# Text to sequences of word ids (words outside the kept vocabulary are dropped)
X_train_seq = tokenizer.texts_to_sequences(train_texts)
X_test_seq = tokenizer.texts_to_sequences(test_texts)

# Pad/truncate every sequence to a fixed length. Zeros are appended at the end
# ('post'); sequences longer than max_len are cut from the front, which is the
# pad_sequences default (truncating='pre').
max_len = 200
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')


2. Bi-LSTM Baseline (no clusters)

In [4]:
# Seed Python, NumPy and the Keras backend so that weight initialisation and
# batch shuffling are repeatable when this cell is re-run. Without it, the
# small accuracy gaps between the models in this notebook cannot be told
# apart from run-to-run noise. (Some GPU kernels may still be nondeterministic.)
set_random_seed(42)

# Model definition: token ids -> embeddings -> Bi-LSTM -> sigmoid probability
input_text = Input(shape=(max_len,))
# Size the embedding table from the tokenizer so the two cannot drift apart
# (the tokenizer emits ids in [0, num_words)).
embedding = Embedding(input_dim=tokenizer.num_words, output_dim=128)(input_text)
# NOTE(review): inputs are post-padded and the embedding is not masked, so the
# LSTM also reads the trailing pad ids - consider Embedding(..., mask_zero=True).
x = Bidirectional(LSTM(64))(embedding)
output = Dense(1, activation='sigmoid')(x)

model_baseline = Model(inputs=input_text, outputs=output)
model_baseline.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train; Keras holds out the last 10% of the training rows (taken before
# shuffling) for validation.
model_baseline.fit(X_train_pad, train_labels, epochs=5, batch_size=64, validation_split=0.1)

# Evaluate: threshold the predicted probability at 0.5 to get class ids
predictions = (model_baseline.predict(X_test_pad) > 0.5).astype("int32")
print("Baseline Bi-LSTM Classification Report:")
print(classification_report(test_labels, predictions, target_names=le.classes_))


Epoch 1/5
380/380 ━━━━━━━━━━━━━━━━━━━━ 58s 145ms/step - accuracy: 0.8677 - loss: 0.2935 - val_accuracy: 0.9867 - val_loss: 0.0430
Epoch 2/5
380/380 ━━━━━━━━━━━━━━━━━━━━ 53s 139ms/step - accuracy: 0.9905 - loss: 0.0309 - val_accuracy: 0.9881 - val_loss: 0.0353
Epoch 3/5
380/380 ━━━━━━━━━━━━━━━━━━━━ 56s 147ms/step - accuracy: 0.9944 - loss: 0.0202 - val_accuracy: 0.9818 - val_loss: 0.0536
Epoch 4/5
380/380 ━━━━━━━━━━━━━━━━━━━━ 58s 153ms/step - accuracy: 0.9956 - loss: 0.0149 - val_accuracy: 0.9870 - val_loss: 0.0578
Epoch 5/5
380/380 ━━━━━━━━━━━━━━━━━━━━ 59s 156ms/step - accuracy: 0.9976 - loss: 0.0086 - val_accuracy: 0.9896 - val_loss: 0.0407
211/211 ━━━━━━━━━━━━━━━━━━━━ 8s 35ms/step
Baseline Bi-LSTM Classification Report:
              precision    recall  f1-score   support

         

3. Bi-LSTM with K-Means Cluster Features

In [None]:
# Seed Python, NumPy and the Keras backend so that weight initialisation and
# batch shuffling are repeatable when this cell is re-run; otherwise the gap
# to the other models in this notebook is confounded by run-to-run noise.
# (Some GPU kernels may still be nondeterministic.)
set_random_seed(42)

# Prepare cluster features (one-hot encoding). The encoder is fitted on the
# training split only; a cluster id that appears only in the test split is
# encoded as an all-zero row because of handle_unknown='ignore'.
cluster_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Use the K-Means cluster assignment column as the extra feature
train_cluster_feat = cluster_encoder.fit_transform(train_df[['kmeans_cluster']])
test_cluster_feat = cluster_encoder.transform(test_df[['kmeans_cluster']])

# Model definition (text + cluster): the text branch mirrors the baseline Bi-LSTM
input_text = Input(shape=(max_len,))
# Size the embedding table from the tokenizer so the two cannot drift apart
# (the tokenizer emits ids in [0, num_words)).
embedding = Embedding(input_dim=tokenizer.num_words, output_dim=128)(input_text)
x = Bidirectional(LSTM(64))(embedding)

# Cluster input: one column per cluster id seen during fit
input_cluster = Input(shape=(train_cluster_feat.shape[1],))

# Concatenate the one-hot cluster vector with the Bi-LSTM output
concatenated = Concatenate()([x, input_cluster])

# Single sigmoid unit on top of the joined features
output = Dense(1, activation='sigmoid')(concatenated)

model_clusters = Model(inputs=[input_text, input_cluster], outputs=output)
model_clusters.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train; inputs are passed in the same order as Model(inputs=[...])
model_clusters.fit(
    [X_train_pad, train_cluster_feat],
    train_labels,
    epochs=5,
    batch_size=64,
    validation_split=0.1
)

# Evaluate: threshold the predicted probability at 0.5 to get class ids
predictions = (model_clusters.predict([X_test_pad, test_cluster_feat]) > 0.5).astype("int32")
print("Bi-LSTM + Clusters Classification Report:")
print(classification_report(test_labels, predictions, target_names=le.classes_))


Epoch 1/5
380/380 ━━━━━━━━━━━━━━━━━━━━ 59s 146ms/step - accuracy: 0.8921 - loss: 0.2321 - val_accuracy: 0.9889 - val_loss: 0.0376
Epoch 2/5
380/380 ━━━━━━━━━━━━━━━━━━━━ 54s 143ms/step - accuracy: 0.9912 - loss: 0.0281 - val_accuracy: 0.9863 - val_loss: 0.0366
Epoch 3/5
380/380 ━━━━━━━━━━━━━━━━━━━━ 55s 144ms/step - accuracy: 0.9947 - loss: 0.0170 - val_accuracy: 0.9878 - val_loss: 0.0434
Epoch 4/5
380/380 ━━━━━━━━━━━━━━━━━━━━ 55s 144ms/step - accuracy: 0.9963 - loss: 0.0109 - val_accuracy: 0.9874 - val_loss: 0.0370
Epoch 5/5
380/380 ━━━━━━━━━━━━━━━━━━━━ 55s 146ms/step - accuracy: 0.9973 - loss: 0.0093 - val_accuracy: 0.9855 - val_loss: 0.0453
211/211 ━━━━━━━━━━━━━━━━━━━━ 7s 34ms/step
Bi-LSTM + Clusters Classification Report:
              precision    recall  f1-score   support

       

4. Bi-LSTM with Hierarchical Cluster Features

In [7]:
# Seed Python, NumPy and the Keras backend so that weight initialisation and
# batch shuffling are repeatable when this cell is re-run; otherwise the gap
# to the other models in this notebook is confounded by run-to-run noise.
# (Some GPU kernels may still be nondeterministic.)
set_random_seed(42)

# Prepare cluster features (one-hot encoding). The encoder is fitted on the
# training split only; a cluster id that appears only in the test split is
# encoded as an all-zero row because of handle_unknown='ignore'.
# Note: cluster_encoder, the *_cluster_feat arrays, model_clusters and
# predictions are rebound here, replacing any earlier values of those names.
cluster_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Use the hierarchical-clustering assignment column as the extra feature
train_cluster_feat = cluster_encoder.fit_transform(train_df[['hierarchical_cluster']])
test_cluster_feat = cluster_encoder.transform(test_df[['hierarchical_cluster']])

# Model definition (text + cluster): the text branch mirrors the baseline Bi-LSTM
input_text = Input(shape=(max_len,))
# Size the embedding table from the tokenizer so the two cannot drift apart
# (the tokenizer emits ids in [0, num_words)).
embedding = Embedding(input_dim=tokenizer.num_words, output_dim=128)(input_text)
x = Bidirectional(LSTM(64))(embedding)

# Cluster input: one column per cluster id seen during fit
input_cluster = Input(shape=(train_cluster_feat.shape[1],))

# Concatenate the one-hot cluster vector with the Bi-LSTM output
concatenated = Concatenate()([x, input_cluster])

# Single sigmoid unit on top of the joined features
output = Dense(1, activation='sigmoid')(concatenated)

model_clusters = Model(inputs=[input_text, input_cluster], outputs=output)
model_clusters.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train; inputs are passed in the same order as Model(inputs=[...])
model_clusters.fit(
    [X_train_pad, train_cluster_feat],
    train_labels,
    epochs=5,
    batch_size=64,
    validation_split=0.1
)

# Evaluate: threshold the predicted probability at 0.5 to get class ids
predictions = (model_clusters.predict([X_test_pad, test_cluster_feat]) > 0.5).astype("int32")
print("Bi-LSTM + Hierarchical Clusters Classification Report:")
print(classification_report(test_labels, predictions, target_names=le.classes_))


Epoch 1/5
380/380 ━━━━━━━━━━━━━━━━━━━━ 56s 139ms/step - accuracy: 0.8949 - loss: 0.2359 - val_accuracy: 0.9874 - val_loss: 0.0378
Epoch 2/5
380/380 ━━━━━━━━━━━━━━━━━━━━ 53s 139ms/step - accuracy: 0.9919 - loss: 0.0268 - val_accuracy: 0.9896 - val_loss: 0.0301
Epoch 3/5
380/380 ━━━━━━━━━━━━━━━━━━━━ 53s 140ms/step - accuracy: 0.9957 - loss: 0.0145 - val_accuracy: 0.9904 - val_loss: 0.0307
Epoch 4/5
380/380 ━━━━━━━━━━━━━━━━━━━━ 53s 140ms/step - accuracy: 0.9971 - loss: 0.0101 - val_accuracy: 0.9900 - val_loss: 0.0312
Epoch 5/5
380/380 ━━━━━━━━━━━━━━━━━━━━ 56s 146ms/step - accuracy: 0.9984 - loss: 0.0060 - val_accuracy: 0.9889 - val_loss: 0.0352
211/211 ━━━━━━━━━━━━━━━━━━━━ 7s 34ms/step
Bi-LSTM + Hierarchical Clusters Classification Report:
              precision    recall  f1-score   sup