In [1]:
from utils import *
import numpy as np
import os
import matplotlib.pyplot as plt

### Open the preprocessed data, split into train, val and test

In [3]:
# One seed drives both the shuffle and the train/validation split, so a fresh
# "Restart & Run All" reproduces exactly the same partitions.
SEED = 2023

data = pd.read_csv("./data/spam_preprocessed.csv")
test_data = pd.read_csv("./data/test_data.csv")
gen_train = pd.read_csv("./data/generated_train_data.csv")

# Append the generated training rows to the preprocessed ones and shuffle the
# result. The shuffle is seeded: an unseeded `.sample(frac=1)` would change the
# row order (and therefore the train/val membership) on every run.
data = (
    pd.concat([data, gen_train], ignore_index=True)
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)

# Hold out 10% of the combined data for validation; the test set comes from its
# own file and is never mixed into the split.
train_data, val_data = train_test_split(data, test_size=0.1, random_state=SEED)

print(train_data.shape, val_data.shape, test_data.shape)

(5735, 2) (638, 2) (160, 2)


### Initialize embedding model

In [5]:
# Load the pretrained "bert-base-uncased" checkpoint as the embedding model.
# output_hidden_states=True asks the model to also return its hidden states.
embedding_model = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Get embeddings from the data

In [None]:
# Embed the "text" column of every split with the shared embedding model.
# The generator is consumed in order, so the splits are processed as
# train -> val -> test.
X_train, X_val, X_test = (
    create_embeddings(list(split["text"].values), embedding_model)
    for split in (train_data, val_data, test_data)
)

# Matching targets: the "ham_spam_encoded" column of each split.
y_train, y_test, y_val = (
    split["ham_spam_encoded"] for split in (train_data, test_data, val_data)
)

In [None]:
# Create the fully connected classifier that consumes the embeddings.
# input_num=768 is the size of each input vector (presumably the BERT-base
# hidden size produced by create_embeddings — confirm in utils).
model_fc = fc_model_softmax(input_num=768)
weights_path = "model/model.hdf5"  # path where the model weights are saved

# Make sure the target directory exists so the first save on a fresh checkout
# does not fail because "model/" is missing.
os.makedirs(os.path.dirname(weights_path), exist_ok=True)

# Overwrite any previously stored weights with the freshly initialised ones so
# that training starts from scratch. To continue from the weights already on
# disk instead, comment out the save_weights call below.
model_fc.save_weights(weights_path)
model_fc.load_weights(weights_path)

In [None]:
# Training hyperparameters: number of epochs, batch size and learning rate.
epochs, batch_size, learning_rate = 15, 64, 0.001

In [None]:
# Pass every feature set through to_tf, rebinding the same names.
# NOTE(review): the names are rebound in place, so re-running this cell applies
# to_tf to already-converted values — confirm to_tf tolerates that.
X_test, X_train, X_val = map(to_tf, (X_test, X_train, X_val))

In [None]:
# Bundle the train/validation features and labels in the order the trainer is
# called with here. A dedicated name is used on purpose: rebinding `data` would
# replace the DataFrame created in the loading cell with a tuple.
train_val_splits = (X_train, X_val, y_train, y_val)

# Fit the classifier; the returned history object is used later for the
# loss curves.
history = trainer(
    model_fc,
    train_val_splits,
    weights_path,
    batch_size,
    epochs,
    learning_rate=learning_rate,
)

In [None]:
# Confusion counts on the held-out test set: true positives, true negatives,
# false positives and false negatives.
tp, tn, fp, fn = get_metrics(model_fc, weights_path, X_test, y_test)


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero.

    Precision is undefined when nothing is predicted positive (tp + fp == 0)
    and recall is undefined when there are no positives (tp + fn == 0);
    reporting NaN keeps the cell from failing on a zero division.
    """
    return numerator / denominator if denominator else float("nan")


print(f"----- Accuracy = \t{_safe_ratio(tp + tn, tp + tn + fp + fn):.2%} -----")
print(f"----- Precision = \t{_safe_ratio(tp, tp + fp):.2%} -----")
print(f"----- Recall =  \t{_safe_ratio(tp, tp + fn):.2%} -----")

### Training and validation loss

In [None]:
# Draw the training and validation loss stored in the history object returned
# by the trainer, both on the same axes.
loss_curves = history.history
plt.plot(loss_curves["loss"])
plt.plot(loss_curves["val_loss"])
plt.ylabel("value")
plt.xlabel("epochs")
plt.legend(["train", "val"])
plt.show()

### Confusion matrix

In [None]:
# 2x2 confusion matrix built from the test-set counts.
# Rows are the true class and columns the predicted class, both ordered
# (ham, spam): the first row holds tn/fp, the second row holds fn/tp.
confusion_matrix = np.array([[tn, fp], [fn, tp]])

plt.figure(figsize=(6, 4))
class_labels = ["ham", "spam"]
sns.heatmap(
    confusion_matrix,
    annot=True,
    fmt="d",  # counts are rendered as integers
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
# Label the axes so the figure can be read on its own: without them it is
# ambiguous which axis is the prediction and which is the ground truth.
plt.xlabel("predicted label")
plt.ylabel("true label")
plt.title("Confusion matrix (test set)")
plt.show()