In [1]:
import joblib
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import json
import pandas as pd
import os
pd.set_option('display.max_colwidth', None)

Train Test Split

In [2]:
df = pd.DataFrame()
DATA_FOLDER="refined_labels"
DATA_FILE="final_labels.json"
subfolders=[f for f in os.listdir(DATA_FOLDER) if os.path.isdir(os.path.join(DATA_FOLDER, f))]
for subfolder in subfolders:
    subfolder_path=os.path.join(DATA_FOLDER,subfolder)
    data_file_path=os.path.join(subfolder_path,DATA_FILE)
    with open(data_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    df = pd.concat([pd.DataFrame(data),df],axis=0)


# clean data
df = df[df["label"] != "skip"]
df = df[["message", "label"]]
df["label"] = df["label"].map({"spam": 1, "ham": 0})


X = df["message"]
y = df["label"]
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle = True)
X_train, X_val, y_train, y_val =  train_test_split(X_trainval, y_trainval, test_size=0.125, random_state=41, shuffle = True)

The Baseline Model (Naive Bayes)

In [10]:
baseline = MultinomialNB()
vectorizer = TfidfVectorizer(stop_words="english", max_features=2000)
pipeline = make_pipeline(vectorizer, baseline)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_val)

print(f"\n===== {"Naive Bayes"} =====")
print(classification_report(y_val, y_pred))


===== Naive Bayes =====
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       544
           1       1.00      0.43      0.60        21

    accuracy                           0.98       565
   macro avg       0.99      0.71      0.79       565
weighted avg       0.98      0.98      0.97       565



Model selection

Some simple models

In [None]:
models = {
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "Linear SVM": LinearSVC()
}


for name, model in models.items():
    pipeline = make_pipeline(vectorizer, model)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    print(f"\n===== {name} =====")
    print(classification_report(y_val, y_pred))



===== Naive Bayes =====
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       544
           1       1.00      0.43      0.60        21

    accuracy                           0.98       565
   macro avg       0.99      0.71      0.79       565
weighted avg       0.98      0.98      0.97       565


===== Random Forest =====
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       544
           1       0.94      0.81      0.87        21

    accuracy                           0.99       565
   macro avg       0.97      0.90      0.93       565
weighted avg       0.99      0.99      0.99       565


===== Linear SVM =====
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       544
           1       1.00      0.76      0.86        21

    accuracy                           0.99       565
   macro avg       1.00      0.88      0.93       56

Both Linear SVM and Random Forest behaves better than the baseline model. However, recall is more important in spam filtering, so Linear SVM behaves better than Random Forest.

More advance model

In [3]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset, DataLoader

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(list(texts), truncation=True, padding=True, max_length=128)
        self.labels = list(labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [5]:
train_dataset = TextDataset(X_train, y_train)
val_dataset = TextDataset(X_val, y_val)


In [6]:
training_args = TrainingArguments(
    output_dir="./bert_output",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    learning_rate=2e-5
)

trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)




In [7]:
trainer.train()


  0%|          | 0/990 [00:00<?, ?it/s]

  0%|          | 0/71 [00:00<?, ?it/s]

{'eval_loss': 0.06882470101118088, 'eval_runtime': 157.7793, 'eval_samples_per_second': 3.581, 'eval_steps_per_second': 0.45, 'epoch': 1.0}
{'loss': 0.115, 'grad_norm': 0.059612274169921875, 'learning_rate': 9.8989898989899e-06, 'epoch': 1.01}


  0%|          | 0/71 [00:00<?, ?it/s]

{'eval_loss': 0.04660175368189812, 'eval_runtime': 99.9497, 'eval_samples_per_second': 5.653, 'eval_steps_per_second': 0.71, 'epoch': 2.0}
{'train_runtime': 7106.3028, 'train_samples_per_second': 1.113, 'train_steps_per_second': 0.139, 'train_loss': 0.08548975279836944, 'epoch': 2.0}


TrainOutput(global_step=990, training_loss=0.08548975279836944, metrics={'train_runtime': 7106.3028, 'train_samples_per_second': 1.113, 'train_steps_per_second': 0.139, 'total_flos': 520302111974400.0, 'train_loss': 0.08548975279836944, 'epoch': 2.0})

In [8]:
pred = trainer.predict(val_dataset)
y_pred_bert = pred.predictions.argmax(axis=-1)
print("\n===== BERT Transformer =====")
print(classification_report(y_val, y_pred_bert))

  0%|          | 0/71 [00:00<?, ?it/s]


===== BERT Transformer =====
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       544
           1       1.00      0.67      0.80        21

    accuracy                           0.99       565
   macro avg       0.99      0.83      0.90       565
weighted avg       0.99      0.99      0.99       565



The bert model performs no better than the linear SVM model, probably because the dataset is not large enough. We adopt the linear SVM model as our final model.

In [12]:
final_model = LinearSVC()
final_pipeline = make_pipeline(vectorizer, final_model)
final_pipeline.fit(X_train, y_train)
y_pred = final_pipeline.predict(X_test)

print(f"\n===== {"Linear SVM"} =====")
print(classification_report(y_test, y_pred))


===== Linear SVM =====
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1083
           1       0.95      0.79      0.86        47

    accuracy                           0.99      1130
   macro avg       0.97      0.89      0.93      1130
weighted avg       0.99      0.99      0.99      1130

