This file forecasts derailment/subreddit using BERT.

# 1. Install requirements

In [61]:
model_name = "bert-base-uncased"
test_size = 0.2
labels = [True, False]  # replace with your real labels
num_labels = len(labels)
id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in enumerate(labels)}

# 2. Read and Prepare the Dataset

In [62]:
import pandas as pd
import pickle
import os

In [63]:
corpus = os.path.join('..', '..', 'Data', 'processed', 'email_label_dict_balanced.pkl')

In [64]:
# Load the balanced dictionary
with open(corpus, 'rb') as f:
    emails_and_tag = pickle.load(f)

print(f"Loaded {len(emails_and_tag):,} emails")

# Check class distribution
casual_count = sum(1 for v in emails_and_tag.values() if v == 0)
business_count = sum(1 for v in emails_and_tag.values() if v == 1)
print(f"Casual: {casual_count:,} | Business: {business_count:,}")
print(f"Ratio: 1:{business_count/casual_count:.1f}")


Loaded 28,536 emails
Casual: 14,268 | Business: 14,268
Ratio: 1:1.0


In [65]:
import random

emails = []
is_business = []

for key, value in emails_and_tag.items():
    emails.append(key)
    is_business.append(value) 


# Zip the lists together
combined = list(zip(emails, is_business))

# Shuffle the combined list
random.shuffle(combined)

# Unzip back into two lists
email_shuffled, is_buisness_shuffled = zip(*combined)

# Convert back to lists (optional)
email_shuffled = list(email_shuffled)
is_buisness_shuffled = list(is_buisness_shuffled)

pre_dataframe = {"email": email_shuffled, "labels":is_buisness_shuffled}
df = pd.DataFrame(pre_dataframe)

In [66]:
df.head()

Unnamed: 0,email,labels
0,---------------------- Forwarded by Matthew Le...,1
1,Subject: EBS GA Conference Call\n\nOur weekly ...,1
2,Subject: Re: morning 4/30\n\nI'm glad you had ...,0
3,"Subject: RE: ripple claims\n\nSteve,\n\tWe are...",1
4,Subject: Re: Mary Kay Makeup\n\nI have revised...,0


### Label Encoder

In [67]:
from sklearn import preprocessing

In [68]:
# le = preprocessing.LabelEncoder()
# le.fit(df["derails"].tolist())
# df['label'] = le.transform(df["derails"].tolist())

In [69]:
df.head()

Unnamed: 0,email,labels
0,---------------------- Forwarded by Matthew Le...,1
1,Subject: EBS GA Conference Call\n\nOur weekly ...,1
2,Subject: Re: morning 4/30\n\nI'm glad you had ...,0
3,"Subject: RE: ripple claims\n\nSteve,\n\tWe are...",1
4,Subject: Re: Mary Kay Makeup\n\nI have revised...,0


### Train/Test/Val Split

In [70]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

In [71]:
# First split off 20% for test
df_train_val, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Now split the remaining 80% into 60% train and 20% val
# 20% out of the remaining 80% is 0.25 of that chunk
df_train, df_val = train_test_split(df_train_val, test_size=0.25, random_state=42)

# Hugging Face format
dataset = DatasetDict({
    "train": Dataset.from_pandas(df_train.reset_index(drop=True)),
    "validation": Dataset.from_pandas(df_val.reset_index(drop=True)),
    "test": Dataset.from_pandas(df_test.reset_index(drop=True)),
})


### Convert to correct dataframe

In [72]:
train_dataset = dataset["train"]
val_dataset = dataset["validation"]
test_dataset = dataset["test"]


### Tokenizer

In [73]:
from transformers import AutoTokenizer

In [74]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    return tokenizer(examples["email"], truncation=True)

In [75]:
tokenized_train = train_dataset.map(preprocess_function, batched=True)

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 17121/17121 [00:03<00:00, 5163.90 examples/s]


In [76]:
tokenized_test = test_dataset.map(preprocess_function, batched=True)

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5708/5708 [00:01<00:00, 5546.60 examples/s]


# 3. Initialize Model

In [77]:
from transformers import AutoModelForSequenceClassification

In [78]:
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# 4. Train model

In [79]:
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np

In [80]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [81]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = metric.compute(predictions=predictions, references=labels)["accuracy"]
    return {"eval_accuracy": acc}

In [82]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    logging_strategy="epoch",
    eval_strategy="epoch",                 # âœ… clean & modern
    save_strategy="epoch",                 # âœ… matches eval
    save_total_limit=2,                    # âœ… keeps best 2 checkpoints
    load_best_model_at_end=True,           # âœ… will restore best version
    metric_for_best_model="eval_accuracy", # âœ… or use "eval_accuracy"
    greater_is_better=True,                # âœ… for accuracy
    report_to="none",
    learning_rate=3e-5,
    per_device_train_batch_size=128,       # for speed
    per_device_eval_batch_size=256,        # speed
    fp16=True,                             # speed
    warmup_steps=500,
    weight_decay=0.05,
)

trainer = Trainer(
    # model=model,
    model_init=model_init,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]  # ðŸ‘ˆ Add this line
)


  trainer = Trainer(
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [83]:
def compute_objective(metrics):
    return metrics["eval_accuracy"]


Now re-generate the model with the best hyperparameters

In [84]:
# 4. Retrain and save
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.598,0.451259,0.807814
2,0.3747,0.324563,0.871759
3,0.2817,0.273986,0.894008
4,0.2153,0.256921,0.89506
5,0.1413,0.301293,0.901016
6,0.0853,0.322288,0.902242
7,0.0481,0.396142,0.898914
8,0.0299,0.417348,0.908199
9,0.0173,0.441817,0.90925
10,0.0108,0.447068,0.908024


TrainOutput(global_step=1340, training_loss=0.18024753145317532, metrics={'train_runtime': 417.9101, 'train_samples_per_second': 409.681, 'train_steps_per_second': 3.206, 'total_flos': 4.50472437881856e+16, 'train_loss': 0.18024753145317532, 'epoch': 10.0})

In [85]:
# trainer.save_model('BERT')

# 5. Evaluate Model

In [86]:
from sklearn.metrics import classification_report

In [87]:
preds = trainer.predict(tokenized_train)
preds = np.argmax(preds[:3][0],axis=1)
GT = df_train['labels'].tolist()
print(classification_report(GT,preds))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8558
           1       1.00      1.00      1.00      8563

    accuracy                           1.00     17121
   macro avg       1.00      1.00      1.00     17121
weighted avg       1.00      1.00      1.00     17121



In [90]:
# output = trainer.predict(tokenized_test)
# logits = output.predictions  # raw model output before argmax
# preds = np.argmax(logits, axis=1)  # class with highest score

# GT = df_val['labels'].tolist()  
# print(classification_report(GT, preds))

output = trainer.predict(tokenized_test)

logits = output.predictions
preds = np.argmax(logits, axis=1)

# Use test set labels, not val set
GT = df_test['labels'].tolist()  # <-- Change df_val to df_test

print(classification_report(GT, preds))

              precision    recall  f1-score   support

           0       0.89      0.93      0.91      2832
           1       0.92      0.89      0.91      2876

    accuracy                           0.91      5708
   macro avg       0.91      0.91      0.91      5708
weighted avg       0.91      0.91      0.91      5708



In [91]:
trainer.evaluate()

{'eval_accuracy': 0.909250175192712,
 'eval_loss': 0.44181710481643677,
 'eval_runtime': 4.8771,
 'eval_samples_per_second': 1170.358,
 'eval_steps_per_second': 4.716,
 'epoch': 10.0}

In [92]:
trainer.save_model("./my_email_classifier")
tokenizer.save_pretrained("./my_email_classifier")

('./my_email_classifier/tokenizer_config.json',
 './my_email_classifier/special_tokens_map.json',
 './my_email_classifier/vocab.txt',
 './my_email_classifier/added_tokens.json',
 './my_email_classifier/tokenizer.json')