<a href="https://colab.research.google.com/github/brianbamboo/email-classification-bert/blob/main/email_classification_bert_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Email Classification with BERT (Hugging Face Transformers)


In [31]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset, DatasetDict
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import numpy as np

First, let's load the dataset.

In [2]:
# Folder that holds the dataset CSV, relative to the working directory.
# The trailing slash matters: the file name is appended by plain concatenation.
# NOTE(review): presumably requires Google Drive to be mounted under ./drive
# first — no mount call is visible in this notebook; confirm.
colab_dataset_path = 'drive/MyDrive/colab/datasets/'

In [3]:
# Load the raw spam dataset. latin-1 decoding is requested explicitly
# (presumably the file is not valid UTF-8 — TODO confirm).
spam_csv_path = f"{colab_dataset_path}spam.csv"
df = pd.read_csv(spam_csv_path, encoding='latin-1')

In [4]:
# Preview the first rows of the raw frame to see which columns the CSV produced.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
# Encode the string class in `v1` as an integer target: spam -> 1, ham -> 0.
# Any other value maps to NaN, so extend the mapping if the data changes.
label_by_class = {'ham': 0, 'spam': 1}
df['label'] = df['v1'].map(label_by_class)

It looks like there is text in columns 2-5, but they are split up unnecessarily. We will concatenate them using a lambda function and then select just `label` and `text` columns for further processing.

In [6]:
# Reassemble each message from `v2` plus the three overflow columns
# ("Unnamed: 2" .. "Unnamed: 4") that the CSV parser split it into.
# Columns are selected by name rather than by position so the result does not
# depend on where other columns (e.g. `label`) sit in the frame.
text_columns = ['v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
df['text'] = df[text_columns].apply(
    # Drop the empty (NaN) pieces, then join the rest with the comma that was
    # consumed as a field separator; astype(str) guards against non-string cells.
    lambda row: ','.join(row.dropna().astype(str)),
    axis=1,
)
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,label,text
0,ham,"Go until jurong point, crazy.. Available only ...",,,,0,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...,,,,0,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...,,,,0,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,,0,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Keep only what the model needs: the integer target and the reassembled text.
model_columns = ['label', 'text']
processed_df = df[model_columns]
processed_df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


# Split dataset into test and training sets

In [8]:
# Step 1: Split raw data 80/20, stratified on the label so both splits keep the
# same spam/ham ratio. A fixed random_state makes the split (and therefore every
# downstream result) reproducible when the notebook is re-run from scratch.
train_df, test_df = train_test_split(
    processed_df,
    test_size=0.2,
    stratify=processed_df['label'],
    random_state=42,
)

# Step 2: Convert to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Step 3: Combine into DatasetDict so both splits can be processed together
raw_dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

# Tokenize dataset

In [23]:
# Create the tokenizer.
# Full-size alternative: tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Use a tiny BERT checkpoint instead to keep compute requirements low.
# Another small alternative: tokenizer = BertTokenizer.from_pretrained('google/bert_uncased_L-2_H-128_A-2')

# `model_name` is reused later to load the matching classification model.
model_name = "prajjwal1/bert-tiny"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [24]:
# Padding and truncation ensure that all input sequences have the same fixed length.
# BERT models only accept sequences up to a maximum length (512 tokens for the
# original BERT checkpoints), so longer inputs must be truncated before they
# reach the model. Shorter inputs are padded so examples can be batched together.
# A much smaller length is used here because the messages are short.
def tokenize(example, max_length=63):
    """Tokenize the ``text`` field of one example or a batch of examples.

    Args:
        example: Mapping with a ``text`` entry (a string, or a list of strings
            when called through ``Dataset.map(..., batched=True)``).
        max_length: Number of tokens in every returned sequence; shorter inputs
            are padded up to it and longer inputs are truncated down to it.

    Returns:
        The tokenizer output for the text (it adds the ``input_ids``,
        ``token_type_ids`` and ``attention_mask`` columns to the dataset).
    """
    return tokenizer(
        example['text'],
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )


# Apply `tokenize` to both splits; batched=True hands the function batches of
# rows per call instead of one row at a time.
tokenized_dataset = raw_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/4457 [00:00<?, ? examples/s]

Map:   0%|          | 0/1115 [00:00<?, ? examples/s]

# What is a Hugging Face dataset?

A Hugging Face dataset is a structured collection of data, primarily for machine learning, organized as a table of rows and named columns (hence the SQL-like feel) and accessible through the Hugging Face platform and the `datasets` library. The platform provides a central hub for users to share, collaborate on, and access datasets for various machine learning tasks, particularly in the fields of NLP, computer vision, and audio.

In [25]:
# Inspect the splits, their row counts and the columns added by tokenization.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4457
    })
    test: Dataset({
        features: ['label', 'text', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1115
    })
})

* `input_ids` - array of integers identifying the tokens in the input sequence themselves (numerically)
* `token_type_ids` - array of integers (0, 1) identifying/separating sequences when necessary (e.g. if a question and its answer are in the same input sequence, the question might be marked with 0s and the answer with 1s)
* `attention_mask` - array of 0s and 1s where 0s represent padding, 1s represent "real" tokens

In [26]:
# Indexing a split by position returns one example as a dictionary, so the
# tokenized dataset behaves much like a list of dictionaries.
# Print every field of the first training example, name first, then its value.
first_example = tokenized_dataset['train'][0]
for field, value in first_example.items():
    print(field, '\t\n\n', value, '\n\n')

label 	

 1 


text 	

 You have 1 new voicemail. Please call 08719181503 


__index_level_0__ 	

 1195 


input_ids 	

 [101, 2017, 2031, 1015, 2047, 2376, 21397, 1012, 3531, 2655, 5511, 2581, 16147, 15136, 16068, 2692, 2509, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 


token_type_ids 	

 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 


attention_mask 	

 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 




# What is a BertTokenizer?

Anyway, before proceeding, I should at least try to understand what the `BertTokenizer` did. Let me do some research on what a `BertTokenizer` is.

A BERT tokenizer is a component that prepares text for processing by a BERT model. It's essentially a translator that converts words and phrases into numerical representations (tokens) that BERT can understand, for use in tasks like language understanding and generation.

### Hold on... what is BERT?

BERT stands for Bidirectional Encoder Representations from Transformers. It is a powerful language model in Natural Language Processing (NLP) designed to understand the context of text by considering both left and right context. It leverages the transformer architecture to create deep bidirectional representations of text, enabling it to excel at a wide variety of NLP tasks.

Okay wait, this is bringing back some memories of my machine learning + AI course where I did an NLP project and implemented byte pair encoding from scratch. Wow, I really am rusty, eh? Anyway, back to the original question.

# Back to BertTokenizer

BERT's tokenizer uses the WordPiece algorithm, which breaks down words into subword units, allowing the model to handle out-of-vocabulary words and more effectively represent related words.

## What is the difference between WordPiece and BPE?

BPE is one of the subword tokenization algorithms I used in my machine translation project in grad school. I am curious how it differs from WordPiece. It looks like they differ in the following areas:

### BPE (Byte Pair Encoding)
* Chooses merges based on optimizing $P(A,B)$.
* Chooses merges based on finding the most frequent byte pair.

### WordPiece
* Is a modified version of BPE (byte pair encoding).
* Chooses merges based on optimizing $$\frac{P(A,B)}{P(A)*P(B)}$$
* At a high level, the intuition is that if the byte pair AB appears together frequently relative to how often A and B appear individually, then it is more likely that this byte pair should be merged. However, if A and B appear separately far more frequently than they appear as a pair, then this pair is less likely to be merged than another byte pair.

In [27]:
# Full-size alternative: model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
# Load the same checkpoint as the tokenizer, with a 2-class head (0 = ham, 1 = spam).
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
# Hyperparameters and bookkeeping options for fine-tuning.
training_args = TrainingArguments(
    output_dir="./results",            # directory for training outputs
    num_train_epochs=3,
    per_device_train_batch_size=16,
    weight_decay=0.01,
    eval_strategy="epoch",             # run evaluation once per epoch
    report_to="none",                  # do not report to any experiment tracker
)

In [29]:
# Wire the model, the training arguments and both tokenized splits into a Trainer.
# Note that the test split doubles as the per-epoch evaluation set.
train_split = tokenized_dataset["train"]
eval_split = tokenized_dataset["test"]
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

In [30]:
# Fine-tune the model on the training split; evaluation on the test split
# runs once per epoch, as configured in the training arguments.
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.071823
2,0.141900,0.058353
3,0.141900,0.05789


TrainOutput(global_step=837, training_loss=0.10417868045638254, metrics={'train_runtime': 201.4431, 'train_samples_per_second': 66.376, 'train_steps_per_second': 4.155, 'total_flos': 2090281209660.0, 'train_loss': 0.10417868045638254, 'epoch': 3.0})

In [32]:
# Score the test split: pick the highest-scoring class for each example and
# compare those predictions with the true labels.
predictions = trainer.predict(tokenized_dataset["test"])
preds = np.argmax(predictions.predictions, axis=1)
true_labels = tokenized_dataset["test"]["label"]
print(classification_report(true_labels, preds))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       966
           1       0.95      0.95      0.95       149

    accuracy                           0.99      1115
   macro avg       0.97      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115

