In [1]:
from IPython.display import clear_output

In [2]:
# Install the Hugging Face `transformers` package into the runtime, then clear
# the cell output so pip's installation log is not kept in the notebook.
!pip install transformers
clear_output()

In [3]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import AdamW

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix
import seaborn as sns

import transformers
from transformers import AutoModel, BertTokenizerFast

In [7]:
# Run on the GPU when one is available; otherwise fall back to the CPU so the
# notebook still executes end-to-end on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Location of the SMS spam CSV inside the runtime.
dataset_path = '/content/spam.csv'

## Loading Dataset

In [11]:
# Read the latin-1 encoded CSV and discard the three unnamed columns in one chain,
# then preview the first rows.
df = (
    pd.read_csv(dataset_path, encoding='latin-1')
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
# Encode the label column: ham -> 0, spam -> 1.
# Only `v1` is touched, so a message whose whole text happens to be "ham" or
# "spam" is left intact (a frame-wide `df.replace` would rewrite it to a number).
# The identity entries (0 -> 0, 1 -> 1) keep the cell safe to re-run on an
# already-encoded frame, and `astype(int)` raises on any unexpected label
# instead of silently carrying a NaN into the tensors built later.
df = df.assign(v1=df['v1'].map({'ham': 0, 'spam': 1, 0: 0, 1: 1}).astype(int))

In [14]:
# Stage 1: stratified 80/20 split -> training set vs. a held-out pool.
train_text, temp_text, train_labels, temp_labels = train_test_split(
    df['v2'],
    df['v1'],
    test_size=0.2,
    random_state=42,
    stratify=df['v1'],
)

# Stage 2: halve the held-out pool (again stratified) -> validation and test sets.
val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text,
    temp_labels,
    test_size=0.5,
    random_state=10,
    stratify=temp_labels,
)

In [15]:
# Load the pretrained encoder and the fast tokenizer; both are created from the
# same 'bert-base-uncased' checkpoint name.
bert = AutoModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [17]:
# Whitespace-delimited word count of every message in the full dataset.
seq_len = pd.Series([len(message.split()) for message in df['v2']])

# Pad/truncate length: midpoint between the shortest and longest word counts,
# truncated to an int.
padding_length = int((seq_len.min() + seq_len.max()) / 2)

In [18]:
def encode_texts(texts):
    """Tokenize a pandas Series of messages into fixed-length BERT inputs.

    Calls the tokenizer directly (its ``__call__``) rather than the deprecated
    ``batch_encode_plus``; every sequence is padded or truncated to the
    notebook-level ``padding_length``.

    Args:
        texts: pandas Series of raw message strings.

    Returns:
        The tokenizer's encoding for the batch; the cells below read its
        ``'input_ids'`` and ``'attention_mask'`` entries.
    """
    return tokenizer(
        texts.tolist(),
        max_length=padding_length,
        padding='max_length',
        truncation=True,
    )


# Same encoding settings for all three splits.
tokens_train = encode_texts(train_text)
tokens_val = encode_texts(val_text)
tokens_test = encode_texts(test_text)

In [19]:
# Training split: token ids, attention masks, labels.
train_seq, train_mask = (
    torch.tensor(tokens_train[field]) for field in ('input_ids', 'attention_mask')
)
train_y = torch.tensor(train_labels.tolist())

# Validation split.
val_seq, val_mask = (
    torch.tensor(tokens_val[field]) for field in ('input_ids', 'attention_mask')
)
val_y = torch.tensor(val_labels.tolist())

# Test split.
test_seq, test_mask = (
    torch.tensor(tokens_test[field]) for field in ('input_ids', 'attention_mask')
)
test_y = torch.tensor(test_labels.tolist())

In [20]:
# Number of samples per mini-batch for both loaders.
batch_size = 32

# Training loader: the RandomSampler draws samples in random order.
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Validation loader: the SequentialSampler keeps dataset order.
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(
    val_data,
    batch_size=batch_size,
    sampler=val_sampler,
)

## Model Architecture

In [21]:
# Turn off gradient tracking for every BERT weight so the encoder stays fixed.
for param in bert.parameters():
    param.requires_grad_(False)

In [22]:
class BERT_SpamClassifier(nn.Module):
    """Two-layer classification head on top of a BERT encoder.

    The encoder's pooled output goes through Linear -> ReLU -> Dropout -> Linear
    -> LogSoftmax, so ``forward`` returns per-class log-probabilities.

    Args:
        bert: Encoder module. Calling it as ``bert(ids, attention_mask=mask)``
            must return an object exposing ``pooler_output``.
        hidden_dim: Width of the intermediate linear layer (default 512).
        num_classes: Number of output classes (default 2).
        dropout: Dropout probability applied after the ReLU (default 0.1).
    """

    def __init__(self, bert, hidden_dim=512, num_classes=2, dropout=0.1):
        super().__init__()
        self.bert = bert
        # Width of the encoder's pooled output: taken from the encoder config
        # when it has one, otherwise 768 (the value previously hard-coded here).
        encoder_dim = getattr(getattr(bert, 'config', None), 'hidden_size', 768)
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(encoder_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)
        # Named `softmax` for compatibility, but it is a LogSoftmax: outputs are
        # log-probabilities, to be paired with an NLL-style loss.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return log-probabilities of shape (batch, num_classes).

        Args:
            sent_id: Token-id tensor passed to the encoder.
            mask: Attention mask passed to the encoder as ``attention_mask``.
        """
        bert_output = self.bert(sent_id, attention_mask=mask)
        pooled_output = bert_output.pooler_output
        # Functional ReLU: avoids building a fresh nn.ReLU module on every call.
        x = torch.relu(self.fc1(pooled_output))
        x = self.dropout(x)
        x = self.fc2(x)
        return self.softmax(x)

In [23]:
# Wrap the BERT encoder with the classification head and move it to the target device.
spamClassifier = BERT_SpamClassifier(bert).to(device)

In [24]:
# AdamW over the classifier's parameters with a learning rate of 2e-5.
optimizer = AdamW(params=spamClassifier.parameters(), lr=2e-5)

In [26]:
# Counter the class imbalance: 'balanced' weights computed from the training
# labels give the less frequent class a larger weight.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels,
)
print("Class Weights:", class_weights)

Class Weights: [0.57748121 3.72658863]


In [27]:
# Class weights as a float tensor on the training device.
weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# The model emits log-probabilities (LogSoftmax), so the matching criterion is
# NLLLoss; the class weights are passed to it as `weight`.
cross_entropy = nn.NLLLoss(weight=weights)

# Number of passes over the training data.
epochs = 15

In [29]:
def train():
    """Run one training epoch over ``train_dataloader``.

    Uses the notebook-level ``spamClassifier``, ``optimizer``, ``cross_entropy``
    and ``device``. Prints a progress line every 50 batches.

    Returns:
        tuple: ``(avg_loss, total_preds)`` where ``avg_loss`` is the mean of the
        per-batch losses and ``total_preds`` is a NumPy array holding the model
        outputs of every batch, concatenated along axis 0 in the order the
        batches were drawn.
    """
    # Training mode: dropout layers are active.
    spamClassifier.train()
    num_batches = len(train_dataloader)
    total_loss = 0
    # Per-batch model outputs, collected on the CPU.
    total_preds = []

    for step, batch in enumerate(train_dataloader):
        # Progress update after every 50 batches (skipping the very first one).
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, num_batches))

        # Move the batch to the training device and unpack it.
        sent_id, mask, labels = [r.to(device) for r in batch]

        # Clear gradients left over from the previous step.
        spamClassifier.zero_grad()

        # Forward pass and loss for this batch.
        preds = spamClassifier(sent_id, mask)
        loss = cross_entropy(preds, labels)
        total_loss += loss.item()

        # Backward pass, gradient-norm clipping at 1.0, then the parameter update.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(spamClassifier.parameters(), 1.0)
        optimizer.step()

        # Keep a detached CPU copy of the outputs.
        total_preds.append(preds.detach().cpu().numpy())

    # Mean per-batch loss of the epoch.
    avg_loss = total_loss / num_batches

    # Stack the per-batch outputs into one (number of samples, number of classes) array.
    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [30]:
def evaluate():
    """Score the classifier on ``val_dataloader`` without updating any weights.

    Uses the notebook-level ``spamClassifier``, ``cross_entropy`` and ``device``.

    Returns:
        tuple: ``(avg_loss, total_preds)`` where ``avg_loss`` is the mean of the
        per-batch validation losses and ``total_preds`` is a NumPy array of the
        model outputs for all batches, concatenated along axis 0.
    """
    print("\nEvaluating...")
    # Evaluation mode: dropout layers are switched off.
    spamClassifier.eval()

    num_batches = len(val_dataloader)
    running_loss = 0
    collected = []

    # No gradients are needed anywhere in this loop.
    with torch.no_grad():
        for step, batch in enumerate(val_dataloader):
            # Progress update every 50 batches (skipping the very first one).
            if step != 0 and step % 50 == 0:
                print('  Batch {:>5,}  of  {:>5,}.'.format(step, num_batches))

            # Move the batch to the device and unpack it.
            sent_id, mask, labels = (t.to(device) for t in batch)

            # Forward pass and validation loss for this batch.
            outputs = spamClassifier(sent_id, mask)
            running_loss += cross_entropy(outputs, labels).item()

            collected.append(outputs.detach().cpu().numpy())

    # Mean per-batch validation loss of the epoch.
    avg_loss = running_loss / num_batches

    # Stack the per-batch outputs into one (number of samples, number of classes) array.
    total_preds = np.concatenate(collected, axis=0)

    return avg_loss, total_preds

In [31]:
# Lowest validation loss seen so far; starting at +inf means the first epoch
# always writes a checkpoint.
best_valid_loss = float('inf')

# Per-epoch loss history.
train_losses, valid_losses = [], []

for epoch in range(epochs):
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    # One training pass, then one validation pass; the returned predictions are
    # not used here.
    train_loss, _ = train()
    valid_loss, _ = evaluate()

    # Record both losses for this epoch.
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    # Checkpoint the weights whenever the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(spamClassifier.state_dict(), 'saved_weights.pt')

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')


 Epoch 1 / 15
  Batch    50  of    140.
  Batch   100  of    140.

Evaluating...

Training Loss: 0.646
Validation Loss: 0.590

 Epoch 2 / 15
  Batch    50  of    140.
  Batch   100  of    140.

Evaluating...

Training Loss: 0.543
Validation Loss: 0.499

 Epoch 3 / 15
  Batch    50  of    140.
  Batch   100  of    140.

Evaluating...

Training Loss: 0.460
Validation Loss: 0.435

 Epoch 4 / 15
  Batch    50  of    140.
  Batch   100  of    140.

Evaluating...

Training Loss: 0.410
Validation Loss: 0.390

 Epoch 5 / 15
  Batch    50  of    140.
  Batch   100  of    140.

Evaluating...

Training Loss: 0.356
Validation Loss: 0.351

 Epoch 6 / 15
  Batch    50  of    140.
  Batch   100  of    140.

Evaluating...

Training Loss: 0.328
Validation Loss: 0.329

 Epoch 7 / 15
  Batch    50  of    140.
  Batch   100  of    140.

Evaluating...

Training Loss: 0.307
Validation Loss: 0.306

 Epoch 8 / 15
  Batch    50  of    140.
  Batch   100  of    140.

Evaluating...

Training Loss: 0.288
Validat

In [32]:
# Load the weights of the best checkpoint written during training.
# `map_location=device` remaps the saved tensors onto the current device, so a
# checkpoint saved from a GPU run still loads on a runtime without CUDA.
path = 'saved_weights.pt'
spamClassifier.load_state_dict(torch.load(path, map_location=device))

<All keys matched successfully>

In [33]:
# Get predictions for the test data.
# Switch to eval mode explicitly so dropout is off regardless of which cell ran
# last, and feed the test set in mini-batches of `batch_size` rather than one
# large batch to bound memory use on the device.
spamClassifier.eval()
with torch.no_grad():
    batch_outputs = [
        spamClassifier(seq.to(device), mask.to(device)).cpu()
        for seq, mask in zip(test_seq.split(batch_size), test_mask.split(batch_size))
    ]

# Stack the per-batch outputs into one (number of samples, number of classes) array.
preds = torch.cat(batch_outputs, dim=0).numpy()

In [35]:
# Predicted class = index of the largest log-probability in each row.
predictions = preds.argmax(axis=1)

In [37]:
# Per-class precision, recall and F1 of the test predictions against the true labels.
print(classification_report(test_y, predictions))

              precision    recall  f1-score   support

           0       1.00      0.93      0.96       483
           1       0.70      0.99      0.82        75

    accuracy                           0.94       558
   macro avg       0.85      0.96      0.89       558
weighted avg       0.96      0.94      0.94       558



In [39]:
# Confusion matrix of the test predictions: rows are true labels, columns are
# predicted labels.
print(confusion_matrix(test_y, predictions))

[[451  32]
 [  1  74]]


In [40]:
# Save the entire model object (not just its state_dict) to disk.
# NOTE(review): this pickles the module itself, so loading it back presumably
# needs the BERT_SpamClassifier class definition to be importable — confirm
# before relying on this file outside the notebook.
torch.save(spamClassifier, 'my_spam_classifier_model.pth')