## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from tqdm import tqdm
import torch.nn.functional as F
import copy

## Dataset Class and Training Loop

In [2]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Holds parallel collections of texts and labels together with a
    tokenizer callable. Nothing is tokenized up front; each ``__getitem__``
    call encodes a single text, padded/truncated to ``max_len`` tokens.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Keep references only; encoding happens lazily in __getitem__
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        # One sample per stored text
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded sample at ``idx`` as a dict of tensors.

        Keys are ``"input_ids"`` and ``"attention_mask"`` (both flattened to
        1-D) and ``"labels"`` (the label as a ``torch.long`` tensor).
        """
        raw_text = str(self.texts[idx])  # coerce non-string entries to text
        target = self.labels[idx]

        # Ask the tokenizer for fixed-length PyTorch tensors
        encoded = self.tokenizer(
            raw_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # Flatten the tokenizer outputs so the default collate can stack them
        sample = {
            key: encoded[key].flatten() for key in ("input_ids", "attention_mask")
        }
        sample["labels"] = torch.tensor(target, dtype=torch.long)
        return sample

In [5]:
def train_model(
    model,
    train_loader,
    val_loader,
    optimizer,
    scheduler,
    device,
    num_epochs=5,
    patience=2,
):
    """Train ``model`` with early stopping on validation F1.

    Each epoch runs one pass over ``train_loader`` (with gradient clipping
    and one ``scheduler.step()`` per batch), then scores the model on
    ``val_loader`` using ``f1_score`` with its default arguments.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        train_loader: Iterable of batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        val_loader: Iterable of batches with the same keys, used for scoring.
        optimizer: Optimizer over ``model``'s parameters.
        scheduler: Learning-rate scheduler; stepped once per training batch.
        device: Device the input tensors are moved to.
        num_epochs: Maximum number of epochs to run.
        patience: Number of consecutive non-improving epochs after which
            training stops.

    Returns:
        Tuple ``(best_model, best_f1)``. ``best_model`` is a deep copy of the
        model taken at the epoch with the highest validation F1. If no epoch
        runs (``num_epochs <= 0``), the input ``model`` itself is returned
        with an F1 of 0.
    """
    best_f1 = 0
    patience_counter = 0
    best_model = None

    for epoch in range(num_epochs):
        # ---- Training pass ----
        model.train()
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")

        for batch in progress_bar:
            optimizer.zero_grad()  # Reset gradients

            # Move input data and labels to the specified device
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Passing labels makes the model compute the loss itself
            outputs = model(
                input_ids=input_ids, attention_mask=attention_mask, labels=labels
            )
            loss = outputs.loss

            loss.backward()
            # Cap the global gradient norm at 1.0 before the update
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()  # the schedule advances per batch, not per epoch

            # Update progress bar with current loss
            progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

        # ---- Validation pass ----
        model.eval()
        val_preds = []
        val_labels = []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"]  # labels are only compared on the host

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()

                val_preds.extend(preds)
                val_labels.extend(labels.numpy())

        current_f1 = f1_score(val_labels, val_preds)
        print(f"Epoch {epoch+1} - Validation F1: {current_f1:.4f}")

        # Always snapshot the first evaluated epoch. With a strict `>` against
        # the initial 0, a run whose F1 stays at 0.0 would never be captured
        # and the caller would receive None instead of a model.
        if best_model is None or current_f1 > best_f1:
            best_f1 = current_f1
            best_model = copy.deepcopy(model)
            patience_counter = 0
        else:
            patience_counter += 1

        # Early stopping if no improvement for a number of epochs equal to patience
        if patience_counter >= patience:
            print("Early stopping triggered")
            break

    if best_model is None:
        # No epoch ran, so there is no snapshot; hand back the untouched model
        best_model = model

    return best_model, best_f1


## Loading Data

In [6]:
# Seed PyTorch's and NumPy's global random number generators with the same
# fixed value so repeated runs draw the same random numbers.
# (The NumPy call stays last: it returns None, so the cell shows no output.)
torch.manual_seed(seed=42)
np.random.seed(seed=42)

In [7]:
# Read the SMS training and test splits from CSV, decoding both as ISO-8859-1.
# NOTE(review): these are absolute paths on one machine; the notebook will not
# run elsewhere until they point at a shared/relative data directory.
train_df, test_df = (
    pd.read_csv(csv_path, encoding="iso-8859-1")
    for csv_path in (
        r"C:\Users\USER\Projects\NLP\Spam\data\SMS_train.csv",
        r"C:\Users\USER\Projects\NLP\Spam\data\SMS_test.csv",
    )
)

In [8]:
# Preview the first five rows of the training data
train_df.head(n=5)

Unnamed: 0,S. No.,Message_body,Label
0,1,Rofl. Its true to its name,Non-Spam
1,2,The guy did some bitching but I acted like i'd...,Non-Spam
2,3,"Pity, * was in mood for that. So...any other s...",Non-Spam
3,4,Will ü b going to esplanade fr home?,Non-Spam
4,5,This is the 2nd time we have tried 2 contact u...,Spam


In [9]:
# Preview the first five rows of the test data
test_df.head(n=5)

Unnamed: 0,S. No.,Message_body,Label
0,1,"UpgrdCentre Orange customer, you may now claim...",Spam
1,2,"Loan for any purpose £500 - £75,000. Homeowner...",Spam
2,3,Congrats! Nokia 3650 video camera phone is you...,Spam
3,4,URGENT! Your Mobile number has been awarded wi...,Spam
4,5,Someone has contacted our dating service and e...,Spam


In [10]:
# Map the string class labels to the integer ids used as training targets
label_map = {"Non-Spam": 0, "Spam": 1}
train_df["label"] = train_df["Label"].map(label_map)
test_df["label"] = test_df["Label"].map(label_map)

# Series.map yields NaN for any value that is not a key of label_map, which
# would silently turn the target column into floats with missing entries.
# Fail fast on the training labels, since they drive both the stratified
# splits and the loss. (Test labels are not used for training or scoring in
# this notebook, so they are left unchecked.)
unmapped_train_labels = train_df.loc[train_df["label"].isna(), "Label"].unique()
if len(unmapped_train_labels) > 0:
    raise ValueError(
        f"Training labels missing from label_map: {list(unmapped_train_labels)}"
    )

In [11]:
# Check that the new integer `label` column sits next to the original `Label`
train_df.head(n=5)

Unnamed: 0,S. No.,Message_body,Label,label
0,1,Rofl. Its true to its name,Non-Spam,0
1,2,The guy did some bitching but I acted like i'd...,Non-Spam,0
2,3,"Pity, * was in mood for that. So...any other s...",Non-Spam,0
3,4,Will ü b going to esplanade fr home?,Non-Spam,0
4,5,This is the 2nd time we have tried 2 contact u...,Spam,1


## Tokenizer & Device Setup

In [12]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the tokenizer that matches the "bert-base-uncased" checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

## Cross-Validation: Model Initialization & Optimization

In [13]:
# Collects the best validation F1 of each fold
f1_scores = list()
# Five stratified folds, shuffled with a fixed seed so the splits are repeatable
skf = StratifiedKFold(shuffle=True, random_state=42, n_splits=5)

In [14]:
# Cross-validation: fine-tune a fresh BERT classifier on each fold and record
# the best validation F1 it reaches.
cv_num_epochs = 5  # single source for the LR schedule length AND the training loop
f1_scores = []  # reset here so re-running this cell does not append duplicate folds

for fold, (train_idx, val_idx) in enumerate(
    skf.split(train_df["Message_body"], train_df["label"])
):
    print(f"\nFold {fold + 1}")

    # Split the data into training and validation sets based on current fold indices
    train_texts = train_df["Message_body"].iloc[train_idx].values
    train_labels = train_df["label"].iloc[train_idx].values
    val_texts = train_df["Message_body"].iloc[val_idx].values
    val_labels = train_df["label"].iloc[val_idx].values

    # Create dataset objects for training and validation
    train_dataset = TextDataset(train_texts, train_labels, tokenizer)
    val_dataset = TextDataset(val_texts, val_labels, tokenizer)

    # Only the training loader is shuffled; validation order does not matter
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)

    # Start every fold from the same pretrained checkpoint
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=2
    )
    model.to(device)  # Move the model to the specified device

    # train_model steps the scheduler once per batch, so the decay is spread
    # over (batches per epoch) x (epochs) steps.
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    scheduler = torch.optim.lr_scheduler.LinearLR(
        optimizer,
        start_factor=1.0,
        end_factor=0.01,
        total_iters=len(train_loader) * cv_num_epochs,
    )

    # Train the model and obtain the best model and its F1 score
    best_model, fold_f1 = train_model(
        model,
        train_loader,
        val_loader,
        optimizer,
        scheduler,
        device,
        num_epochs=cv_num_epochs,
    )
    f1_scores.append(fold_f1)  # Record the F1 score for the current fold

    # Release this fold's weights. The optimizer (and the scheduler through it)
    # still references the model parameters and its own per-parameter state,
    # so they must be dropped too before the cache can actually be emptied.
    del model, best_model, optimizer, scheduler
    torch.cuda.empty_cache()


Fold 1


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 48/48 [00:20<00:00,  2.40it/s, loss=0.0090]


Epoch 1 - Validation F1: 0.9388


Epoch 2: 100%|██████████| 48/48 [00:20<00:00,  2.29it/s, loss=0.0028]


Epoch 2 - Validation F1: 0.9388


Epoch 3: 100%|██████████| 48/48 [00:20<00:00,  2.34it/s, loss=0.0022]


Epoch 3 - Validation F1: 0.9388
Early stopping triggered

Fold 2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 48/48 [00:21<00:00,  2.24it/s, loss=0.0682]


Epoch 1 - Validation F1: 0.9796


Epoch 2: 100%|██████████| 48/48 [00:22<00:00,  2.11it/s, loss=0.0057]


Epoch 2 - Validation F1: 1.0000


Epoch 3: 100%|██████████| 48/48 [00:33<00:00,  1.44it/s, loss=0.0016]


Epoch 3 - Validation F1: 1.0000


Epoch 4: 100%|██████████| 48/48 [00:21<00:00,  2.28it/s, loss=0.0015]


Epoch 4 - Validation F1: 1.0000
Early stopping triggered

Fold 3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 48/48 [00:23<00:00,  2.06it/s, loss=0.0109]


Epoch 1 - Validation F1: 0.9583


Epoch 2: 100%|██████████| 48/48 [00:23<00:00,  2.07it/s, loss=0.0025]


Epoch 2 - Validation F1: 0.9583


Epoch 3: 100%|██████████| 48/48 [00:29<00:00,  1.61it/s, loss=0.0016]


Epoch 3 - Validation F1: 0.9583
Early stopping triggered

Fold 4


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 48/48 [00:31<00:00,  1.52it/s, loss=0.0089]


Epoch 1 - Validation F1: 0.9388


Epoch 2: 100%|██████████| 48/48 [00:31<00:00,  1.54it/s, loss=0.0026]


Epoch 2 - Validation F1: 0.9565


Epoch 3: 100%|██████████| 48/48 [00:34<00:00,  1.39it/s, loss=0.0017]


Epoch 3 - Validation F1: 0.9583


Epoch 4: 100%|██████████| 48/48 [00:22<00:00,  2.11it/s, loss=0.0010]


Epoch 4 - Validation F1: 0.9583


Epoch 5: 100%|██████████| 48/48 [00:37<00:00,  1.27it/s, loss=0.0010]


Epoch 5 - Validation F1: 0.9583
Early stopping triggered

Fold 5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 48/48 [00:25<00:00,  1.88it/s, loss=0.0608]


Epoch 1 - Validation F1: 0.9565


Epoch 2: 100%|██████████| 48/48 [00:30<00:00,  1.55it/s, loss=0.0020]


Epoch 2 - Validation F1: 0.9565


Epoch 3: 100%|██████████| 48/48 [00:37<00:00,  1.27it/s, loss=0.0013]


Epoch 3 - Validation F1: 0.9583


Epoch 4: 100%|██████████| 48/48 [00:38<00:00,  1.25it/s, loss=0.0009]


Epoch 4 - Validation F1: 0.9583


Epoch 5: 100%|██████████| 48/48 [00:24<00:00,  1.98it/s, loss=0.0009]


Epoch 5 - Validation F1: 0.9583
Early stopping triggered


In [None]:
# Report the per-fold best F1 scores followed by their mean
print(
    f"\nCross-validation F1 Scores: {f1_scores}",
    f"Average F1 Score: {np.mean(f1_scores):.4f}",
    sep="\n",
)



Cross-validation F1 Scores: [0.9387755102040817, 1.0, 0.9583333333333334, 0.9583333333333334, 0.9583333333333334]
Average F1 Score: 0.9628


## Final Model Training

In [16]:
# Wrap the full training frame and the test frame as tokenizing datasets,
# then build a batch-16 loader for each (only the training one is shuffled).
final_dataset = TextDataset(
    texts=train_df["Message_body"].values,
    labels=train_df["label"].values,
    tokenizer=tokenizer,
)
test_dataset = TextDataset(
    texts=test_df["Message_body"].values,
    labels=test_df["label"].values,
    tokenizer=tokenizer,
)
final_loader = DataLoader(final_dataset, shuffle=True, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

In [17]:
# Load a fresh pretrained BERT with a two-class classification head and place
# it on the selected device.
final_model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=2
).to(device)
# Left as the cell's last expression so the notebook renders the architecture
final_model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [18]:

# AdamW at lr 2e-5 for the final model, paired with a linear decay from the
# full learning rate down to 1% of it over 5 passes' worth of final_loader batches.
optimizer = torch.optim.AdamW(params=final_model.parameters(), lr=2e-5)
scheduler = torch.optim.lr_scheduler.LinearLR(
    optimizer,
    total_iters=len(final_loader) * 5,
    start_factor=1.0,
    end_factor=0.01,
)

In [19]:

# Hold out 10% of the training data as a validation split, used by
# train_model for F1-based early stopping and best-snapshot selection.
train_size = int(0.9 * len(final_dataset))
val_size = len(final_dataset) - train_size
train_subset, val_subset = torch.utils.data.random_split(
    final_dataset, [train_size, val_size]
)
train_loader = DataLoader(train_subset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_subset, batch_size=16)

# The scheduler is stepped once per training batch, so its length has to be
# derived from the loader that is actually iterated. The one created earlier
# was sized from final_loader (the full dataset), which has more batches per
# epoch than this 90% subset, so the decay would never reach end_factor.
# Rebuild it here, mirroring how the cross-validation loop sizes its schedule.
scheduler = torch.optim.lr_scheduler.LinearLR(
    optimizer, start_factor=1.0, end_factor=0.01, total_iters=len(train_loader) * 5
)

In [20]:
# Fine-tune the final model on the 90% training subset, early-stopping on the
# held-out 10%; keep the best-F1 snapshot and discard the score itself.
final_model, _ = train_model(
    model=final_model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    scheduler=scheduler,
    device=device,
)

Epoch 1: 100%|██████████| 54/54 [00:24<00:00,  2.23it/s, loss=0.0304]


Epoch 1 - Validation F1: 0.9302


Epoch 2: 100%|██████████| 54/54 [00:22<00:00,  2.37it/s, loss=0.0030]


Epoch 2 - Validation F1: 0.9524


Epoch 3: 100%|██████████| 54/54 [00:28<00:00,  1.88it/s, loss=0.0017]


Epoch 3 - Validation F1: 0.9524


Epoch 4: 100%|██████████| 54/54 [00:24<00:00,  2.21it/s, loss=0.0011]


Epoch 4 - Validation F1: 0.9524
Early stopping triggered


## Final Model Prediction

In [21]:
# Predict a class id for every test message, one batch at a time
final_model.eval()  # evaluation mode (disables dropout)
test_preds = []
with torch.no_grad():  # inference only, so no gradients are tracked
    for batch in test_loader:
        logits = final_model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        ).logits
        # The highest-scoring class per row is the prediction
        test_preds.extend(logits.argmax(dim=1).cpu().numpy())


In [23]:

# Turn predicted class ids back into label strings (1 -> "Spam", anything
# else -> "Non-Spam"), pair them with the test serial numbers, and write the CSV.
# NOTE(review): the output path is absolute and machine-specific.
submission = pd.DataFrame(
    {
        "S. No.": test_df["S. No."],
        "Label": list(
            map(lambda pred: "Spam" if pred == 1 else "Non-Spam", test_preds)
        ),
    }
)
submission.to_csv(
    r"C:\Users\USER\Projects\NLP\Spam\data\submission.csv",
    index=False,
)