In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [15]:
#Data Preprocessing

df_train = pd.read_csv("/kaggle/input/email-spam-classification/email_spam.csv")
df_train_label = df_train.iloc[:, -1:]

label_encoder = LabelEncoder()
y_train_encoded = df_train_label.apply(lambda col: label_encoder.fit_transform(col) if col.dtypes == 'object' else col)
df_train['label'] = y_train_encoded

df_train['text'] = df_train['title'] + " " + df_train['text']
x_train_data = df_train.text

x_train, x_val, y_train, y_val = train_test_split(
    df_train['text'], df_train['label'], test_size=0.2, random_state=42
)

In [17]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


#Tokenizes the entire dataframe
class EmailDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


# Accuracy calculation function
def calculate_accuracy(preds, labels):
    _, pred_labels = torch.max(preds, dim=1) #First one is the raw data and second is the prediction
    correct_predictions = (pred_labels == labels).float()
    accuracy = correct_predictions.sum() / len(correct_predictions)
    return accuracy



# Tokenizer and model setup
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

#Tokenized data
train_dataset = EmailDataset(x_train.to_numpy(), y_train.to_numpy(), tokenizer, max_len=128)
val_dataset = EmailDataset(x_val.to_numpy(), y_val.to_numpy(), tokenizer, max_len=128)


train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

num_epochs = 5

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    total_accuracy = 0

    for batch in train_dataloader:
        optimizer.zero_grad() #Reset Gradient

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs.loss #Retrieves from loss function and updates weights
        logits = outputs.logits

        # Calculate accuracy
        accuracy = calculate_accuracy(logits, labels)
        total_loss += loss.item()
        total_accuracy += accuracy.item()

        loss.backward() #Back propagation
        optimizer.step() #Update Model Parameter

    avg_loss = total_loss / len(train_dataloader)
    avg_accuracy = total_accuracy / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss}, Accuracy: {avg_accuracy}")
    torch.save(model.state_dict(), 'spam_email_nlp.pth')

/kaggle/input/email-spam-classification/email_spam.csv


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 0.6831975817680359, Accuracy: 0.5666666686534881
Epoch 2/5, Loss: 0.5862083315849305, Accuracy: 0.7083333373069763
Epoch 3/5, Loss: 0.437084949016571, Accuracy: 0.8
Epoch 4/5, Loss: 0.34319655895233153, Accuracy: 0.9
Epoch 5/5, Loss: 0.24039712250232698, Accuracy: 0.9625
