In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/multiclass-email/final_combined.csv
/kaggle/input/multiclass-email/email_data_processed.csv
/kaggle/input/multiclass-email/finalTemp.csv


In [2]:
import pandas as pd

# df1 = pd.read_csv("/kaggle/input/multiclass-email/email_data_processed.csv")
# df2 = pd.read_csv("/kaggle/input/multiclass-email/finalTemp.csv")

In [3]:
# df1

In [4]:
# df2

In [5]:
# df = pd.concat([df1, df2]).reset_index(drop = True)

In [6]:
df = pd.read_csv("/kaggle/input/multiclass-email/final_combined.csv")
df = df.drop_duplicates(subset=["email"]).reset_index(drop=True)
df

Unnamed: 0,email,category
0,Congratulations! You've won a $1000 Walmart gi...,spam
1,Join us for a special event this weekend!,social
2,Limited time offer: 50% off your next purchase!,promotional
3,"Hey, just checking in to see how you're doing.",personal
4,Your bank statement is ready for review.,finance
...,...,...
2998,Please confirm your email address to continue ...,important
2999,Your scheduled payment is due in 2 days. Check...,important
3000,Your account requires immediate attention. Ple...,important
3001,Your recent purchase receipt is available. Che...,finance


In [7]:
def remove_subject_prefix(text):
    # Check if the text starts with "Subject:" (case-insensitive)
    if text.lower().startswith("subject:"):
        # Remove the prefix and strip leading whitespace
        return text[8:].lstrip()  # 8 is the length of "Subject:"
    return text

df["email"] = df["email"].apply(remove_subject_prefix) 

In [8]:
df["email"] = df["email"].apply(lambda x: x.strip())
df['category'] = df["category"].apply(lambda x: x.strip())

df["category"].unique()

array(['spam', 'social', 'promotional', 'personal', 'finance',
       'important'], dtype=object)

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Combine 'Subject' and 'Text' columns
# df['content'] = df['Subject'] + ' ' + df['Text']

# Replace missing data with empty string
df['email'].fillna('', inplace=True)

# Convert labels to numerical values
label_encoder = LabelEncoder()
df['category'] = label_encoder.fit_transform(df['category'])

df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['email'].fillna('', inplace=True)


Unnamed: 0,email,category
0,Congratulations! You've won a $1000 Walmart gi...,5
1,Join us for a special event this weekend!,4
2,Limited time offer: 50% off your next purchase!,3
3,"Hey, just checking in to see how you're doing.",2
4,Your bank statement is ready for review.,0


In [10]:
train_texts, val_texts, train_labels, val_labels = train_test_split(df['email'].values, df['category'].values, test_size=0.2)
     

In [11]:
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the texts
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=128)
     

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [12]:
import torch

class EmailDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = EmailDataset(train_encodings, train_labels)
val_dataset = EmailDataset(val_encodings, val_labels)

In [13]:
import os
os.environ["WANDB_DISABLED"] = "true"

from sklearn.metrics import accuracy_score
from transformers import BertForSequenceClassification, EarlyStoppingCallback, Trainer, TrainingArguments
import numpy as np

# Load the BERT model for classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=50,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    report_to=[],
    push_to_hub=False
)

# Define a function to compute accuracy
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    return {"accuracy": accuracy}

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,  # Pass the compute_metrics function
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

# Fine-tune the model
trainer.train()

# Evaluate the model to get the accuracy on the validation set
eval_results = trainer.evaluate()
print(f"Validation accuracy: {eval_results['eval_accuracy']}")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Accuracy
1,0.483,0.437371,0.860233
2,0.2858,0.383282,0.878536
3,0.1482,0.516834,0.890183
4,0.222,0.647099,0.860233
5,0.0898,0.76444,0.868552
6,0.0027,0.793338,0.876872
7,0.0473,0.780334,0.888519


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Validation accuracy: 0.8785357737104825


In [14]:
# Evaluate the model on the training set to get training accuracy
train_results = trainer.evaluate(eval_dataset=train_dataset)
print(f"Training accuracy: {train_results['eval_accuracy']}")

# Evaluate the model on the validation set
val_results = trainer.evaluate()
print(f"Validation accuracy: {val_results['eval_accuracy']}")

# Evaluate the model on the test set
test_results = trainer.evaluate(eval_dataset=val_dataset)
print(f"Test accuracy: {test_results['eval_accuracy']}")

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Training accuracy: 0.9671107410491258


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Validation accuracy: 0.8785357737104825


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Test accuracy: 0.8785357737104825


In [15]:
import torch
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

def classify_email(subject, text):
    email_content = subject + ' ' + text
    encoding = tokenizer(email_content, return_tensors='pt', truncation=True, padding=True, max_length=128)

    # Move the input tensors to the same device as the model
    encoding = {key: val.to(device) for key, val in encoding.items()}

    # Forward pass to get the output logits
    output = model(**encoding)

    # Get the predicted class
    prediction = torch.argmax(output.logits, dim=1)

    return label_encoder.inverse_transform(prediction.detach().cpu().numpy())

subjects: list[str] = ["Free gift cards", "Hello Dear", "Your account has been compromised."]
texts: list[str] = ["You have won a free gift card. Click here to claim!", "I am stuck in Africa and I need your help.", "Kindly login and reclaim your account."]
predicted_classes = [classify_email(subject, text).item() for subject, text in zip(subjects, texts)]
print(predicted_classes)

['spam', 'personal', 'important']
