In [1]:
import pandas as pd

In [4]:
# Load the labelled e-mail dataset into a DataFrame.
# NOTE(review): this is an absolute Colab path; adjust it when running elsewhere.
df = pd.read_csv('/content/final_combined.csv')

In [5]:
# Display the DataFrame (pandas elides the middle rows in the rendered output).
df

Unnamed: 0,email,category
0,Congratulations! You've won a $1000 Walmart gi...,spam
1,Join us for a special event this weekend!,social
2,Limited time offer: 50% off your next purchase!,promotional
3,"Hey, just checking in to see how you're doing.",personal
4,Your bank statement is ready for review.,finance
...,...,...
2998,Please confirm your email address to continue ...,important
2999,Your scheduled payment is due in 2 days. Check...,important
3000,Your account requires immediate attention. Ple...,important
3001,Your recent purchase receipt is available. Che...,finance


In [6]:
def remove_subject_prefix(text):
    """Strip a leading ``subject:`` marker (any letter case) from an e-mail.

    Args:
        text: The raw e-mail string. Non-string values (for example a missing
            cell read as NaN) are returned unchanged instead of raising.

    Returns:
        The text after the marker with leading whitespace removed, or the
        original value untouched when no marker is present.
    """
    prefix = 'subject:'

    # Missing or numeric cells have no .lower(); pass them through untouched.
    if not isinstance(text, str):
        return text

    # Tolerate whitespace in front of the marker, since whitespace stripping
    # only happens after this function has been applied.
    candidate = text.lstrip()
    if candidate[:len(prefix)].lower() == prefix:
        return candidate[len(prefix):].lstrip()
    return text

# Remove any leading 'subject:' marker from every e-mail before further cleaning.
df['email'] = df['email'].apply(remove_subject_prefix)

In [7]:
# Trim surrounding whitespace with the vectorised string accessor instead of a
# per-element Python lambda. Missing values stay NaN rather than raising
# AttributeError inside the lambda.
df['email'] = df['email'].str.strip()
df['category'] = df['category'].str.strip()

# Show the distinct labels after cleaning, to spot stray spelling variants.
df['category'].unique()

array(['spam', 'social', 'promotional', 'personal', 'finance',
       'important'], dtype=object)

In [8]:
from sklearn.preprocessing import LabelEncoder

# Learn the category -> integer mapping first, then apply it. The fitted
# encoder stays available as `le` so ids can be mapped back to names later.
# NOTE: the 'category' column is overwritten in place, so re-running this cell
# would fit the encoder on the integer codes and lose the original names.
le = LabelEncoder().fit(df['category'])
df['category'] = le.transform(df['category'])
df.head()

Unnamed: 0,email,category
0,Congratulations! You've won a $1000 Walmart gi...,5
1,Join us for a special event this weekend!,4
2,Limited time offer: 50% off your next purchase!,3
3,"Hey, just checking in to see how you're doing.",2
4,Your bank statement is ready for review.,0


In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the e-mails for validation. The seed is fixed so that the
# split, and therefore every reported accuracy, is reproducible across
# kernel restarts.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['email'].values,
    df['category'].values,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Peek at the training split: the e-mail strings and their matching integer labels.
train_texts, train_labels

(array(["Instagram: Your photo was used in a brand's campaign. See the feature!",
        'Earn $500 a week working from home! Sign up today!',
        'Account security update: password reset required.', ...,
        'Download our new app update.',
        'Can you recommend a good plumber? Our sink is leaking again.',
        'A new bill has arrived in your account. View it now to avoid late fees.'],
       dtype=object),
 array([4, 5, 1, ..., 1, 2, 0]))

In [11]:
from transformers import BertTokenizer

# Reuse BERT's pretrained WordPiece vocabulary for tokenisation.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Both splits share the same settings: cut at 128 tokens and pad every
# sequence to the longest one in its split.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=128)

train_encodings = tokenizer(train_texts.tolist(), **tokenize_kwargs)
val_encodings = tokenizer(val_texts.tolist(), **tokenize_kwargs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [12]:
import torch
from torch import nn
import math

In [13]:
from torch.utils.data import DataLoader, Dataset

class EmailDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a sequence
            indexable by sample position.
        labels: Sequence of labels, one per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one sample: every encoding field plus the label, all as tensors.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


In [14]:
# Wrap the tokenised splits so that PyTorch can index them.
train_dataset = EmailDataset(train_encodings, train_labels)
val_dataset = EmailDataset(val_encodings, val_labels)

# Dataloaders: shuffle only the training data; keep validation order fixed.
BATCH_SIZE = 16
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [15]:
# The SinusoidalPosEmb class generates sinusoidal position embeddings
from torch import nn

class SinusoidalPosEmb(nn.Module):
    """Fixed (non-learned) sinusoidal position embedding.

    Args:
        dim: Width of the embedding returned for each position.
    """

    def __init__(self, dim):
        super(SinusoidalPosEmb, self).__init__()
        self.dim = dim

    def forward(self, x):
        """Embed a 1-D tensor of positions.

        Args:
            x: 1-D tensor of position indices.

        Returns:
            Tensor of shape ``(len(x), dim)``: the sine half followed by the
            cosine half, plus one zero column when ``dim`` is odd.
        """
        device = x.device
        half_dim = self.dim // 2
        # Geometric frequency ladder from 1 down to 1/10000. The max() guards
        # against division by zero when half_dim is 1 (dim of 2 or 3).
        step = math.log(10000) / max(half_dim - 1, 1)
        freqs = torch.exp(torch.arange(half_dim, device=device) * -step)
        angles = x[:, None] * freqs[None, :]
        emb = torch.cat((angles.sin(), angles.cos()), dim=-1)
        # sin/cos only fill 2 * half_dim columns; pad so that the output width
        # always equals `dim` and can be added to `dim`-wide token embeddings.
        if self.dim % 2 == 1:
            emb = nn.functional.pad(emb, (0, 1))
        return emb


In [16]:
# This class defines a single layer of the architecture
class TransformerBlock(nn.Module):
    """One pre-norm self-attention block followed by a residual MLP.

    Args:
        hidden_size: Width of the token representations.
        num_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.
            Defaults to 0.25.
    """

    def __init__(self, hidden_size=128, num_heads=4, dropout=0.25):
        super(TransformerBlock, self).__init__()
        self.norm1 = nn.LayerNorm(hidden_size)
        self.multihead_attn = nn.MultiheadAttention(
            hidden_size, num_heads=num_heads, batch_first=True, dropout=dropout
        )
        self.norm2 = nn.LayerNorm(hidden_size)
        self.mlp = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.ELU(),
            nn.Linear(hidden_size, hidden_size)
        )

    def forward(self, x, key_padding_mask=None):
        """Apply self-attention and the MLP, each with a residual connection.

        Args:
            x: Input tensor, batch-first (``batch_first=True`` is set on the
                attention module).
            key_padding_mask: Optional mask forwarded to the attention module
                to mark key positions that must be ignored. ``None`` attends
                to every position.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Self-attention on the normalised input; [0] drops the attention weights.
        norm_x = self.norm1(x)
        attn_output = self.multihead_attn(norm_x, norm_x, norm_x, key_padding_mask=key_padding_mask)[0]
        x = attn_output + x

        # Position-wise MLP on the normalised result, again with a skip connection.
        norm_x = self.norm2(x)
        mlp_output = self.mlp(norm_x)
        output = mlp_output + x
        return output


In [17]:
class CustomTransformer(nn.Module):
    """Transformer encoder that classifies a token sequence.

    Args:
        num_emb: Size of the token vocabulary.
        output_size: Number of output classes (width of the logits).
        hidden_size: Width of token and position embeddings.
        num_layers: Number of stacked ``TransformerBlock`` layers.
        num_heads: Attention heads per block.
    """

    def __init__(self, num_emb, output_size, hidden_size=128, num_layers=3, num_heads=4):
        super().__init__()
        self.embedding = nn.Embedding(num_emb, hidden_size)
        self.pos_emb = SinusoidalPosEmb(hidden_size)
        layers = []
        for _ in range(num_layers):
            layers.append(TransformerBlock(hidden_size, num_heads))
        self.blocks = nn.ModuleList(layers)
        self.fc_out = nn.Linear(hidden_size, output_size)

    def forward(self, input_ids, attention_mask):
        """Return class logits of shape ``(batch, output_size)``.

        Args:
            input_ids: ``(batch, seq_len)`` tensor of token ids.
            attention_mask: Tensor whose zero entries mark padding positions.
        """
        batch_size, seq_len = input_ids.shape

        # Positions 0..seq_len-1, embedded once and shared across the batch.
        positions = torch.arange(seq_len, device=input_ids.device)
        pos_emb = self.pos_emb(positions).unsqueeze(0).expand(batch_size, seq_len, -1)
        hidden = self.embedding(input_ids) + pos_emb

        # True where the attention mask is 0, i.e. keys that must be ignored.
        padding_mask = attention_mask == 0
        for block in self.blocks:
            hidden = block(hidden, padding_mask)

        # Classify from the representation of the first token of each sequence.
        first_token = hidden[:, 0]
        return self.fc_out(first_token)


In [18]:
# Train on the GPU when one is present, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize the model on the chosen device (Module.to returns the module itself).
model = CustomTransformer(
    num_emb=tokenizer.vocab_size,
    output_size=6,
    hidden_size=128,
    num_layers=3,
    num_heads=4,
).to(device)
model

CustomTransformer(
  (embedding): Embedding(30522, 128)
  (pos_emb): SinusoidalPosEmb()
  (blocks): ModuleList(
    (0-2): 3 x TransformerBlock(
      (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (multihead_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
      )
      (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (mlp): Sequential(
        (0): Linear(in_features=128, out_features=128, bias=True)
        (1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (2): ELU(alpha=1.0)
        (3): Linear(in_features=128, out_features=128, bias=True)
      )
    )
  )
  (fc_out): Linear(in_features=128, out_features=6, bias=True)
)

In [19]:
# `transformers.AdamW` is deprecated and missing from recent transformers
# releases, so use PyTorch's own implementation. eps and weight_decay are
# pinned to the values the transformers optimizer used by default (1e-6 and
# 0.0), which keeps training behaviour in line with the original cell.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
criterion = torch.nn.CrossEntropyLoss()




In [27]:
epochs = 10

for epoch in range(epochs):
    # ---- Training pass: gradients on, parameters updated after every batch ----
    model.train()
    total_loss = 0
    correct_train = 0
    total_train = 0

    for batch in train_loader:
        input_ids, attention_mask, labels = (
            batch[name].to(device) for name in ('input_ids', 'attention_mask', 'labels')
        )

        # Clear stale gradients, then forward / backward / update.
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Running statistics for this epoch.
        total_loss += loss.item()
        predicted_train = torch.max(outputs, 1)[1]
        total_train += labels.size(0)
        correct_train += (predicted_train == labels).sum().item()

    train_accuracy = correct_train / total_train

    print(f'Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}, Training Accuracy: {train_accuracy}')

    # ---- Validation pass: eval mode, no gradient tracking ----
    model.eval()
    correct_val = 0
    total_val = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = (
                batch[name].to(device) for name in ('input_ids', 'attention_mask', 'labels')
            )

            outputs = model(input_ids, attention_mask)
            predicted_val = torch.max(outputs, 1)[1]
            total_val += labels.size(0)
            correct_val += (predicted_val == labels).sum().item()

    val_accuracy = correct_val / total_val

    print(f'Validation Accuracy: {val_accuracy}')


Epoch 1, Loss: 0.3505373816715171, Training Accuracy: 0.8871773522064946
Validation Accuracy: 0.8352745424292846
Epoch 2, Loss: 0.32461523977632556, Training Accuracy: 0.892173189009159
Validation Accuracy: 0.8369384359400999
Epoch 3, Loss: 0.2946728921883943, Training Accuracy: 0.906744379683597
Validation Accuracy: 0.8419301164725458
Epoch 4, Loss: 0.2758365254864615, Training Accuracy: 0.9129891756869276
Validation Accuracy: 0.8535773710482529
Epoch 5, Loss: 0.26612352524322785, Training Accuracy: 0.9125728559533722
Validation Accuracy: 0.8386023294509152
Epoch 6, Loss: 0.24785909103538026, Training Accuracy: 0.9192339716902581
Validation Accuracy: 0.8452579034941764
Epoch 7, Loss: 0.2277473072268513, Training Accuracy: 0.9242298084929226
Validation Accuracy: 0.8469217970049917
Epoch 8, Loss: 0.22075538304541878, Training Accuracy: 0.921315570358035
Validation Accuracy: 0.8502495840266223
Epoch 9, Loss: 0.19978839438408613, Training Accuracy: 0.9333888426311407
Validation Accuracy: 

In [37]:
'''
0 - finance
1 - important
2 - personal
3 - promotional
4 - social
5 - spam
'''

import torch
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer
import numpy as np

# Same pretrained vocabulary that produced the training encodings.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Identity encoder over the six class ids (0-5): inverse_transform hands the
# predicted id straight back.
label_encoder = LabelEncoder().fit(np.arange(6))

# Move to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

def classify_email(subject, text):
    """Predict the class of one e-mail with the trained model.

    Args:
        subject: Subject line of the e-mail.
        text: Body of the e-mail.

    Returns:
        One-element NumPy array holding the predicted label, as produced by
        ``label_encoder.inverse_transform``.
    """
    email_content = subject + ' ' + text
    encoding = tokenizer(email_content, return_tensors='pt', truncation=True, padding=True, max_length=128)

    # Move the inputs to the model's device and drop token_type_ids, which the
    # model's forward() does not accept.
    inputs = {key: val.to(device) for key, val in encoding.items() if key != 'token_type_ids'}

    # Inference must run in eval mode (dropout off) and without autograd
    # bookkeeping. The previous mode is restored afterwards so that calling
    # this between training runs does not silently change the model state.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model(**inputs)
    finally:
        model.train(was_training)

    # Highest logit wins.
    prediction = torch.argmax(output, dim=1)

    return label_encoder.inverse_transform(prediction.cpu().numpy())

# Testing on toy examples: subjects[i] and texts[i] together form one e-mail.
subjects = [
    "Free gift cards",
    "Hello Dear",
    "Your account has been compromised.",
    "Thank you for Participating!",
    "You have 23 notifications about Vidhi and others",
    "Exclusive Discount Just for You!",
    "Congratulations! You've won a lottery!",
    "Urgent: Team Meeting Tomorrow",
]
texts = [
    "You have won a free gift card. Click here to claim!",
    "I am stuck in Africa and I need your help.",
    "Kindly login and reclaim your account.",
    "Thank you for your participation in Goldman Sachs Hackathon event! We were impressed with the creativity and dedication shown by you.",
    "You have 23 unread notifications to review. A lot has happened on Facebook since you last logged in. Here are some notifications you've missed from your friends.",
    "Get 20% off your next purchase! Use code: SAVE20.",
    "Click here to claim your prize of $1,000,000!",
    "Dear Team, please be informed that we have an urgent meeting scheduled for tomorrow at 10 AM to discuss the project deadline. Your attendance is mandatory.",
]

# Classify each (subject, text) pair and unwrap the one-element result to a scalar.
predicted_classes = list(
    map(lambda subject, text: classify_email(subject, text).item(), subjects, texts)
)
print(predicted_classes)


[5, 2, 1, 4, 4, 3, 5, 1]


