In [1]:
import os
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import math
from torch.utils.data import Dataset, DataLoader
from torch import nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, precision_recall_curve

In [2]:
os.chdir('/Users/bachirzerroug/Documents/transformers-implementation')

In [3]:
from src.local_transformers import Encoder
from src.utils import EarlyStopping, CustomDataset, PreTrainedTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Import data
emails_raw = pd.read_csv('src/data/spam.csv')

In [5]:
emails_raw

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
# Look at some examples
print(emails_raw['Message'].iloc[12])
print(emails_raw['Message'].iloc[0])
print(emails_raw['Message'].iloc[55])
print(emails_raw['Message'].iloc[700])

URGENT! You have won a 1 week FREE membership in our £100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18
Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
Do you know what Mallika Sherawat did yesterday? Find out now @  &lt;URL&gt;
K..u also dont msg or reply to his msg..


In [7]:
# Get some basic stats
print(f'shape of the dataset: {emails_raw.shape}')

# Number of classes
nb_classes = emails_raw['Category'].nunique()
print(f'Number unique classes {nb_classes}')

# Proportion of classes
proportions_classes = emails_raw.groupby('Category').count()/len(emails_raw)
print(f'Proportions of classes: \n {proportions_classes}')
# Number of distinct emails
print(f'Number of distinct emails: {emails_raw.Message.nunique()}')


shape of the dataset: (5572, 2)
Number unique classes 2
Proportions of classes: 
            Message
Category          
ham       0.865937
spam      0.134063
Number of distinct emails: 5157


In [8]:
message_count = emails_raw.groupby('Message').count()
message_count.columns = ['nb_messages']
message_count = message_count[message_count['nb_messages']>1]

In [9]:
message_count.sort_values(by='nb_messages')

Unnamed: 0_level_0,nb_messages
Message,Unnamed: 1_level_1
"Mila, age23, blonde, new in UK. I look sex with UK guys. if u like fun with me. Text MTALK to 69866.18 . 30pp/txt 1st 5free. £1.50 increments. Help08718728876",2
Ok lor.,2
Ok thanx...,2
Ok then i will come to ur home after half an hour,2
PRIVATE! Your 2004 Account Statement for 07742676969 shows 786 unredeemed Bonus Points. To claim call 08719180248 Identifier Code: 45239 Expires,2
...,...
"Say this slowly.? GOD,I LOVE YOU &amp; I NEED YOU,CLEAN MY HEART WITH YOUR BLOOD.Send this to Ten special people &amp; u c miracle tomorrow, do it,pls,pls do it...",4
"7 wonders in My WORLD 7th You 6th Ur style 5th Ur smile 4th Ur Personality 3rd Ur Nature 2nd Ur SMS and 1st ""Ur Lovely Friendship""... good morning dear",4
Ok...,10
I cant pick the phone right now. Pls send a message,12


We see that repeated emails are
1. emails with basic answers like 'OK'
2. emails we can send automatically when someone call us "Sorry, I'll call later"
3. emails considered as chain "Send this to Ten special people"
4. spams received by multiple accounts  
...  

Let's check if they belong to the same class at least

In [10]:
def check_all_msg_same_class(message):
    list_similar_emails = emails_raw[emails_raw['Message'] == message]['Category']
    class_email = list_similar_emails.iloc[0]
    return list_similar_emails.tolist() == [class_email]*len(list_similar_emails)

In [11]:
for message in message_count.index:
    if not check_all_msg_same_class(message):
        print('There are similar emails that have at least 2 different classes')

Distinct messages have the same class in the dataset.  
Which means that there is not incoherence in the dataset information carried by these emails.  
The decision concerning these emails depend on how the dataset is built.
1. If the dataset is a sample of real emails received by a representative group of persons, then we can keep them. Indeed by keeping them we give them more weight which is logical as they appear multiple times.
2. If these emails are selected, it doesn't make sense to give more weight to specific emails, except if there is a idea behind.  

As we don't have information, we suppose that these emails are a sample of real emails

# Transform dataframe to a pytorch Dataclass

## First transform the category into a binary category with 0 and 1

In [12]:
emails_raw['category_binary'] = [0 if msg == 'ham' else 1 for msg in emails_raw['Category']]
emails_raw['category_binary']

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: category_binary, Length: 5572, dtype: int64

## Train Val Test Split

In [13]:
X_train_val, X_test, y_train_val, y_test = train_test_split(emails_raw['Message'], emails_raw['category_binary'], test_size=0.2)

X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2)

## Tokenize messages

In [14]:
max_length = max([len(message) for message in X_train])
tokenizer = PreTrainedTokenizer(tokenizer_type="autotokenizer_best_best_cased",
                                max_length=max_length,
                                truncation=True,
                                padding="max_length"
                                )

X_train_tokenized = tokenizer.tokenize_messages(X_train.to_list())
X_val_tokenized = tokenizer.tokenize_messages(X_val.to_list())
X_test_tokenized = tokenizer.tokenize_messages(X_test.to_list())

vocab_size = tokenizer.tokenizer.vocab_size

## Create dataset

In [86]:
dataset_train = CustomDataset(X_train_tokenized, torch.tensor(y_train.values, dtype=float))
dataset_val = CustomDataset(X_val_tokenized, torch.tensor(y_val.values, dtype=float))
dataset_test = CustomDataset(X_test_tokenized, torch.tensor(y_test.values, dtype=float))

## Create model

In [57]:
class SpamClassifier(nn.Module):
    def __init__(self,
                 embedding_dim,
                 latent_dim,
                 vocab_size, 
                 num_heads, 
                 num_layers, 
                 factor):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.latent_dim = latent_dim
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.factor = factor

        self.encoder = Encoder(vocab_size=self.vocab_size, 
                          embedding_dim=self.embedding_dim, 
                          num_heads=self.num_heads, 
                          num_layers=self.num_layers, 
                          factor=self.factor)

        self.linear = nn.Sequential(
            nn.Linear(self.embedding_dim, self.latent_dim),
            nn.ReLU(),
            nn.Linear(self.latent_dim, 1),
            nn.Sigmoid()
        )

        #self.simoid = nn.Sigmoid()

    def forward(self, x):
        x = self.encoder(x)
        x = torch.mean(x, dim=1)
        #x = self.linear(x)
        #return self.sigmoid(x)
        return self.linear(x)



In [58]:
# Instanciate basic paramters
learning_rate = 0.001
model = SpamClassifier(embedding_dim=256,
                       latent_dim=128,
                       vocab_size=vocab_size, 
                       num_heads=4, 
                       num_layers=3, 
                       factor=2
                    )

loss_fn = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
nb_epochs = 10
batch_size = 64

In [59]:
train_loader = DataLoader(dataset_train, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(dataset_val, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)

In [76]:
pred = torch.tensor([[0.5191],
        [0.5207],
        [0.5169],
        [0.5206],
        [0.5192],
        [0.5165],
        [0.5198],
        [0.5207],
        [0.5218],
        [0.5227],
        [0.5184]])

In [84]:
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        pred = torch.tensor([int(elem[0]) for elem in pred])
        loss = loss_fn(pred, y)

        # Backpropagation
        # Backpropagate the prediction loss with a call to loss.backward(). 
        # PyTorch deposits the gradients of the loss w.r.t. each parameter.
        loss.backward()
        # Once we have our gradients, we call optimizer.step() to adjust the parameters 
        # by the gradients collected in the backward pass.
        optimizer.step()
        # Call optimizer.zero_grad() to reset the gradients of model parameters. 
        # Gradients by default add up; to prevent double-counting, we explicitly zero them at each iteration.
        optimizer.zero_grad()

        if batch % 100 == 0:
            print(loss)

In [85]:
train_loop(train_loader, model, loss_fn, optimizer)

RuntimeError: "binary_cross_entropy" not implemented for 'Long'

In [80]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

In [None]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    list_prauc = []
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

src.local_transformers.transcoders.encoder.Encoder

In [75]:
encoder = Encoder(embedding_dim=256,
                        vocab_size=vocab_size, 
                        num_heads=4, 
                        num_layers=3, 
                        factor=2)

linear = nn.Sequential(
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )


In [77]:
X = torch.randint(low=0, high=100, size=(10, 100))
X = encoder(X)
X = torch.mean(X, dim=1)
X = linear(X)
#X = nn.Sigmoid()(X)

In [78]:
X.shape

torch.Size([10, 1])

In [79]:
X

tensor([[0.4911],
        [0.5101],
        [0.5109],
        [0.5013],
        [0.4948],
        [0.5063],
        [0.5029],
        [0.4885],
        [0.5022],
        [0.4862]], grad_fn=<SigmoidBackward0>)