In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
import seaborn as sns

import time
from copy import deepcopy

In [None]:
# Load the raw e-mail dataset from the Kaggle input directory into a DataFrame.
df = pd.read_csv(r"/kaggle/input/email-spam-classification/email_spam.csv")

In [None]:
# Display the (rows, columns) dimensions of the loaded frame.
df.shape

In [None]:
# Count how many rows carry each distinct value of the 'type' column.
df['type'].value_counts()

In [None]:
mp = {'spam': 1,
     'not spam': 0}
dp = {1: 'spam',
     0: 'not spam'}

In [None]:
# Encode the string labels as integers. The guard keeps this cell safe to
# re-run: pushing an already-encoded column through `mp` again would turn
# every label into NaN.
if not pd.api.types.is_numeric_dtype(df['type']):
    df['type'] = df['type'].map(mp)

# Build one text field per e-mail from its title and body. Missing titles or
# bodies are treated as empty strings; otherwise NaN + str yields NaN and the
# downstream tokenization receives a float instead of a string.
df['full'] = df['title'].fillna('') + ". " + df['text'].fillna('')

# Keep only the (label, text) columns, in the order later cells unpack them.
dt = df[['type', 'full']]

In [None]:
# Hold out 10% of the rows for evaluation. `dt.values` is an array whose rows
# are [label, text]; the fixed random_state keeps the split reproducible.
train, test = train_test_split(dt.values, random_state=42, test_size=0.1)

In [None]:
# Tokenizer built from torchtext's "basic_english" preset.
tokenizer = get_tokenizer("basic_english")


def yield_tokenizer(x):
    """Yield the token list of every text in `x`.

    `x` is an iterable of (label, text) pairs; the label is ignored.
    """
    yield from (tokenizer(text) for _label, text in x)


# The vocabulary is built from the training split only; "<unk>" is registered
# as a special token and then set as the default index for lookups.
vocab = build_vocab_from_iterator(
    yield_tokenizer(train),
    specials=["<unk>"],
)
vocab.set_default_index(vocab['<unk>'])

In [None]:
def text_pipeline(x):
    """Tokenize the string `x` and look the resulting tokens up in `vocab`."""
    return vocab(tokenizer(x))

In [None]:
def collate_batch(batch):
    """Collate (label, text) samples into flat tensors plus per-sample offsets.

    Returns a tuple ``(tokens, labels, offsets)``:
      * tokens  - 1-D int64 tensor: the token ids of every text, concatenated;
      * labels  - 1-D int64 tensor with one label per sample;
      * offsets - 1-D tensor with the start index of each sample in ``tokens``.
    """
    labels, encoded = [], []
    for label, text in batch:
        encoded.append(torch.tensor(text_pipeline(text), dtype=torch.int64))
        labels.append(label)

    # A sample starts where all preceding samples end: prepend 0, drop the
    # last length, and take the running total.
    lengths = [0] + [ids.size(0) for ids in encoded]
    offsets = torch.tensor(lengths[:-1]).cumsum(dim=0)
    labels = torch.tensor(labels, dtype=torch.int64)
    return torch.cat(encoded), labels, offsets

In [None]:
BATCH = 4  # samples per mini-batch for both DataLoaders
EPOCHS = 20  # number of passes over the training split
NUM_CLASSES = 2  # size of the classifier output (spam / not spam)
VOCAB_SIZE = len(vocab)  # number of entries in the vocabulary
EMSIZE = 64  # embedding dimension passed to the model

In [None]:
# Both loaders use collate_batch to turn raw [label, text] rows into tensors;
# only the training loader shuffles.
# NOTE(review): the model uses BatchNorm1d, which rejects a single-sample batch
# in training mode. If len(train) % BATCH == 1 the last training batch would
# fail - confirm the split size, or consider drop_last=True on train_dl.
train_dl = DataLoader(train, batch_size=BATCH, shuffle=True, collate_fn=collate_batch)
test_dl = DataLoader(test, batch_size=BATCH, shuffle=False, collate_fn=collate_batch)

In [None]:
class Block(torch.nn.Module):
    """Fully connected building block: Linear -> Dropout -> BatchNorm1d -> ReLU.

    Args:
        _in: number of input features.
        _out: number of output features.

    The ReLU is applied functionally, so the module's parameters and
    state_dict keys (layer1.*, layer3.*) are unchanged. Without a
    non-linearity, stacking several Blocks collapses to a single affine map
    at inference time (Dropout is the identity and BatchNorm1d is affine in
    eval mode), so extra depth would add no expressive power.
    """

    def __init__(self, _in, _out):
        super().__init__()
        self.layer1 = torch.nn.Linear(_in, _out)
        self.layer2 = torch.nn.Dropout(p=0.4)
        self.layer3 = torch.nn.BatchNorm1d(_out)

    def forward(self, x):
        """Apply the block to `x` and return the activated features."""
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        return torch.relu(x)

class SpamClassifier(torch.nn.Module):
    """Bag-of-embeddings text classifier.

    Each text is reduced to one `emsize`-dimensional vector by an
    EmbeddingBag, passed through a stack of Blocks, and projected onto
    `num_classes` logits by a final linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        emsize: embedding dimension.
        num_classes: number of output logits.
    """

    def __init__(self, vocab_size, emsize, num_classes):
        super().__init__()
        self.embed = torch.nn.EmbeddingBag(vocab_size, emsize, sparse=False)
        self.layers = torch.nn.Sequential(
            Block(emsize, 128),
            Block(128, 256),
            Block(256, 256),
            Block(256, 128),
            Block(128, 64),
        )
        self.fc = torch.nn.Linear(64, num_classes)

    def forward(self, x, offset):
        """Return one row of `num_classes` logits per bag.

        Args:
            x: 1-D tensor of token ids for all texts, concatenated.
            offset: 1-D tensor with the start index of each text inside `x`.
        """
        x = self.embed(x, offset)
        x = self.layers(x)
        # Project the 64 hidden features onto the class logits. Returning the
        # hidden features directly would make the model emit 64 "classes", so
        # argmax could produce ids that are not valid labels.
        return self.fc(x)

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SpamClassifier(VOCAB_SIZE, EMSIZE, NUM_CLASSES)
model = model.to(device)
# Cross-entropy over the model outputs against the integer class labels.
criterion = torch.nn.CrossEntropyLoss()
# NOTE(review): lr=0.1 is two orders of magnitude above Adam's default of
# 1e-3 - confirm this is intentional.
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

In [None]:
# Train for EPOCHS epochs and keep a copy of the weights that score best on
# the held-out loader.
# NOTE(review): test_dl doubles as the model-selection set here, so best_acc
# is an optimistic estimate of generalization.
best_model = deepcopy(model)
best_acc = 0
train_history = []
val_history = []
start = time.time()

for i in range(1, EPOCHS+1):
    start1 = time.time()

    # ---- training pass ----
    train_loss = 0
    train_total = 0
    model.train()
    for text, label, offset in train_dl:
        # Move the batch to wherever the model lives.
        text, label, offset = text.to(device), label.to(device), offset.to(device)

        optimizer.zero_grad()
        out = model(text, offset)
        loss = criterion(out, label)
        loss.backward()
        optimizer.step()

        # criterion returns the batch *mean*; weight it by the batch size so
        # that train_loss / train_total is the per-sample mean for the epoch.
        train_loss += loss.item() * out.size(0)
        train_total += out.size(0)

    train_con = train_loss/train_total

    # ---- evaluation pass ----
    model.eval()
    total_acc = 0
    acc_total = 0
    # No gradients are needed for evaluation; skip building the autograd graph.
    with torch.no_grad():
        for text, label, offset in test_dl:
            text, label, offset = text.to(device), label.to(device), offset.to(device)
            out = model(text, offset)
            total_acc += (out.argmax(1) == label).sum().item()
            acc_total += out.size(0)
    acc_con = total_acc/acc_total

    # Snapshot the model whenever held-out accuracy improves.
    if acc_con > best_acc:
        best_model = deepcopy(model)
        best_acc = acc_con

    train_history += [train_con]
    val_history += [acc_con]

    end1 = time.time()
    print("Epoch {} || train loss: {} || accuracy: {} || time: {}".format(i,
                                                                         train_con,
                                                                         acc_con, end1-start1))
end = time.time()
print("Total time: {}".format(end-start))

In [None]:
# Display the highest held-out accuracy reached during training.
best_acc

In [None]:
# 1-based epoch numbers, used as the x-axis of the history plots.
epochs = list(range(1, EPOCHS+1))

In [None]:
# One panel per metric: training loss on the left, accuracy on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
loss_ax, acc_ax = axes

loss_ax.plot(epochs, train_history)
loss_ax.set_title("Train loss progression")

acc_ax.plot(epochs, val_history)
acc_ax.set_title("Accuracy history")

plt.show()

In [None]:
def predict(x):
    """Classify one raw text with `best_model` and return the predicted class id.

    Args:
        x: the text to classify, as a string.

    Returns:
        The index of the largest model output, as a Python int.
    """
    # Follow the model's own device instead of assuming CUDA availability, so
    # the same code path works on CPU and GPU.
    target_device = next(best_model.parameters()).device

    # Explicit int64: an empty token list would otherwise become a float tensor.
    text = torch.tensor(text_pipeline(x), dtype=torch.int64, device=target_device)
    # The embedding layer needs the offsets as a tensor (a plain int is
    # rejected); a single text is one bag starting at position 0.
    offset = torch.tensor([0], dtype=torch.int64, device=target_device)

    # Eval mode disables dropout and makes BatchNorm1d use its running
    # statistics, which is required for a batch of one sample.
    best_model.eval()
    with torch.no_grad():
        out = best_model(text, offset)
    return out.argmax(1).item()

In [None]:
# Show the text of the first held-out sample (the last column holds the text).
test[0, -1]

In [None]:
# Classify the first held-out sample and translate the class id back to its name.
dp[predict(test[0, -1])]