In [1]:
#!pip install torchtext

In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
pd.set_option('max_colwidth', 400)
df = pd.read_csv('datasets/ham-spam/spam.csv', encoding='latin-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives around here though",,,


In [4]:
df = df.drop(columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1)

In [5]:
df = df.rename(index = str, columns = {'v1': 'labels', 'v2': 'text'})
df.head()

Unnamed: 0,labels,text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [6]:
df.count()

labels    5572
text      5572
dtype: int64

In [7]:
train, test = train_test_split(df, test_size = 0.2, random_state = 42)
train.reset_index(drop=True), test.reset_index(drop=True)

(     labels  \
 0       ham   
 1      spam   
 2       ham   
 3       ham   
 4      spam   
 ...     ...   
 4452    ham   
 4453    ham   
 4454    ham   
 4455    ham   
 4456    ham   
 
                                                                                                                                                       text  
 0                                                                      No I'm in the same boat. Still here at my moms. Check me out on yo. I'm half naked.  
 1                (Bank of Granite issues Strong-Buy) EXPLOSIVE PICK FOR OUR MEMBERS *****UP OVER 300% *********** Nasdaq Symbol CDGT That is a $5.00 per..  
 2                                                                                                           They r giving a second chance to rahul dengra.  
 3                                                                                                           O i played smash bros  &lt;#&gt;  religiously.  
 4     PRIVATE! 

In [8]:
train.shape, test.shape

((4457, 2), (1115, 2))

In [9]:
train.to_csv('datasets/ham-spam/train.csv', index=False)
test.to_csv('datasets/ham-spam/test.csv', index=False)

In [10]:
import numpy as np

import torch
import torchtext
from torchtext.legacy.data import Field, BucketIterator, TabularDataset

import nltk
# nltk.download('punkt')

from nltk import word_tokenize

In [11]:
text = torchtext.legacy.data.Field(tokenize = word_tokenize)
label = torchtext.legacy.data.LabelField(dtype = torch.float)
datafields = [("labels", label), ("text", text)]

In [12]:
trn, tst = torchtext.legacy.data.TabularDataset.splits(path = './datasets/ham-spam', train = 'train.csv',
test = 'test.csv', format = 'csv', skip_header = True, fields = datafields)

In [13]:
trn[5].__dict__.keys()

dict_keys(['labels', 'text'])

In [14]:
trn[5].text

['G', 'says', 'you', 'never', 'answer', 'your', 'texts', ',', 'confirm/deny']

In [15]:
trn[4].labels

'spam'

In [16]:
text.build_vocab(trn, max_size = 10500)
label.build_vocab(trn)

In [17]:
print(f"Unique tokens in TEXT vocabulary: {len(text.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(label.vocab)}")

Unique tokens in TEXT vocabulary: 10207
Unique tokens in LABEL vocabulary: 2


In [18]:
print(text.vocab.freqs.most_common(50))

[('.', 3862), ('to', 1750), ('I', 1574), (',', 1468), ('you', 1462), ('?', 1256), ('!', 1134), ('a', 1068), ('the', 946), ('...', 923), ('&', 772), ('i', 760), ('and', 673), ('in', 663), ('is', 647), (';', 641), ('u', 636), ('me', 600), (':', 570), ('..', 544), ('for', 527), ('my', 494), ('of', 471), ('it', 470), ('your', 461), ('have', 395), ('on', 394), (')', 393), ('2', 390), ('that', 385), ("'s", 384), ('now', 321), ("'m", 320), ('are', 316), ('do', 312), ('call', 307), ('at', 301), ('U', 300), ('or', 298), ('not', 295), ("n't", 281), ('be', 275), ('*', 270), ('lt', 267), ('gt', 267), ('with', 267), ('get', 265), ('will', 264), ('so', 257), ('#', 245)]


In [19]:
print(text.vocab.itos[:10])

['<unk>', '<pad>', '.', 'to', 'I', ',', 'you', '?', '!', 'a']


In [20]:
batch_size = 64
train_iterator, test_iterator = torchtext.legacy.data.BucketIterator.splits((trn, tst), batch_size = batch_size,
                                sort_key = lambda x: len(x.text), sort_within_batch = False)

In [21]:
import torch.nn as nn

In [22]:
class RNN(nn.Module):
    
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):
        embedded = self.embedding(text)
        output, hidden = self.rnn(embedded)
        hidden_1D = hidden.squeeze(0)
        assert torch.equal(output[-1, :, :], hidden_1D)
        return self.fc(hidden_1D)

In [23]:
class RNN(nn.Module):
    
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):
        embedded = self.embedding(text)
        output, hidden = self.rnn(embedded)
        hidden_1D = hidden.squeeze(0)
        assert torch.equal(output[-1, :, :], hidden_1D)
        return self.fc(hidden_1D)

In [24]:
class RNN(nn.Module):
    
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):
        embedded = self.embedding(text)
        output, hidden = self.rnn(embedded)
        hidden_1D = hidden.squeeze(0)
        assert torch.equal(output[-1, :, :], hidden_1D)
        return self.fc(hidden_1D)

In [25]:
input_dim = len(text.vocab)
embedding_dim = 100
hidden_dim = 256
output_dim = 1

In [26]:
model = RNN(input_dim, embedding_dim, hidden_dim, output_dim)

In [27]:
import torch.optim as optim
optimizer = optim.Adam(model.parameters(), lr = 1e-6)

In [28]:
criterion = nn.BCEWithLogitsLoss()

In [29]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
                
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.labels)
        
        rounded_preds = torch.round(torch.sigmoid(predictions))
        correct = (rounded_preds == batch.labels).float() 
        
        acc = correct.sum() / len(correct)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [30]:
num_epochs = 5

for epoch in range(num_epochs):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% ')

| Epoch: 01 | Train Loss: 0.630 | Train Acc: 86.10% 
| Epoch: 02 | Train Loss: 0.604 | Train Acc: 86.10% 
| Epoch: 03 | Train Loss: 0.582 | Train Acc: 86.04% 
| Epoch: 04 | Train Loss: 0.561 | Train Acc: 86.10% 
| Epoch: 05 | Train Loss: 0.542 | Train Acc: 86.14% 


In [31]:
epoch_loss = 0
epoch_acc = 0

In [32]:
model.eval()

RNN(
  (embedding): Embedding(10207, 100)
  (rnn): RNN(100, 256)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)

In [33]:
with torch.no_grad():
    for batch in test_iterator:
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.labels)

        rounded_preds = torch.round(torch.sigmoid(predictions))
        correct = (rounded_preds == batch.labels).float() 
        acc = correct.sum() / len(correct)

        epoch_loss += loss.item()
        epoch_acc += acc.item()

test_loss = epoch_loss / len(test_iterator)
test_acc  = epoch_acc / len(test_iterator)

print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')

| Test Loss: 0.602 | Test Acc: 74.74% |
