#### In this demo we will build a machine learning model to classify sms texts as ham or spam

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#### SMS Spam Collection Dataset
Source: https://www.kaggle.com/uciml/sms-spam-collection-dataset


The files contain one message per line. Each line is composed by two columns: v1 contains the label (ham or spam) and v2 contains the raw text.

In [58]:
data = pd.read_csv('datasets/spam.csv', encoding='latin-1')
data = data.drop(columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1)
data = data.rename(index = str, columns = {'v1': 'labels', 'v2': 'text'})
data.head()

Unnamed: 0,labels,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


#### Cleaning Data

In [4]:


data.head()

Unnamed: 0,labels,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
train, test = train_test_split(data, test_size = 0.2, random_state = 42)

In [6]:
train.reset_index(drop=True), test.reset_index(drop=True)

(     labels                                               text
 0       ham  No I'm in the same boat. Still here at my moms...
 1      spam  (Bank of Granite issues Strong-Buy) EXPLOSIVE ...
 2       ham     They r giving a second chance to rahul dengra.
 3       ham     O i played smash bros  &lt;#&gt;  religiously.
 4      spam  PRIVATE! Your 2003 Account Statement for 07973...
 ...     ...                                                ...
 4452    ham  I came hostel. I m going to sleep. Plz call me...
 4453    ham                             Sorry, I'll call later
 4454    ham      Prabha..i'm soryda..realy..frm heart i'm sory
 4455    ham                         Nt joking seriously i told
 4456    ham                In work now. Going have in few min.
 
 [4457 rows x 2 columns],
      labels                                               text
 0       ham  Funny fact Nobody teaches volcanoes 2 erupt, t...
 1       ham  I sent my scores to sophas and i had to do sec...
 2      spam

In [7]:
train.head()

Unnamed: 0,labels,text
1978,ham,No I'm in the same boat. Still here at my moms...
3989,spam,(Bank of Granite issues Strong-Buy) EXPLOSIVE ...
3935,ham,They r giving a second chance to rahul dengra.
4078,ham,O i played smash bros &lt;#&gt; religiously.
4086,spam,PRIVATE! Your 2003 Account Statement for 07973...


In [8]:
test.head()

Unnamed: 0,labels,text
3245,ham,"Funny fact Nobody teaches volcanoes 2 erupt, t..."
944,ham,I sent my scores to sophas and i had to do sec...
1044,spam,We know someone who you know that fancies you....
2484,ham,Only if you promise your getting out as SOON a...
812,spam,Congratulations ur awarded either å£500 of CD ...


In [9]:
train.shape, test.shape

((4457, 2), (1115, 2))

Saving Train and test data in csv files

In [10]:
train.to_csv('datasets/train.csv', index=False)
test.to_csv('datasets/test.csv', index=False)

In [11]:
!ls datasets

spam.csv  test.csv  train.csv


In [19]:
import numpy as np

import torch
import torchtext

#from torchtext.data import Field, BucketIterator, TabularDataset
from torchtext.legacy.data import Field, TabularDataset, BucketIterator, TabularDataset, LabelField

#### NLTK provides a function called word_tokenize() for splitting strings into tokens (nominally words). It splits tokens based on white space and punctuation.

In [15]:
import nltk
nltk.download('punkt')

from nltk import word_tokenize

[nltk_data] Downloading package punkt to /home/azureuser/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


#### The parameters of a Field specify how the data should be processed.We use the TEXT field to define how the text should be processed, and the LABEL field to process the labels.

In [17]:
TEXT = Field(tokenize = word_tokenize)

<torchtext.legacy.data.field.Field at 0x7f5f147e4748>

In [20]:
LABEL = LabelField(dtype = torch.float)

In [21]:
datafields = [("labels", LABEL), ("text", TEXT)]

#### the following code splits data into the canonical train/test splits as torchtext.datasets objects. It process the data using the Fields we have previously defined.

In [22]:
trn, tst = TabularDataset.splits(path = './datasets', 
                                                train = 'train.csv',
                                                test = 'test.csv' ,    
                                                format = 'csv',
                                                skip_header = True,
                                                fields = datafields)

In [23]:
trn[:5]

[<torchtext.legacy.data.example.Example at 0x7f5f147e4a20>,
 <torchtext.legacy.data.example.Example at 0x7f5f147e4b00>,
 <torchtext.legacy.data.example.Example at 0x7f5f147e4c18>,
 <torchtext.legacy.data.example.Example at 0x7f5f1465ab00>,
 <torchtext.legacy.data.example.Example at 0x7f5f1465acf8>]

#### We can see how many examples are in each split by checking their length.

In [24]:
print(f'Number of training examples: {len(trn)}')
print(f'Number of testing examples: {len(tst)}')

Number of training examples: 4457
Number of testing examples: 1115


In [25]:
trn[5].__dict__.keys()

dict_keys(['labels', 'text'])

In [26]:
trn[5].text

['G', 'says', 'you', 'never', 'answer', 'your', 'texts', ',', 'confirm/deny']

In [27]:
trn[5].labels

'ham'

#### We can also check an example.

In [28]:
print(vars(trn.examples[5]))

{'labels': 'ham', 'text': ['G', 'says', 'you', 'never', 'answer', 'your', 'texts', ',', 'confirm/deny']}



#### Next, we have to build a vocabulary. This is a effectively a look up table where every unique word in your data set has a corresponding index (an integer). Each index is used to construct a one-hot vector for each word.
There are two ways effectively cut down our vocabulary, we can either only take the top $n$ most common words or ignore words that appear less than $m$ times. We'll do the former, only keeping the top 10,500 words.
The words that appear in examples but we have cut from the are replaced  with a special unknown  token.

In [29]:
TEXT.build_vocab(trn, max_size = 10500)

In [30]:
LABEL.build_vocab(trn)

The vocab size is 10502 because, one of the addition tokens is the unk token and the other is a pad token.

In [31]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 10207
Unique tokens in LABEL vocabulary: 2


#### We can also view the most common words in the vocabulary and their frequencies.

In [32]:
print(TEXT.vocab.freqs.most_common(50))

[('.', 3862), ('to', 1750), ('I', 1574), (',', 1468), ('you', 1462), ('?', 1256), ('!', 1134), ('a', 1068), ('the', 946), ('...', 923), ('&', 772), ('i', 760), ('and', 673), ('in', 663), ('is', 647), (';', 641), ('u', 636), ('me', 600), (':', 570), ('..', 544), ('for', 527), ('my', 494), ('of', 471), ('it', 470), ('your', 461), ('have', 395), ('on', 394), (')', 393), ('2', 390), ('that', 385), ("'s", 384), ('now', 321), ("'m", 320), ('are', 316), ('do', 312), ('call', 307), ('at', 301), ('U', 300), ('or', 298), ('not', 295), ("n't", 281), ('be', 275), ('*', 270), ('lt', 267), ('gt', 267), ('with', 267), ('get', 265), ('will', 264), ('so', 257), ('#', 245)]


#### We can also see the vocabulary directly using either the stoi (string to int) or itos (int to string) method.

In [33]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', '.', 'to', 'I', ',', 'you', '?', '!', 'a']


In [34]:
print(LABEL.vocab.stoi)

defaultdict(None, {'ham': 0, 'spam': 1})


Now, we will create iterators that will iterate over these in the training/evaluation loop, and they return a batch of examples (indexed and converted into tensors) at each iteration.
#### We'll use a BucketIterator which is a special type of iterator that will return a batch of examples where each example is of a similar length, minimizing the amount of padding per example.

In [36]:
batch_size = 64

train_iterator, test_iterator = BucketIterator.splits(
   (trn, tst),
    batch_size = batch_size,
    sort_key = lambda x: len(x.text), 
    sort_within_batch = False)

#### Build The Model

- <b>The embedding layer</b> is used to transform our sparse one-hot vector (sparse as most of the elements are 0) into a dense embedding vector
- The RNN layer is our RNN which takes in our dense vector and the previous hidden state $h_{t-1}$, which it uses to calculate the next hidden state, $h_t$
- Finally, the linear layer takes the final hidden state and feeds it through a fully connected layer, $f(h_T)$, transforming it to the correct output dimension.


The RNN returns 2 tensors, output of size [sentence length, batch size, hidden dim] and hidden of size [1, batch size, hidden dim]. output is the concatenation of the hidden state from every time step, whereas hidden is simply the final hidden state. We verify this using the assert statement. Note the squeeze method, which is used to remove a dimension of size 1. Finally, we feed the last hidden state, hidden, through the linear layer, fc, to produce a prediction.

In [37]:
import torch.nn as nn

In [38]:
class RNN(nn.Module):
    
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
  
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
    
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):
        
        embedded = self.embedding(text)
        
        output, hidden = self.rnn(embedded)
        
        hidden_1D = hidden.squeeze(0)
        
        assert torch.equal(output[-1, :, :], hidden_1D)
        
        return self.fc(hidden_1D)

In [39]:
class RNN(nn.Module):
    
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
  
        self.rnn = nn.LSTM(embedding_dim, hidden_dim)
        
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):
        
        embedded = self.embedding(text)
        
        output, (hidden, _) = self.rnn(embedded)
        
        hidden_1D = hidden.squeeze(0)
        
        assert torch.equal(output[-1, :, :], hidden_1D)
        
        return self.fc(hidden_1D)

In [40]:
class RNN(nn.Module):
    
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
  
        self.rnn = nn.LSTM(embedding_dim, hidden_dim)
        
        self.fc = nn.Linear(hidden_dim, output_dim)
        
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, text):
        
        embedded = self.embedding(text)
        
        embedded_dropout = self.dropout(embedded)
        
        output, (hidden, _) = self.rnn(embedded_dropout)
        
        hidden_1D = hidden.squeeze(0)
        
        assert torch.equal(output[-1, :, :], hidden_1D)
        
        return self.fc(hidden_1D)

#### We now create an instance of our RNN class.

- The input dimension is the dimension of the one-hot vectors, which is equal to the vocabulary size.
- The embedding dimension is the size of the dense word vectors.
- The hidden dimension is the size of the hidden states
- The output dimension is usually the number of classes, however in the case of only 2 classes the output value is between 0 and 1 and thus can be 1-dimensional, i.e. a single scalar real number.

In [41]:
input_dim = len(TEXT.vocab)

embedding_dim = 100

hidden_dim = 256

output_dim = 1

In [42]:
model = RNN(input_dim, embedding_dim, hidden_dim, output_dim)

In [43]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr = 1e-6)

#### we will use BCEWithLogitsLoss loss as our loss function - Creates a criterion that measures the Binary Cross Entropy between the target and the output
This loss combines a Sigmoid layer and the BCELoss in one single class.

In [44]:
criterion = nn.BCEWithLogitsLoss()

#### Training
- For each batch, we first zero the gradients. Each parameter in a model has a grad attribute which stores the gradient calculated by the criterion.
- We then feed the batch of sentences, batch.text, into the model
- The loss and accuracy are then calculated using our predictions and the labels, batch.labels, with the loss being averaged over all examples in the batch.
- We calculate the gradient of each parameter and then update the parameters using the gradients and optimizer algorithm
- Finally, we return the loss and accuracy

##### Calculating Accuracy 
We first feeds the predictions through a sigmoid layer, squashing the values between 0 and 1, we then round them to the nearest integer. This rounds any value greater than 0.5 to 1 (spam) and the rest to 0 (ham).

We then calculate how many rounded predictions equal the actual labels and average it across the batch.

In [45]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
                
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.labels)
        
        rounded_preds = torch.round(torch.sigmoid(predictions))
        correct = (rounded_preds == batch.labels).float() 
        
        acc = correct.sum() / len(correct)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

#### the loss is decreasing with each epoch and we get a final accuracy of ~85%

In [46]:
num_epochs = 5

for epoch in range(num_epochs):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% ')

| Epoch: 01 | Train Loss: 0.677 | Train Acc: 79.30% 
| Epoch: 02 | Train Loss: 0.663 | Train Acc: 85.17% 
| Epoch: 03 | Train Loss: 0.650 | Train Acc: 85.74% 
| Epoch: 04 | Train Loss: 0.638 | Train Acc: 85.67% 
| Epoch: 05 | Train Loss: 0.626 | Train Acc: 85.98% 


evaluate is similar to train, with a few modifications as you don't want to update the parameters when evaluating.

In [47]:
epoch_loss = 0
epoch_acc = 0

In [48]:
model.eval()

RNN(
  (embedding): Embedding(10207, 100)
  (rnn): LSTM(100, 256)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)

In [56]:
batch.text

tensor([[ 389,  119,  179,  ..., 1980,  362,  140],
        [   0,   71,  578,  ...,  649,   19,  220],
        [   2,   57,   28,  ...,   24,   10,  252],
        ...,
        [   1,    1,    1,  ...,    1,    1,   10],
        [   1,    1,    1,  ...,    1,    1,  627],
        [   1,    1,    1,  ...,    1,    1,    2]])

In [49]:
with torch.no_grad():

    for batch in test_iterator:

        predictions = model(batch.text).squeeze(1)

        loss = criterion(predictions, batch.labels)

        rounded_preds = torch.round(torch.sigmoid(predictions))
        
        correct = (rounded_preds == batch.labels).float() 
        acc = correct.sum() / len(correct)

        epoch_loss += loss.item()
        epoch_acc += acc.item()

test_loss = epoch_loss / len(test_iterator)
test_acc  = epoch_acc / len(test_iterator)

print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')

| Test Loss: 0.662 | Test Acc: 75.32% |


In [51]:
output_dir = 'model'
torch.save(model, os.path.join(output_dir, 'model.pt'))
#torch.save(model.state_dict(), PATH)

In [55]:
model = torch.load('./model/model.pt')
model.eval()

RNN(
  (embedding): Embedding(10207, 100)
  (rnn): LSTM(100, 256)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)

In [None]:
#input_data = torch.tensor(json.loads(input_data)['data'])

# get prediction
with torch.no_grad():
    output = model(input_data)
    classes = ['chicken', 'turkey']
    softmax = nn.Softmax(dim=1)
    pred_probs = softmax(output).numpy()[0]
    index = torch.argmax(output, 1)

    result = {"label": classes[index], "probability": str(pred_probs[index])}
    return result