<font size=7>**Binary Text Classification**</font>

<font size=6>**Splitting and Prepping data**</font>

In [1]:
import torchtext

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter

In [3]:
data = pd.read_csv(r'/media/abx05/DATA/datasets/nlpwithpytorch/datasets/ham-spam/spam.csv',encoding='latin1')
data.head(7)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,


In [4]:
data = data.drop(data.iloc[:,2:5].columns, axis=1)

In [5]:
data.head(7)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...


In [6]:
data = data.rename(index=str,columns={'v1':'labels', 'v2':'text'})
data.head(7)

Unnamed: 0,labels,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...


In [7]:
train,test = train_test_split(data,test_size=0.2,random_state=37)
Counter(train['labels']),Counter(test['labels'])

(Counter({'ham': 3860, 'spam': 597}), Counter({'ham': 965, 'spam': 150}))

In [8]:
train.reset_index(drop=True,inplace=True),test.reset_index(drop=True,inplace=True)
train.head(7)

Unnamed: 0,labels,text
0,ham,Aight well keep me informed
1,ham,Hi. Happy New Year. I dont mean to intrude but...
2,ham,Do u konw waht is rael FRIENDSHIP Im gving yuo...
3,ham,Sorry i din lock my keypad.
4,spam,Not heard from U4 a while. Call 4 rude chat pr...
5,ham,Short But Cute : \ Be a good person
6,spam,+123 Congratulations - in this week's competit...


In [9]:
train.shape,test.shape

((4457, 2), (1115, 2))

In [10]:
#save these files to be used again in the future

<font size=6>**Preprocessing**</font>

In [11]:
import numpy as np

In [12]:
import torch
import torchtext
from torchtext.legacy.data import Field,BucketIterator,TabularDataset,LabelField
from torchtext.data.utils import get_tokenizer

In [13]:
import nltk
from nltk import word_tokenize

In [14]:
TEXT = Field(tokenize=word_tokenize)

In [15]:
LABEL = LabelField(dtype=torch.float)

In [16]:
datafields = [('labels',LABEL),('text',TEXT)]

In [17]:
train,test = TabularDataset.splits(path = '/media/abx05/DATA/datasets/nlpwithpytorch/datasets/ham-spam',
                                                 train = 'train.csv',
                                                 test = 'test.csv',
                                                 format='csv',
                                                 skip_header=True,
                                                 fields = datafields)

In [18]:
train[:5]

[<torchtext.legacy.data.example.Example at 0x7f0a980c19a0>,
 <torchtext.legacy.data.example.Example at 0x7f0a94df6c70>,
 <torchtext.legacy.data.example.Example at 0x7f0a94df6c40>,
 <torchtext.legacy.data.example.Example at 0x7f0a992b3c10>,
 <torchtext.legacy.data.example.Example at 0x7f0a980c1df0>]

In [19]:
len(train),len(test)

(4457, 1115)

In [20]:
train[4].text

['PRIVATE',
 '!',
 'Your',
 '2003',
 'Account',
 'Statement',
 'for',
 '07973788240',
 'shows',
 '800',
 'un-redeemed',
 'S.',
 'I.',
 'M.',
 'points',
 '.',
 'Call',
 '08715203649',
 'Identifier',
 'Code',
 ':',
 '40533',
 'Expires',
 '31/10/04']

In [21]:
train[4].labels

'spam'

In [22]:
TEXT.build_vocab(train,max_size=10500)

In [23]:
LABEL.build_vocab(test)

In [24]:
len(TEXT.vocab),len(LABEL.vocab)

(10207, 2)

In [25]:
TEXT.vocab.freqs.most_common(50)

[('.', 3862),
 ('to', 1750),
 ('I', 1574),
 (',', 1468),
 ('you', 1462),
 ('?', 1256),
 ('!', 1134),
 ('a', 1068),
 ('the', 946),
 ('...', 923),
 ('&', 772),
 ('i', 760),
 ('and', 673),
 ('in', 663),
 ('is', 647),
 (';', 641),
 ('u', 636),
 ('me', 600),
 (':', 570),
 ('..', 544),
 ('for', 527),
 ('my', 494),
 ('of', 471),
 ('it', 470),
 ('your', 461),
 ('have', 395),
 ('on', 394),
 (')', 393),
 ('2', 390),
 ('that', 385),
 ("'s", 384),
 ('now', 321),
 ("'m", 320),
 ('are', 316),
 ('do', 312),
 ('call', 307),
 ('at', 301),
 ('U', 300),
 ('or', 298),
 ('not', 295),
 ("n't", 281),
 ('be', 275),
 ('*', 270),
 ('lt', 267),
 ('gt', 267),
 ('with', 267),
 ('get', 265),
 ('will', 264),
 ('so', 257),
 ('#', 245)]

In [26]:
LABEL.vocab.stoi

defaultdict(None, {'ham': 0, 'spam': 1})

In [27]:
bs = 64

train_iterator,test_iterator = BucketIterator.splits(
    (train,test),
    batch_size = bs,
    sort_key = lambda x: len(x.text),
    sort_within_batch = False)

<font size=6>**RNN Model**</font>

In [54]:
import torch.nn as nn
import torch.optim as optim

In [138]:
class RNN(nn.Module):
    def __init__(self,input_size,embed_size,hidden_size,output_size):
        super(RNN,self).__init__()
        self.embedding = nn.Embedding(input_size,embed_size)
        self.rnn = nn.LSTM(embed_size,hidden_size)
        self.fc = nn.Linear(hidden_size,output_size)
        self.dropout = nn.Dropout(0.25)
        
    def forward(self,input_text):
        embedded = self.embedding(input_text)
        embedded_dropout = self.dropout(embedded)
        output,(hidden,_) = self.rnn(embedded_dropout)
        hidden = hidden.squeeze(0)
        assert torch.equal(output[-1,:,:],hidden)
        return self.fc(hidden)
        

In [139]:
input_size = len(TEXT.vocab)
embed_size = 100
hidden_size = 256
output_size = 1

In [140]:
model = RNN(input_size,embed_size,hidden_size,output_size)

In [151]:
optimizer = optim.Adam(model.parameters(),lr=1e-6)

In [152]:
criterion = nn.BCEWithLogitsLoss()

In [153]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
                
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.labels)
        
        rounded_preds = torch.round(torch.sigmoid(predictions))
        correct = (rounded_preds == batch.labels).float() 
        
        acc = correct.sum() / len(correct)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [178]:
epochs = 6

for epoch in range(epochs):
    train_loss,train_acc = train(model,train_iterator,optimizer,loss)
    print(epoch,train_loss,train_acc*100)


0 0.40441112262862067 85.33645468098777
1 0.4045392717633929 85.33645468098777
2 0.40229017181055887 85.36149825368608
3 0.40388022852795463 85.5123039654323
4 0.40205465384892053 85.56674650737217
5 0.3999584395970617 85.58743468352726


In [179]:
epoch_loss = 0
epoch_acc = 0

In [180]:
model.eval()

RNN(
  (embedding): Embedding(10207, 100)
  (rnn): LSTM(100, 256)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
)

In [181]:
with torch.no_grad():
    for batch in test_iterator:
        predictions = model(batch.text).squeeze(1)
        loss_value = loss(predictions,batch.labels)
        rounded_preds = torch.round(torch.sigmoid(predictions)).float()
        correct_preds = (rounded_preds==batch.labels)
        acc = correct_preds.sum() / len(correct_preds)
        
        epoch_loss += loss_value.item()
        epoch_acc += acc.item()
        
test_loss = epoch_loss / len(test_iterator)
test_acc  = epoch_acc  / len(test_iterator)

test_loss,test_acc

(0.5939175056086646, 0.7010673880577087)