<a href="https://colab.research.google.com/github/aakashpaul-2/NLP/blob/main/CNN_NLP_Pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
''' 
This project uses CNN and PYTORCH to create a ham/spam detector using NLP Embeddings

'''

In [None]:
# imports
import torchtext.data as ttd
import torch
import torch.nn as nn 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

import torch.nn.functional as F

In [None]:
# import dataset
!wget -nc https://lazyprogrammer.me/course_files/spam.csv

--2020-05-25 14:49:07--  https://lazyprogrammer.me/course_files/spam.csv
Resolving lazyprogrammer.me (lazyprogrammer.me)... 104.31.80.48, 104.31.81.48, 2606:4700:3037::681f:5030, ...
Connecting to lazyprogrammer.me (lazyprogrammer.me)|104.31.80.48|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 503663 (492K) [text/csv]
Saving to: ‘spam.csv’


2020-05-25 14:49:08 (34.1 MB/s) - ‘spam.csv’ saved [503663/503663]



In [None]:
# Load the downloaded spam.csv into a pandas DataFrame.
# NOTE(review): the explicit ISO-8859-1 encoding is presumably needed because
# the file is not valid UTF-8 — confirm against the raw file.
df = pd.read_csv("spam.csv", encoding = "ISO-8859-1")

In [None]:
# view df
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
# Drop the three unnamed filler columns; only v1 (label) and v2 (message text)
# remain. The columns are renamed in a later cell.
# errors="ignore" keeps this cell safe to re-run: once the columns are gone a
# second execution is a no-op instead of raising KeyError.
df = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")

In [None]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Rename by explicit mapping instead of assigning df.columns positionally.
# A positional assignment raises a length-mismatch error if the cell is re-run
# after more columns (e.g. b_labels) have been added; the mapping is idempotent
# and does not depend on column order.
df = df.rename(columns={"v1": "labels", "v2": "data"})

In [None]:
# Map the string labels to binary integer targets: ham -> 0, spam -> 1.
# Series.map leaves any label that is not a key of the dict as NaN.
df["b_labels"] = df["labels"].map({"ham":0, "spam":1})

In [None]:
# Keep only the message text and the numeric target, in the same column order
# as the (name, Field) pairs passed to TabularDataset further down.
df2 = df[["data", "b_labels"]]

In [None]:
df2.head()

Unnamed: 0,data,b_labels
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
# Write the two-column frame to spam2.csv without the index column; this file
# is what torchtext's TabularDataset reads below.
df2.to_csv("spam2.csv", index= False)

In [None]:
# Create the torchtext Field objects describing how each CSV column is processed.
# TEXT - the input messages: treated as token sequences, lower-cased, tokenised
# with spaCy, batch dimension first (batches are N x T tensors of token
# indices), and padded at the front of each sequence.
TEXT = ttd.Field(sequential=True, lower=True, batch_first=True,pad_first=True,tokenize="spacy")

# LABEL - the target column: one value per example (not a sequence); the values
# are already numeric, so no vocabulary is built for them.
LABEL = ttd.Field(sequential=False, use_vocab=False, is_target=True)

In [None]:
# Load the CSV as a torchtext TabularDataset. The header row is skipped and the
# columns are bound, in file order, to the TEXT field ("data") and the LABEL
# field ("b_labels"), which tokenise/convert them.
dataset = ttd.TabularDataset(path="spam2.csv",format="csv",skip_header=True,fields=[("data",TEXT),("b_labels",LABEL)])

In [None]:
# Split the dataset into train and test parts. No arguments are passed, so
# torchtext's default split ratio and random state apply.
# NOTE(review): the split is not seeded here, so the exact partition (and the
# numbers reported below) may differ between runs — confirm if that matters.
train_dataset, test_dataset = dataset.split()

In [None]:
# Build the vocabulary (token -> unique integer index) from the training split
# only; the test split is not used to build it.
TEXT.build_vocab(train_dataset)

In [None]:
# assigning vocab object
vocab = TEXT.vocab

In [None]:
# visualise the unique integer assigned to each token
vocab.stoi

defaultdict(<function torchtext.vocab._default_unk_index>,
            {'<unk>': 0,
             '<pad>': 1,
             '.': 2,
             'i': 3,
             'to': 4,
             'you': 5,
             ',': 6,
             'a': 7,
             '?': 8,
             'the': 9,
             '!': 10,
             '...': 11,
             'u': 12,
             'and': 13,
             'in': 14,
             'is': 15,
             'me': 16,
             'my': 17,
             'it': 18,
             'for': 19,
             'do': 20,
             '..': 21,
             'your': 22,
             'of': 23,
             'have': 24,
             'that': 25,
             'call': 26,
             'on': 27,
             'are': 28,
             '&': 29,
             "'s": 30,
             '2': 31,
             'now': 32,
             'not': 33,
             'but': 34,
             'so': 35,
             ' ': 36,
             'we': 37,
             'can': 38,
             'or': 39,
             'at'

In [None]:
# Pick the compute device: the first CUDA GPU when one is available, else the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")
print(device)


cuda:0


In [None]:
# Mini-batch iterators for batch gradient descent: batch size 32 for training
# and 256 for testing, with batches created directly on `device`.
# sort_key gives torchtext the token count of each example.
# NOTE(review): presumably this is used to group examples of similar length so
# less padding is needed per batch — confirm against the torchtext Iterator docs.
train_iter, test_iter = ttd.Iterator.splits((train_dataset, test_dataset), sort_key=lambda x: len(x.data), batch_sizes=(32,256), device = device)

In [None]:
# Sanity-check the shape of one training batch: inputs are (batch, seq_len)
# token indices, targets are (batch,) labels.
for inputs, targets in train_iter:
  print("inputs:", inputs, "shape", inputs.shape)
  print("targets:", targets, "shape", targets.shape)
  # One batch is enough to check shapes; printing every batch of the epoch
  # only floods the notebook output.
  break

inputs: tensor([[   1,    1,    1,  ...,   67,   19,  218],
        [   1,    1,    1,  ...,  312,   10,   85],
        [   1,    1,    1,  ...,    9,  263,    2],
        ...,
        [   1,    1,    1,  ..., 1413,  177,   78],
        [   1,    1,    1,  ...,   24,  482,    2],
        [   1,    1,    1,  ...,  550,  971,    8]], device='cuda:0') shape torch.Size([32, 46])
targets: tensor([0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0') shape torch.Size([32])
inputs: tensor([[   1,    1,    1,  ..., 1068,  181,    2],
        [   1,    1,    1,  ...,    3,  112,  246],
        [   1,    1,    1,  ..., 3477,   39,  206],
        ...,
        [   1,    1,    1,  ..., 6265,   61,   21],
        [   1,    1,    1,  ...,    2,   12,    8],
        [   1,    1,    1,  ...,   14, 3040,   66]], device='cuda:0') shape torch.Size([32, 35])
targets: tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,

In [None]:
# Sanity-check the shape of one test batch: inputs are (batch, seq_len) token
# indices, targets are (batch,) labels.
for inputs, targets in test_iter:
  print("inputs:", inputs, "shape", inputs.shape)
  print("targets:", targets, "shape", targets.shape)
  # One batch is enough to check shapes; printing every batch only floods the
  # notebook output.
  break

inputs: tensor([[  13,  134,  134,  ..., 1711, 1711, 1109],
        [1697,   18,  222,  ...,  136, 1484,   10],
        [ 852,    6,   49,  ...,   14,    7,  879],
        ...,
        [   1,    1,    1,  ...,    1,    1,    0],
        [   1,    1,    1,  ...,    1,    1,   70],
        [   1,    1,    1,  ...,    1,    1,    0]], device='cuda:0') shape torch.Size([256, 7])
targets: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
# CNN model using embeddings
class CNN(nn.Module):
  """1-D convolutional text classifier over learned token embeddings.

  Token indices are embedded, passed through three Conv1d + ReLU stages (the
  first two followed by max pooling), reduced with a global max over the time
  axis, and projected to `n_outputs` values by a linear layer.

  Args:
    n_vocab: number of rows in the embedding table (vocabulary size).
    embed_dim: size of each embedding vector.
    n_outputs: number of values produced per example.
  """

  def __init__(self, n_vocab, embed_dim, n_outputs):
    super().__init__()
    self.V, self.D, self.K = n_vocab, embed_dim, n_outputs

    # Lookup table: (N, T) indices -> (N, T, D) vectors.
    self.embed = nn.Embedding(num_embeddings=self.V, embedding_dim=self.D)

    # Convolution stack; channels grow D -> 32 -> 64 -> 128.
    self.conv1 = nn.Conv1d(self.D, 32, 3, padding=2)
    self.pool1 = nn.MaxPool1d(kernel_size=2)
    self.conv2 = nn.Conv1d(32, 64, 3, padding=1)
    self.pool2 = nn.MaxPool1d(kernel_size=2)
    self.conv3 = nn.Conv1d(64, 128, 3, padding=1)

    # Final dense layer on the 128 pooled channel features.
    self.fc = nn.Linear(in_features=128, out_features=self.K)

  def forward(self, X):
    """Map a (N, T) tensor of token indices to (N, n_outputs) outputs."""
    # Embedding yields (N, T, D); Conv1d expects channels first, i.e. (N, D, T).
    features = self.embed(X).transpose(1, 2)

    features = self.pool1(F.relu(self.conv1(features)))
    features = self.pool2(F.relu(self.conv2(features)))
    features = F.relu(self.conv3(features))

    # Global max pooling over the time axis: (N, 128, T') -> (N, 128).
    pooled, _ = features.max(dim=2)

    return self.fc(pooled)



In [None]:
# Build the classifier (vocabulary-sized embedding table, 20-dimensional
# embeddings, a single output logit) and place it on the selected device.
model = CNN(n_vocab=len(vocab), embed_dim=20, n_outputs=1).to(device)

# Adam with its default hyperparameters; binary cross-entropy applied to the
# raw logit.
optimizer = torch.optim.Adam(params=model.parameters())
criterion = nn.BCEWithLogitsLoss()

In [None]:
def batch_gd(model, criterion, optimizer, train_iter, test_iter, epochs):
  """Train `model` with mini-batch gradient descent, evaluating after each epoch.

  Args:
    model: an nn.Module mapping a batch of inputs to outputs of shape (N, 1).
    criterion: loss callable taking (outputs, targets).
    optimizer: optimizer over the parameters of `model`.
    train_iter: iterable of (inputs, targets) batches used for training.
    test_iter: iterable of (inputs, targets) batches used for evaluation.
    epochs: number of passes over `train_iter`.

  Returns:
    (train_losses, test_losses): two numpy arrays of length `epochs` holding
    the mean per-batch loss of every epoch.

  The model is switched to training mode for each training pass and to
  evaluation mode for each test pass, so it is left in evaluation mode when
  the function returns.
  """
  train_losses = np.zeros(epochs)
  test_losses = np.zeros(epochs)

  for it in range(epochs):
    t0 = datetime.now()

    # --- training pass ---
    model.train()
    train_loss = []
    for inputs, targets in train_iter:
      # the loss expects float targets with the same (N, 1) shape as the outputs
      targets = targets.view(-1,1).float()
      # zero the parameter gradients
      optimizer.zero_grad()

      # Forward pass
      outputs = model(inputs)
      loss = criterion(outputs, targets)

      # Backward and optimize
      loss.backward()
      optimizer.step()

      train_loss.append(loss.item())

    # Mean of the per-batch losses. A little misleading: each batch loss was
    # computed while the weights were still changing, so this is not the loss
    # of the end-of-epoch model.
    train_loss = np.mean(train_loss)

    # --- evaluation pass ---
    # No gradients are needed here; no_grad avoids building the autograd graph
    # for every test batch (less memory, less work).
    model.eval()
    test_loss = []
    with torch.no_grad():
      for inputs, targets in test_iter:
        targets = targets.view(-1,1).float()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        test_loss.append(loss.item())
    test_loss = np.mean(test_loss)

    # Save losses
    train_losses[it] = train_loss
    test_losses[it] = test_loss

    dt = datetime.now() - t0
    print(f'Epoch {it+1}/{epochs}, Train Loss: {train_loss:.4f}, \
      Test Loss: {test_loss:.4f}, Duration: {dt}')

  return train_losses, test_losses

In [None]:
# Train for 20 epochs and keep the per-epoch loss curves.
train_losses, test_losses = batch_gd(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    train_iter=train_iter,
    test_iter=test_iter,
    epochs=20,
)

Epoch 1/20, Train Loss: 0.3771,       Test Loss: 0.3283, Duration: 0:00:00.959788
Epoch 2/20, Train Loss: 0.1842,       Test Loss: 0.1904, Duration: 0:00:00.714372
Epoch 3/20, Train Loss: 0.1119,       Test Loss: 0.1638, Duration: 0:00:00.729812
Epoch 4/20, Train Loss: 0.0757,       Test Loss: 0.1394, Duration: 0:00:00.749060
Epoch 5/20, Train Loss: 0.0551,       Test Loss: 0.1365, Duration: 0:00:00.722101
Epoch 6/20, Train Loss: 0.0425,       Test Loss: 0.1290, Duration: 0:00:00.728474
Epoch 7/20, Train Loss: 0.0310,       Test Loss: 0.1668, Duration: 0:00:00.738058
Epoch 8/20, Train Loss: 0.0238,       Test Loss: 0.1512, Duration: 0:00:00.747331
Epoch 9/20, Train Loss: 0.0172,       Test Loss: 0.1355, Duration: 0:00:00.734309
Epoch 10/20, Train Loss: 0.0128,       Test Loss: 0.1421, Duration: 0:00:00.724011
Epoch 11/20, Train Loss: 0.0094,       Test Loss: 0.1394, Duration: 0:00:00.732833
Epoch 12/20, Train Loss: 0.0084,       Test Loss: 0.1583, Duration: 0:00:00.727390
Epoch 13/20, 

In [None]:
# accuracy

def compute_accuracy(model, data_iter):
  """Return the fraction of examples in `data_iter` that `model` classifies correctly.

  Args:
    model: callable mapping a batch of inputs to logits of shape (N, 1).
    data_iter: iterable of (inputs, targets) batches.

  Returns:
    Number of correct predictions divided by the number of examples (float).

  Raises:
    ZeroDivisionError: if `data_iter` yields no batches.
  """
  n_correct = 0.
  n_total = 0.

  # Pure inference: skip autograd bookkeeping.
  with torch.no_grad():
    for inputs, targets in data_iter:
      # match the (N, 1) shape of the model output
      targets = targets.view(-1,1).float()

      outputs = model(inputs)

      # a logit above 0 corresponds to a sigmoid probability above 0.5
      predictions = outputs > 0

      # update counters
      n_correct += (predictions == targets).sum().item()
      n_total += targets.shape[0]

  return n_correct/n_total


train_acc = compute_accuracy(model, train_iter)
test_acc = compute_accuracy(model, test_iter)

In [None]:
# Report final train / test accuracy.
# NOTE(review): the saved run shows perfect train accuracy against ~96.6% test
# accuracy, and the logged test loss stops improving after about epoch 6, which
# suggests some overfitting. Accuracy alone may also flatter the result if ham
# dominates the data — confirm the class balance.
print(f"Train acc: {train_acc:.4f}, Test acc: {test_acc:.4f}")

Train acc: 1.0000, Test acc: 0.9659
