# Natural Language Processing Assignment: Spam Filter

## 1) Import necessary libs and datasets

In [1]:
import numpy as np
import pandas as pd
import urllib.request

urllib.request.urlretrieve("https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv", filename="spam.csv")
data = pd.read_csv('spam.csv', encoding='latin1')

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

In [2]:
del data['Unnamed: 2']
del data['Unnamed: 3']
del data['Unnamed: 4']

data['v1'] = data['v1'].replace(['ham','spam'],[0,1])
data['text'] = data['v2']
data['isSpam'] = data['v1']

del data['v1'], data['v2']

print(f'Data Shape: {data.shape}')
# imbalanced data
print(data['isSpam'].value_counts())
data.head()

Data Shape: (5572, 2)
0    4825
1     747
Name: isSpam, dtype: int64


Unnamed: 0,text,isSpam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


## 2) train, test split
### 평가에 사용할 예정이니 트레인, 테스트 스플릿 코드는 그대로 유지시켜주세요

In [3]:
from sklearn.model_selection import train_test_split

X, y = data['text'], data['isSpam']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0,
                                                   stratify=y, test_size=0.1)

print(len(X_train), len(X_test))

5014 558


In [4]:
import string
import re
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob
nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## 3-1) Preprocessing

ref: https://towardsdatascience.com/text-preprocessing-for-data-scientist-3d2419c8199d

In [5]:
idx = [i for i,item in enumerate(X_train) if len(item.split())<3]
X_train = [x for i,x in enumerate(X_train) if i not in idx]
y_train = [x for i,x in enumerate(y_train) if i not in idx]

In [6]:
idx2 = [i for i,item in enumerate(X_test) if len(item.split())<3]
X_test = [x for i,x in enumerate(X_test) if i not in idx2]
y_test = [x for i,x in enumerate(y_test) if i not in idx2]

In [7]:
p = re.compile(r'(.)\1{3,}', re.IGNORECASE)

def preprocess(word: str) -> str:
    word = word.lower().strip()
    word = re.sub(r"\d+|\\", "", word)
    word = re.sub("["+string.punctuation+"]", "", word)
    word = p.sub(r'\1\1',word)
    word = str(TextBlob(word).correct()) 
    return(word)

In [8]:
sent_clean = [preprocess(sent) for sent in X_train]

In [9]:
sent_clean2 = [preprocess(sent) for sent in X_test]

## 3-2) Tokenizing

In [10]:
from nltk.tokenize import word_tokenize

In [11]:
def tokenize(word: str) -> list:
    tokens = word_tokenize(word)
    word = [item for item in tokens if item not in stop_words] #Stop words removal
    return(word)

In [12]:
word_clean = [tokenize(sent) for sent in sent_clean]
word_clean = sum(word_clean, [])

In [13]:
word_clean2 = [tokenize(sent) for sent in sent_clean2]
word_clean2 = sum(word_clean2, [])

## 3-3) Build Vocabulary

In [14]:
from collections import Counter
vocab_count = Counter(word_clean)
vocab_count = vocab_count.most_common(len(vocab_count))
vocab_to_int = {word: index+2 for index, (word, count) in enumerate(vocab_count)}
vocab_to_int.update({'padding__idx':0})
vocab_to_int.update({'unk_idx':1})

In [15]:
vocab_count2 = Counter(word_clean2)
vocab_count2 = vocab_count2.most_common(len(vocab_count2))
vocab_to_int2 = {word: index+2 for index, (word, count) in enumerate(vocab_count2)}
vocab_to_int2.update({'padding__idx':0})
vocab_to_int2.update({'unk_idx':1})

#### 여기서 ```padding_idx```는 패딩에 쓰이는 인덱스, ```unk_idx```는 unknown token을 의미합니다.

### 3-4) toTensor

In [16]:
from torch.autograd import Variable

In [25]:
# Tokenize & Vectorize sequences
vectorized_seqs = []
vectorized_seqs2 = []
for seq in sent_clean: 
    vectorized_seqs.append([vocab_to_int.get(word,1) for word in seq.split()])

for seq in sent_clean2: 
    vectorized_seqs2.append([vocab_to_int2.get(word,1) for word in seq.split()])

# Save the lengths of sequences
seq_lengths = torch.LongTensor(list(map(len, vectorized_seqs)))
seq_lengths2 = torch.LongTensor(list(map(len, vectorized_seqs2)))

# Add padding(0)
seq_tensor = Variable(torch.zeros((len(vectorized_seqs), seq_lengths.max()))).long()
for idx, (seq, seqlen) in enumerate(zip(vectorized_seqs, seq_lengths)):
    seq_tensor[idx, :seqlen] = torch.LongTensor(seq)

seq_tensor2 = Variable(torch.zeros((len(vectorized_seqs2), seq_lengths2.max()))).long()
for idx, (seq, seqlen) in enumerate(zip(vectorized_seqs2, seq_lengths2)):
    seq_tensor2[idx, :seqlen] = torch.LongTensor(seq)

print(seq_lengths.max()) # tensor(30772)
print(seq_tensor[0]) # tensor([ 20,  77, 666,  ...,   0,   0,   0])
print(seq_lengths[0]) # tensor(412)

tensor(171)
tensor([ 48,   1,   1,  63,   1,   1,   1, 492,   1, 127,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0])
tensor(10)


## 4) DataLoader

In [26]:
import torch.utils.data.sampler as splr

class CustomDataLoader(object):
  def __init__(self, seq_tensor, seq_lengths, label_tensor, batch_size):
    self.batch_size = batch_size
    self.seq_tensor = seq_tensor
    self.seq_lengths = seq_lengths
    self.label_tensor = label_tensor
    self.sampler = splr.BatchSampler(splr.RandomSampler(self.label_tensor), self.batch_size, False)
    self.sampler_iter = iter(self.sampler)
    
  def __iter__(self):
    self.sampler_iter = iter(self.sampler) # reset sampler iterator
    return self

  def _next_index(self):
    return next(self.sampler_iter) # may raise StopIteration

  def __next__(self):
    index = self._next_index()

    subset_seq_tensor = self.seq_tensor[index]
    subset_seq_lengths = self.seq_lengths[index]
    subset_label_tensor = self.label_tensor[index]

    subset_seq_lengths, perm_idx = subset_seq_lengths.sort(0, descending=True)
    subset_seq_tensor = subset_seq_tensor[perm_idx]
    subset_label_tensor = subset_label_tensor[perm_idx]

    return subset_seq_tensor, subset_seq_lengths, subset_label_tensor

  def __len__(self):
    return len(self.sampler)

In [27]:
# shuffle data
label = np.array(y_train)
label = torch.as_tensor(label, dtype=torch.int16)
shuffled_idx = torch.randperm(len(label))

seq_tensor = seq_tensor[shuffled_idx]
seq_lengths = seq_lengths[shuffled_idx]
label = label[shuffled_idx]

# divide train data into 2 sets
PCT_TRAIN = 0.7 # 70% of data will be train set 
PCT_VALID = 0.3 # 30% of data will be validation set

length = len(label)
train_seq_tensor = seq_tensor[:int(length*PCT_TRAIN)] 
train_seq_lengths = seq_lengths[:int(length*PCT_TRAIN)]
train_label = label[:int(length*PCT_TRAIN)]

valid_seq_tensor = seq_tensor[int(length*PCT_TRAIN):int(length*(PCT_TRAIN+PCT_VALID))] 
valid_seq_lengths = seq_lengths[int(length*PCT_TRAIN):int(length*(PCT_TRAIN+PCT_VALID))] 
valid_label = label[int(length*PCT_TRAIN):int(length*(PCT_TRAIN+PCT_VALID))]

label2 = np.array(y_test)
label2 = torch.as_tensor(label2, dtype=torch.int16)
shuffled_idx2 = torch.randperm(len(label2))

seq_tensor2 = seq_tensor2[shuffled_idx2]
seq_lengths2 = seq_lengths2[shuffled_idx2]
label2 = label2[shuffled_idx2]

test_seq_tensor = seq_tensor2[:40]
test_seq_lengths = seq_lengths2[:40]
test_label = label2[:40]

print(train_seq_tensor.shape) 
print(valid_seq_tensor.shape) 
print(test_seq_tensor.shape) 

batch_size = 80
train_loader = CustomDataLoader(train_seq_tensor, train_seq_lengths, train_label, batch_size)
valid_loader = CustomDataLoader(valid_seq_tensor, valid_seq_lengths, valid_label, batch_size)
test_loader = CustomDataLoader(test_seq_tensor, test_seq_lengths, test_label, batch_size)

torch.Size([3458, 171])
torch.Size([1482, 171])
torch.Size([40, 70])


In [28]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class SpamHamLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size, n_layers,\
                 drop_lstm=0.1, drop_out = 0.1):

        super().__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # embedding 
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # LSTM layers
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=drop_lstm, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(drop_out)
        
        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()
        

    def forward(self, x, seq_lengths):

        # embeddings
        embedded_seq_tensor = self.embedding(x)
                
        # pack, remove pads
        packed_input = pack_padded_sequence(embedded_seq_tensor, seq_lengths.cpu().numpy(), batch_first=True)
        
        # lstm
        packed_output, (ht, ct) = self.lstm(packed_input, None)
 
        # unpack, recover padded sequence
        output, input_sizes = pad_packed_sequence(packed_output, batch_first=True)
       
        # collect the last output in each batch
        last_idxs = (input_sizes - 1).to(device) # last_idxs = input_sizes - torch.ones_like(input_sizes)
        output = torch.gather(output, 1, last_idxs.view(-1, 1).unsqueeze(2).repeat(1, 1, self.hidden_dim)).squeeze() # [batch_size, hidden_dim]
        
        # dropout and fully-connected layer
        output = self.dropout(output)
        output = self.fc(output).squeeze()
               
        # sigmoid function
        output = self.sig(output)
        
        return output

## 5) Modeling

In [29]:
# Instantiate the model w/ hyperparams

vocab_size = len(vocab_to_int)
embedding_dim = int(vocab_size ** 0.25)
hidden_dim = 15
output_size = 1
n_layers = 2
device = "cuda" if torch.cuda.is_available() else "cpu" 
net = SpamHamLSTM(vocab_size, embedding_dim, hidden_dim, output_size, n_layers, \
                 0.2, 0.2)
net = net.to(device)
print(net)

SpamHamLSTM(
  (embedding): Embedding(5935, 8)
  (lstm): LSTM(8, 15, num_layers=2, batch_first=True, dropout=0.2)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=15, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [30]:
criterion = nn.BCELoss()

lr=0.03
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,\
                                                       mode = 'min', \
                                                      factor = 0.5,\
                                                      patience = 2)

In [31]:
import numpy as np

# training params
epochs = 6 
counter = 0
print_every = 10
clip=5 # gradient clipping

net.train()
# train for some number of epochs
val_losses = []
for e in range(epochs):

    scheduler.step(e)

    for seq_tensor, seq_tensor_lengths, label in iter(train_loader):
        counter += 1
               
        seq_tensor = seq_tensor.to(device)
        seq_tensor_lengths = seq_tensor_lengths.to(device)
        label = label.to(device)
 
        # get the output from the model
        output = net(seq_tensor, seq_tensor_lengths)
    
        # get the loss and backprop
        loss = criterion(output, label.float())
        optimizer.zero_grad() 
        loss.backward()
        
        # prevent the exploding gradient
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            
            val_losses_in_itr = []
            sums = []
            sizes = []
            
            net.eval()
            
            for seq_tensor, seq_tensor_lengths, label in iter(valid_loader):

                seq_tensor = seq_tensor.to(device)
                seq_tensor_lengths = seq_tensor_lengths.to(device)
                label = label.to(device)
                output = net(seq_tensor, seq_tensor_lengths)
                
                # losses
                val_loss = criterion(output, label.float())     
                val_losses_in_itr.append(val_loss.item())
                
                # accuracy
                binary_output = (output >= 0.5).short() # short(): torch.int16
                right_or_not = torch.eq(binary_output, label)
                sums.append(torch.sum(right_or_not).float().item())
                sizes.append(right_or_not.shape[0])
            
            accuracy = sum(sums) / sum(sizes)
            
            net.train()
            print("Epoch: {:2d}/{:2d}\t".format(e+1, epochs),
                  "Steps: {:3d}\t".format(counter),
                  "Loss: {:.6f}\t".format(loss.item()),
                  "Val Loss: {:.6f}\t".format(np.mean(val_losses_in_itr)),
                  "Accuracy: {:.3f}".format(accuracy))

Epoch:  1/ 6	 Steps:  10	 Loss: 0.386192	 Val Loss: 0.396962	 Accuracy: 0.866
Epoch:  1/ 6	 Steps:  20	 Loss: 0.459664	 Val Loss: 0.389783	 Accuracy: 0.866
Epoch:  1/ 6	 Steps:  30	 Loss: 0.259173	 Val Loss: 0.363261	 Accuracy: 0.866
Epoch:  1/ 6	 Steps:  40	 Loss: 0.270328	 Val Loss: 0.272558	 Accuracy: 0.866
Epoch:  2/ 6	 Steps:  50	 Loss: 0.237449	 Val Loss: 0.182138	 Accuracy: 0.955
Epoch:  2/ 6	 Steps:  60	 Loss: 0.098895	 Val Loss: 0.166708	 Accuracy: 0.938
Epoch:  2/ 6	 Steps:  70	 Loss: 0.069132	 Val Loss: 0.137469	 Accuracy: 0.959
Epoch:  2/ 6	 Steps:  80	 Loss: 0.032315	 Val Loss: 0.121557	 Accuracy: 0.962
Epoch:  3/ 6	 Steps:  90	 Loss: 0.112846	 Val Loss: 0.137898	 Accuracy: 0.965
Epoch:  3/ 6	 Steps: 100	 Loss: 0.029517	 Val Loss: 0.120280	 Accuracy: 0.957
Epoch:  3/ 6	 Steps: 110	 Loss: 0.101225	 Val Loss: 0.105226	 Accuracy: 0.968
Epoch:  3/ 6	 Steps: 120	 Loss: 0.025927	 Val Loss: 0.090130	 Accuracy: 0.970
Epoch:  3/ 6	 Steps: 130	 Loss: 0.065104	 Val Loss: 0.095164	 Ac

In [33]:
test_losses = []
sums = []
sizes = []

net.eval()

test_losses = []

for seq_tensor, seq_tensor_lengths, label in iter(test_loader):

    seq_tensor = seq_tensor.to(device)
    seq_tensor_lengths = seq_tensor_lengths.to(device)
    label = label.to(device)
    output = net(seq_tensor, seq_tensor_lengths)

    # losses
    test_loss = criterion(output, label.float())     
    test_losses.append(test_loss.item())

    # accuracy
    binary_output = (output >= 0.5).short() # short(): torch.int16
    right_or_not = torch.eq(binary_output, label)
    sums.append(torch.sum(right_or_not).float().item())
    sizes.append(right_or_not.shape[0])

accuracy = np.sum(sums) / np.sum(sizes)
print("Test Loss: {:.6f}\t".format(np.mean(test_losses)),
      "Accuracy: {:.3f}".format(accuracy))

Test Loss: 0.648150	 Accuracy: 0.875


In [None]:
# ref: https://github.com/sijoonlee/spam-ham-walkthrough/blob/master/walkthrough.ipynb