# Natural Language Processing Assignment: Spam Filter
## Import necessary libs and datasets

In [74]:
import numpy as np
import pandas as pd
import urllib.request

urllib.request.urlretrieve("https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv", filename="spam.csv")
data = pd.read_csv('spam.csv', encoding='latin1')

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

In [76]:
del data['Unnamed: 2']
del data['Unnamed: 3']
del data['Unnamed: 4']

data['v1'] = data['v1'].replace(['ham','spam'],[0,1])
data['text'] = data['v2']
data['isSpam'] = data['v1']

del data['v1'], data['v2']

print(f'Data Shape: {data.shape}')
# imbalanced data
print(data['isSpam'].value_counts())
data.head()

Data Shape: (5572, 2)
0    4825
1     747
Name: isSpam, dtype: int64


Unnamed: 0,text,isSpam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


## train, test split
### 평가에 사용할 예정이니 트레인, 테스트 스플릿 코드는 그대로 유지시켜주세요

In [59]:
from sklearn.model_selection import train_test_split

X, y = data['text'], data['isSpam']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0,
                                                   stratify=y, test_size=0.1)

print(len(X_train), len(X_test))

5014 558


## Preprocessing
### 텍스트 전처리함수입니다.

In [77]:
data.head()

Unnamed: 0,text,isSpam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [61]:
import re

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

def preprocess(string: str, *args, **kwargs) -> str:
    string = re.sub('[^a-zA-Z]', ' ', string)  
    string = string.lower()  
    string = string.split(' ')  
    string = [w for w in string if not w in set(stopwords.words('english'))] 
    string = ' '.join(string)    
    return string


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [79]:
data['preprocessed'] = [preprocess(row) for row in data['text']]

In [80]:
data

Unnamed: 0,text,isSpam,preprocessed
0,"Go until jurong point, crazy.. Available only ...",0,go jurong point crazy available bugis n gre...
1,Ok lar... Joking wif u oni...,0,ok lar joking wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry wkly comp win fa cup final tkts ...
3,U dun say so early hor... U c already then say...,0,u dun say early hor u c already say
4,"Nah I don't think he goes to usf, he lives aro...",0,nah think goes usf lives around though
...,...,...,...
5567,This is the 2nd time we have tried 2 contact u...,1,nd time tried contact u u pound priz...
5568,Will Ì_ b going to esplanade fr home?,0,b going esplanade fr home
5569,"Pity, * was in mood for that. So...any other s...",0,pity mood suggestions
5570,The guy did some bitching but I acted like i'd...,0,guy bitching acted like interested buying some...


#### 앞에서 보셨다시피 raw text를 그대로 사용하기엔 무리가 있습니다.(특수기호 및 불용어 문제 등)
#### 따라서 전처리되지 않은 raw string을 전처리하는 함수를 만들어주세요. <br>
```python
preprocess('Helllllo World-!') = 'hello world'
```
<br>

#### ```re``` library를 이용해서 전처리를 쉽게 할 수 있습니다.


[re documentation](https://docs.python.org/3/library/re.html)

## Tokenizing
### 전처리된 텍스트를 토크나이징 해주는 함수입니다.
#### ```SpaCy, nltk``` 등 영어 tokenizing 라이브러리를 쓰셔도 괜찮습니다.

special thanks to 유림

In [81]:
from keras.preprocessing import text, sequence
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [82]:
def tokenize(string: str, *args, **kwargs) -> list:
  tokenized = nltk.tokenize.word_tokenize(string)
  return tokenized

In [84]:
data['tokenized']=data['preprocessed'].apply(lambda x: tokenize(x))

In [85]:
data['tokenized'].head()

0    [go, jurong, point, crazy, available, bugis, n...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, wkly, comp, win, fa, cup, final,...
3        [u, dun, say, early, hor, u, c, already, say]
4       [nah, think, goes, usf, lives, around, though]
Name: tokenized, dtype: object

<br>

Ex) 
```python
tokenize('hello world!',  *args, **kwargs) = ['hello', 'world']
```

## Build Vocabulary
### 토큰들을 이용해서 자주 등장한 순서대로 n개의 원소를 갖는 딕셔너리를 만들어주세요.

https://docs.python.org/2/library/collections.html 참조

In [None]:
from collections import Counter

In [86]:
def build_vocab(n, *args, **kwargs):
  count = Counter()
  for voca in data['tokenized']:
    count.update(voca)
    
  voca_dic = dict()
  voca_dic['padding_idx'] = 0
  voca_dic['unk_idx'] = 1
    
  for i in range(n-2):
    voca_dic[count.most_common(n)[i][0]] = i+2
  return voca_dic

In [87]:
vocabulary = build_vocab(5000)

In [None]:
vocabulary

<br>

Ex) 
```python
vocab = build_vocab(4, *args, **kwargs)
vocab = {'padding_idx': 0, 'unk_idx': 1, 'hello': 2, 'world': 3}
```

#### 여기서 ```padding_idx```는 패딩에 쓰이는 인덱스, ```unk_idx```는 unknown token을 의미합니다.

### toTensor
#### 토큰들을 텐서로 바꿔주는 함수입니다.

In [None]:
import torch
from torch.autograd import Variable


In [99]:
def toTensor(max_len, *args, **kwargs) -> torch.LongTensor: 
  vectorized = []
  for seq in data['preprocessed']: 
    vectorized.append([vocabulary.get(word,1) for word in seq.split()])
 
  seq_lengths = torch.LongTensor(list(map(len, vectorized)))
  
  # Add padding(0)
  seq_tensor = Variable(torch.zeros((len(vectorized), seq_lengths.max()))).long()
  for idx, (seq, seqlen) in enumerate(zip(vectorized, seq_lengths)):
    if seqlen < max_len:
      seq_tensor[idx, :seqlen] = torch.LongTensor(seq)
    else:
      seq_tensor[idx, :max_len] = torch.LongTensor(seq[:max_len])

  return seq_tensor, seq_lengths
  


In [110]:
seq_tensor, seq_lengths = toTensor(100)

In [111]:
print(seq_tensor[0]) 
print(seq_lengths[0]) # tensor(412)

tensor([  10, 3773,  683,  606,  522, 1129,   25,   56,  232,  844,   73, 2583,
        1130,   12, 3774,   65,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0])
tensor(16)


In [112]:
print(seq_tensor.max())
print(seq_lengths.max())

tensor(4999)
tensor(77)


시퀀스의 max length가 5일 때 다음과 같습니다.
<br>

Ex)
```python
toTensor(5, ['hello', 'world!', 'yonsei']) = torch.LongTensor([2, 3, 1, 0, 0])
```

여기서 ```yonsei``` 단어는 아까 만든 단어장(vocab)에 포함되지 않은 단어로 ```unk_idx```로 처리됩니다.

### 위의 함수들을 이용하고 적절한 코드 및 parameter를 적용해서 
### MailDataset과 train에 쓸 DataLoader를 만들어주세요.

In [113]:
import torch.utils.data.sampler as splr

class MailDataset(Dataset):
  def __init__(self, seq_tensor, seq_lengths, label_tensor, batch_size):
    self.batch_size = batch_size
    self.seq_tensor = seq_tensor
    self.seq_lengths = seq_lengths
    self.label_tensor = label_tensor
    self.sampler = splr.BatchSampler(splr.RandomSampler(self.label_tensor), self.batch_size, False)
    self.sampler_iter = iter(self.sampler)
    
  def __iter__(self):
    self.sampler_iter = iter(self.sampler) # reset sampler iterator
    return self

  def _next_index(self):
    return next(self.sampler_iter) # may raise StopIteration

  def __next__(self):
    index = self._next_index()

    subset_seq_tensor = self.seq_tensor[index]
    subset_seq_lengths = self.seq_lengths[index]
    subset_label_tensor = self.label_tensor[index]

    # order by length to use pack_padded_sequence()
    subset_seq_lengths, perm_idx = subset_seq_lengths.sort(0, descending=True)
    subset_seq_tensor = subset_seq_tensor[perm_idx]
    subset_label_tensor = subset_label_tensor[perm_idx]

    return subset_seq_tensor, subset_seq_lengths, subset_label_tensor

  def __len__(self):
    return len(self.sampler)


In [136]:
label = np.array(y_train)
label = torch.as_tensor(label)
label2 = np.array(y_test)
label2 = torch.as_tensor(label2)
# , dtype=torch.int16 

In [118]:
label

tensor([0, 0, 0,  ..., 0, 0, 1], dtype=torch.int16)

In [137]:
batch_size = 80
p_train = 0.7
p_val = 0.2

train_preprocessed, test_preprocessed = X_train.apply(preprocess), X_test.apply(preprocess)

train_tokenized, test_tokenized = train_preprocessed.apply(tokenize), test_preprocessed.apply(tokenize)

vocab = build_vocab(5000, X.apply(preprocess).apply(tokenize))

max_length = len(label)
max_length2 = len(label2)
train_seq_tensor, train_seq_lengths = toTensor(int(max_length), train_tokenized, vocab)
test_seq_tensor, test_seq_lengths = toTensor(max_length2, test_tokenized, vocab)
train_label=label[:int(max_length)]
test_label=label2[:int(max_length2)]

train_loader = MailDataset(train_seq_tensor, train_seq_lengths, train_label, batch_size)
test_loader = MailDataset(test_seq_tensor, test_seq_lengths, test_label, batch_size)

train_size = len(train_loader.seq_tensor)
test_size = len(test_loader.seq_tensor)


print(train_seq_tensor.shape) 
print(test_seq_tensor.shape) 

torch.Size([5572, 77])
torch.Size([5572, 77])


In [138]:
train_loader = MailDataset(train_seq_tensor, train_seq_lengths, train_label, batch_size)
test_loader = MailDataset(test_seq_tensor, test_seq_lengths, test_label, batch_size)

### 훈련 인스턴스를 사용해서 train 함수를 통해 training을 해주시고,
### eval 함수를 통해 40개의 test example에 대해서 accuracy를 측정해주세요.
### 함수 및 클래스 signature와 내부 코드는 적절히 알아서 짜주시면 됩니다.

CNN for classification을 적용...하려 시도했으나 마지막에 튕김..
다시 해보겠습니다...
https://towardsdatascience.com/convolutional-neural-network-in-natural-language-processing-96d67f91275c

In [None]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim

# Create word embedding from scratch
embeddings = nn.Embedding(max_length, 100)

device = "cuda" if torch.cuda.is_available() else "cpu"

# Build CNN model
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.embeddings = nn.Embedding(max_length, 100)
        self.cnn = nn.Conv2d(1, 100, (3, 100))
        self.clf = nn.Linear(100, 2)

    def forward(self, x):
        # Add word embeddings
        x = self.embeddings(x)
        # Add an extra dimension for CNN
        x = x.unsqueeze(1)
        # Apply CNN
        x = self.cnn(x)
        # Choose the maximum value of each filter and delete the extra dimension
        x = x.max(2)[0].squeeze(2)
        # Choose the most important features for the classification
        x = F.relu(x) 
        #  Apply linear nn for classification
        x = self.clf(x)
        # Return the probability of positive and negative
        return F.softmax(x, 1)

# Use GPU for the model      
model = Model().cuda()
# opmization function
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# calculate the loss
criterio  = nn.CrossEntropyLoss()

In [146]:
from sklearn import metrics

# Function for evaluating
def get_f1(X, y_real):
  y_pred = []
  for x in X:
      # Choose the value with higher probability
      y_pred.append(model(x.cuda()).argmax(1).detach())
  y_pred = torch.cat(y_pred)
  return metrics.f1_score(y_true=y_real, y_pred=y_pred)

# Training steps
epochs = 20
LOSS = []
for e in range(epochs):
    for seq_tensor, seq_tensor_lengths, label in iter(train_loader):
        
        # Delete the prvious values of the gradient
        optimizer.zero_grad()
        seq_tensor, label = seq_tensor.cuda(), label.cuda()
        y_pred = model(seq_tensor)
        loss = criterio(y_pred, label)

        # Compute the gradient
        loss.backward()

        # Apply the optimization method for one step
        optimizer.step()
        
        LOSS.append(loss.item())
        if i%200==0:
            with torch.no_grad():
                f1 = get_f1(X_test, y_test)
            print('Epoch: %d \t Batch: %d \t Loss: %.10f \t F1_test: %.10f'%(e,i, torch.tensor(LOSS[-100:]).mean(), f1))


AttributeError: ignored