### NLP Assignment : Spam Filter

#### Import necessary libs and datasets

In [146]:
import pandas as pd
import numpy as np
import urllib.request

urllib.request.urlretrieve("https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv", filename="spam.csv")
data = pd.read_csv('spam.csv', encoding='latin1')

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

In [147]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [148]:
del data['Unnamed: 2']
del data['Unnamed: 3']
del data['Unnamed: 4']

data['v1'] = data['v1'].replace(['ham','spam'],[0,1])
data['text'] = data['v2']
data['isSpam'] = data['v1']

del data['v1'], data['v2']

print(f'Data Shape: {data.shape}')
# imbalanced data
print(data['isSpam'].value_counts())
data.head()

Data Shape: (5572, 2)
0    4825
1     747
Name: isSpam, dtype: int64


Unnamed: 0,text,isSpam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


#### train, test split

In [149]:
from sklearn.model_selection import train_test_split

X, y = data['text'], data['isSpam']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0,
                                                   stratify=y, test_size=0.1)

print(len(X_train), len(X_test))

5014 558


In [150]:
X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: text, Length: 5572, dtype: object

In [151]:
#import nltk
#nltk.download('stopwords')

In [152]:
import re
def preprocessing(text):
    pattern = '([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)' # E-mail주소제거
    text = re.sub(pattern=pattern, repl='', string=text)
    pattern = '<[^>]*>' # HTML 태그 제거
    text = re.sub(pattern=pattern, repl='', string=text)
    pattern = '[^\w\s]' # 특수기호제거
    text = re.sub(pattern=pattern, repl='', string=text)
    text = text.lower() # 소문자로
    pattern = '[0-9]' # 숫자 제거
    text = re.sub(pattern=pattern, repl='', string=text)
    text = text.strip() # 공백 제거
    stop_words = set(stopwords.words('english')) # 불용어 제거
    word_tokens = word_tokenize(text)
    result = ""
    for w in word_tokens: 
        if w not in stop_words: 
            result = result +" "+ w
    return result

In [153]:
X1 = X.apply(preprocessing)

In [154]:
X1.head()

0     go jurong point crazy available bugis n great...
1                              ok lar joking wif u oni
2     free entry wkly comp win fa cup final tkts st...
3                  u dun say early hor u c already say
4          nah dont think goes usf lives around though
Name: text, dtype: object

In [155]:
def get_corpus(text):
    words = []
    for i in text:
        for j in i.split():
            words.append(j.strip())
    return words
token = get_corpus(X1)

In [156]:
token

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat',
 'ok',
 'lar',
 'joking',
 'wif',
 'u',
 'oni',
 'free',
 'entry',
 'wkly',
 'comp',
 'win',
 'fa',
 'cup',
 'final',
 'tkts',
 'st',
 'may',
 'text',
 'fa',
 'receive',
 'entry',
 'questionstd',
 'txt',
 'ratetcs',
 'apply',
 'overs',
 'u',
 'dun',
 'say',
 'early',
 'hor',
 'u',
 'c',
 'already',
 'say',
 'nah',
 'dont',
 'think',
 'goes',
 'usf',
 'lives',
 'around',
 'though',
 'freemsg',
 'hey',
 'darling',
 'weeks',
 'word',
 'back',
 'id',
 'like',
 'fun',
 'still',
 'tb',
 'ok',
 'xxx',
 'std',
 'chgs',
 'send',
 'å',
 'rcv',
 'even',
 'brother',
 'like',
 'speak',
 'treat',
 'like',
 'aids',
 'patent',
 'per',
 'request',
 'melle',
 'melle',
 'oru',
 'minnaminunginte',
 'nurungu',
 'vettam',
 'set',
 'callertune',
 'callers',
 'press',
 'copy',
 'friends',
 'callertune',
 'winner',
 'valued',
 'network',
 'customer',
 'selected',
 '

In [157]:
import tensorflow
from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(token) 
sequences = tokenizer.texts_to_sequences(token) 
vocab = tokenizer.word_index
len(vocab)

8473

In [158]:
dictionary=dict(list(vocab.items())[:2000]) 
del(dictionary['å'])
dictionary['unk_idx']=1
dictionary['padding_idx']=0
dictionary

{'u': 1,
 'call': 2,
 'im': 3,
 'get': 4,
 'ur': 5,
 'go': 6,
 'dont': 7,
 'free': 8,
 'ok': 9,
 'ltgt': 10,
 'know': 12,
 'got': 13,
 'like': 14,
 'ill': 15,
 'good': 16,
 'come': 17,
 'day': 18,
 'time': 19,
 'love': 20,
 'want': 21,
 'send': 22,
 'text': 23,
 'going': 24,
 'one': 25,
 'need': 26,
 'txt': 27,
 'home': 28,
 'lor': 29,
 'see': 30,
 'sorry': 31,
 'r': 32,
 'still': 33,
 'stop': 34,
 'back': 35,
 'n': 36,
 'reply': 37,
 'today': 38,
 'mobile': 39,
 'tell': 40,
 'new': 41,
 'later': 42,
 'well': 43,
 'hi': 44,
 'think': 45,
 'da': 46,
 'please': 47,
 'take': 48,
 'phone': 49,
 'cant': 50,
 'ì': 51,
 'week': 52,
 'claim': 53,
 'much': 54,
 'night': 55,
 'dear': 56,
 'oh': 57,
 'great': 58,
 'hey': 59,
 'pls': 60,
 'na': 61,
 'happy': 62,
 'hope': 63,
 'make': 64,
 'way': 65,
 'give': 66,
 'work': 67,
 'wat': 68,
 'wan': 69,
 'thats': 70,
 'number': 71,
 'prize': 72,
 'right': 73,
 'yes': 74,
 'say': 75,
 'already': 76,
 'tomorrow': 77,
 'ask': 78,
 'really': 79,
 'yeah': 8

In [159]:
from keras.preprocessing import sequence
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(X1)
tokenized_x = tokenizer.texts_to_sequences(X1)
X2 = sequence.pad_sequences(tokenized_x, maxlen=10)
X2

array([[ 558, 1096,   36, ..., 1097,   13,   68],
       [   0,    0,    0, ...,  341,    1, 1643],
       [ 504, 1645,  150, ...,  355,   27,  268],
       ...,
       [   0,    0,    0, ...,    0,    0, 1094],
       [ 269, 1147,  916, ...,  833,  119,    8],
       [   0,    0,    0, ...,    0,  379,  160]])

In [160]:
import torch
from torchvision import transforms
from torch.autograd import Variable

In [161]:
def toTensor(data, vocab):
    
    tensor = torch.zeros(len(vocab))
    for w in data:
        index = vocab.get(w)
        if index!=None:
            tensor[index]+=1.
        else:
            index=vocab['unk_idx']
            tensor[index]+=1.
    
    return tensor