# Natural Language Processing Assignment: Spam Filter

In [1]:
import numpy as np
import pandas as pd
import urllib.request

urllib.request.urlretrieve("https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv", filename="spam.csv")
data = pd.read_csv('spam.csv', encoding='latin1')

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

import re
import nltk
from nltk.tokenize import word_tokenize

import tensorflow
from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer

In [2]:
del data['Unnamed: 2']
del data['Unnamed: 3']
del data['Unnamed: 4']

data['v1'] = data['v1'].replace(['ham','spam'],[0,1])
data['text'] = data['v2']
data['isSpam'] = data['v1']

del data['v1'], data['v2']

print(f'Data Shape: {data.shape}')
# imbalanced data
print(data['isSpam'].value_counts())
data.head()

Data Shape: (5572, 2)
0    4825
1     747
Name: isSpam, dtype: int64


Unnamed: 0,text,isSpam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [3]:
from sklearn.model_selection import train_test_split

X, y = data['text'], data['isSpam']
#X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0,
#                                                   stratify=y, test_size=0.1)

#print(len(X_train), len(X_test))

In [92]:
def cleaning(text):
    text= re.sub(r'[^ ㄱ-ㅣ가-힣A-Za-z]', '', text) #특수기호 제거, 정규 표현식    
    text1=[]
    for word in nltk.tokenize.word_tokenize(text):
        word = word.lower()
        if word not in nltk.corpus.stopwords.words('english'): #불용어 제거
            text1.append(word)
    result = ' '.join(text1)
        
    return result

X1 = X.apply(cleaning)

In [8]:
X1

0       go jurong point crazy available bugis n great ...
1                                 ok lar joking wif u oni
2       free entry wkly comp win fa cup final tkts st ...
3                     u dun say early hor u c already say
4             nah dont think goes usf lives around though
                              ...                        
5567    nd time tried contact u u pound prize claim ea...
5568                            b going esplanade fr home
5569                          pity mood soany suggestions
5570    guy bitching acted like id interested buying s...
5571                                       rofl true name
Name: text, Length: 5572, dtype: object

In [56]:
def get_corpus(text):
    words = []
    for i in text:
        for j in i.split():
            words.append(j.strip())
    return words
token = get_corpus(X1)
token

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat',
 'ok',
 'lar',
 'joking',
 'wif',
 'u',
 'oni',
 'free',
 'entry',
 'wkly',
 'comp',
 'win',
 'fa',
 'cup',
 'final',
 'tkts',
 'st',
 'may',
 'text',
 'fa',
 'receive',
 'entry',
 'questionstd',
 'txt',
 'ratetcs',
 'apply',
 'overs',
 'u',
 'dun',
 'say',
 'early',
 'hor',
 'u',
 'c',
 'already',
 'say',
 'nah',
 'dont',
 'think',
 'goes',
 'usf',
 'lives',
 'around',
 'though',
 'freemsg',
 'hey',
 'darling',
 'weeks',
 'word',
 'back',
 'id',
 'like',
 'fun',
 'still',
 'tb',
 'ok',
 'xxx',
 'std',
 'chgs',
 'send',
 'rcv',
 'even',
 'brother',
 'like',
 'speak',
 'treat',
 'like',
 'aids',
 'patent',
 'per',
 'request',
 'melle',
 'melle',
 'oru',
 'minnaminunginte',
 'nurungu',
 'vettam',
 'set',
 'callertune',
 'callers',
 'press',
 'copy',
 'friends',
 'callertune',
 'winner',
 'valued',
 'network',
 'customer',
 'selected',
 'receiv

In [64]:
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(token) 
sequences = tokenizer.texts_to_sequences(token) 
vocab = tokenizer.word_index
len(vocab)

8404

In [73]:
vocab1=dict(list(vocab.items())[:2000]) 
del(vocab1['u'])
vocab1['unk_idx']=1
vocab1['padding_idx']=0
vocab1

{'call': 2,
 'im': 3,
 'get': 4,
 'ur': 5,
 'dont': 6,
 'go': 7,
 'free': 8,
 'ok': 9,
 'ltgt': 10,
 'know': 11,
 'got': 12,
 'like': 13,
 'ill': 14,
 'good': 15,
 'come': 16,
 'day': 17,
 'time': 18,
 'love': 19,
 'want': 20,
 'send': 21,
 'text': 22,
 'going': 23,
 'one': 24,
 'need': 25,
 'txt': 26,
 'home': 27,
 'lor': 28,
 'see': 29,
 'sorry': 30,
 'r': 31,
 'still': 32,
 'stop': 33,
 'back': 34,
 'n': 35,
 'reply': 36,
 'today': 37,
 'mobile': 38,
 'tell': 39,
 'new': 40,
 'well': 41,
 'later': 42,
 'hi': 43,
 'think': 44,
 'da': 45,
 'please': 46,
 'take': 47,
 'phone': 48,
 'cant': 49,
 'week': 50,
 'claim': 51,
 'much': 52,
 'night': 53,
 'dear': 54,
 'oh': 55,
 'great': 56,
 'hey': 57,
 'pls': 58,
 'na': 59,
 'happy': 60,
 'hope': 61,
 'make': 62,
 'thats': 63,
 'way': 64,
 'give': 65,
 'work': 66,
 'wat': 67,
 'wan': 68,
 'number': 69,
 'prize': 70,
 'right': 71,
 'yes': 72,
 'say': 73,
 'already': 74,
 'tomorrow': 75,
 'ask': 76,
 'really': 77,
 'yeah': 78,
 'said': 79,
 'e

In [93]:
from keras.preprocessing import sequence
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(X1)
tokenized_x = tokenizer.texts_to_sequences(X1)
X2 = sequence.pad_sequences(tokenized_x, maxlen=10)
X2

array([[ 556, 1093,   35, ..., 1094,   12,   67],
       [   0,    0,    0, ...,  340,    1, 1635],
       [ 503, 1637,  149, ...,  355,   26,  268],
       ...,
       [   0,    0,    0, ...,    0,    0, 1091],
       [ 255, 1143,  912, ...,  829,  117,    8],
       [   0,    0,    0, ...,    0,  376,  157]])

In [96]:
import torch
from torchvision import transforms
from torch.autograd import Variable

In [94]:
def toTensor(data, vocab):
    
    tensor = torch.zeros(len(vocab))
    for w in data:
        index = vocab.get(w)
        if index!=None:
            tensor[index]+=1.
        else:
            index=vocab['unk_idx']
            tensor[index]+=1.
    
    return tensor

In [97]:
X3=torch.cat([Variable(toTensor(X2[i],vocab1)).view(1,-1)
           for i in range(0, len(X2))])

y = torch.cat([Variable(torch.LongTensor(np.array(y)))])

In [98]:
X3

tensor([[ 0., 10.,  0.,  ...,  0.,  0.,  0.],
        [ 0., 10.,  0.,  ...,  0.,  0.,  0.],
        [ 0., 10.,  0.,  ...,  0.,  0.,  0.],
        ...,
        [ 0., 10.,  0.,  ...,  0.,  0.,  0.],
        [ 0., 10.,  0.,  ...,  0.,  0.,  0.],
        [ 0., 10.,  0.,  ...,  0.,  0.,  0.]])

In [114]:
X_train, X_test, y_train, y_test = train_test_split(X3, y, random_state=0, stratify=y,
                                                    test_size=0.2)

In [117]:
X_train

tensor([[ 0., 10.,  0.,  ...,  0.,  0.,  0.],
        [ 0., 10.,  0.,  ...,  0.,  0.,  0.],
        [ 0., 10.,  0.,  ...,  0.,  0.,  0.],
        ...,
        [ 0., 10.,  0.,  ...,  0.,  0.,  0.],
        [ 0., 10.,  0.,  ...,  0.,  0.,  0.],
        [ 0., 10.,  0.,  ...,  0.,  0.,  0.]])