# Natural Language Processing Assignment: Spam Filter

In [1]:
import numpy as np
import pandas as pd
import urllib.request

urllib.request.urlretrieve("https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv", filename="spam.csv")
data = pd.read_csv('spam.csv', encoding='latin1')

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

import re
import nltk
from nltk.tokenize import word_tokenize

import tensorflow
from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer

In [2]:
del data['Unnamed: 2']
del data['Unnamed: 3']
del data['Unnamed: 4']

data['v1'] = data['v1'].replace(['ham','spam'],[0,1])
data['text'] = data['v2']
data['isSpam'] = data['v1']

del data['v1'], data['v2']

print(f'Data Shape: {data.shape}')
# imbalanced data
print(data['isSpam'].value_counts())
data.head()

Data Shape: (5572, 2)
0    4825
1     747
Name: isSpam, dtype: int64


Unnamed: 0,text,isSpam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [3]:
from sklearn.model_selection import train_test_split

X, y = data['text'], data['isSpam']
#X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0,
#                                                   stratify=y, test_size=0.1)

#print(len(X_train), len(X_test))

In [4]:
def cleaning(text):
    text= re.sub(r'[^ ㄱ-ㅣ가-힣A-Za-z]', '', text) #특수기호 제거, 정규 표현식    
    text1=[]
    for word in nltk.tokenize.word_tokenize(text):
        word = word.lower()
        if word not in nltk.corpus.stopwords.words('english'): #불용어 제거
            text1.append(word)
    result = ' '.join(text1)
        
    return result

X1 = X.apply(cleaning)

In [46]:
X1 # for modeling

0       go jurong point crazy available bugis n great ...
1                                 ok lar joking wif u oni
2       free entry wkly comp win fa cup final tkts st ...
3                     u dun say early hor u c already say
4             nah dont think goes usf lives around though
                              ...                        
5567    nd time tried contact u u pound prize claim ea...
5568                            b going esplanade fr home
5569                          pity mood soany suggestions
5570    guy bitching acted like id interested buying s...
5571                                       rofl true name
Name: text, Length: 5572, dtype: object

In [7]:
def get_corpus(text):
    words = []
    for i in text:
        for j in i.split():
            words.append(j.strip())
    return words
token = get_corpus(X1)
token[:5]

['go', 'jurong', 'point', 'crazy', 'available']

-------------

In [24]:
from collections import Counter
vocab_count = Counter(token)
vocab_count = vocab_count.most_common(len(vocab_count))
vocab_count[:5]

[('u', 1142), ('call', 579), ('im', 474), ('get', 390), ('ur', 384)]

In [27]:
vocab_to_int = {word : index+2 for index, (word, count) in enumerate(vocab_count)}
vocab_to_int.update({'padding_idx': 0}) # index 0 for padding
vocab_to_int.update({'unk_idx': 1})
vocab_to_int

{'u': 2,
 'call': 3,
 'im': 4,
 'get': 5,
 'ur': 6,
 'dont': 7,
 'go': 8,
 'free': 9,
 'ok': 10,
 'ltgt': 11,
 'know': 12,
 'got': 13,
 'like': 14,
 'ill': 15,
 'good': 16,
 'come': 17,
 'day': 18,
 'time': 19,
 'love': 20,
 'want': 21,
 'send': 22,
 'text': 23,
 'going': 24,
 'one': 25,
 'need': 26,
 'txt': 27,
 'home': 28,
 'lor': 29,
 'see': 30,
 'sorry': 31,
 'r': 32,
 'still': 33,
 'stop': 34,
 'back': 35,
 'n': 36,
 'reply': 37,
 'today': 38,
 'mobile': 39,
 'tell': 40,
 'new': 41,
 'well': 42,
 'later': 43,
 'hi': 44,
 'think': 45,
 'da': 46,
 'please': 47,
 'take': 48,
 'phone': 49,
 'cant': 50,
 'week': 51,
 'claim': 52,
 'much': 53,
 'night': 54,
 'dear': 55,
 'oh': 56,
 'great': 57,
 'hey': 58,
 'pls': 59,
 'na': 60,
 'happy': 61,
 'hope': 62,
 'make': 63,
 'thats': 64,
 'way': 65,
 'give': 66,
 'work': 67,
 'wat': 68,
 'wan': 69,
 'number': 70,
 'prize': 71,
 'right': 72,
 'yes': 73,
 'say': 74,
 'already': 75,
 'tomorrow': 76,
 'ask': 77,
 'really': 78,
 'yeah': 79,
 'said

In [43]:
import torch
from torch.autograd import Variable

# 각 단어의 index(freq순)
vectorized_seqs = []
for seq in X1: 
    vectorized_seqs.append([vocab_to_int.get(word,1) for word in seq.split()])

# 각 문장의 단어 개수
seq_lengths = torch.LongTensor(list(map(len, vectorized_seqs)))

# padding (문장 별 단어 개수가 다르니까 0으로 맞추기)
seq_tensor = Variable(torch.zeros((len(vectorized_seqs), seq_lengths.max()))).long()
for idx, (seq, seqlen) in enumerate(zip(vectorized_seqs, seq_lengths)):
    seq_tensor[idx, :seqlen] = torch.LongTensor(seq)

seq_tensor

tensor([[   8, 3837,  665,  ...,    0,    0,    0],
        [  10,  218, 1235,  ...,    0,    0,    0],
        [   9,  356,  624,  ...,    0,    0,    0],
        ...,
        [8402, 1092, 8403,  ...,    0,    0,    0],
        [ 396, 8405, 3788,  ...,    0,    0,    0],
        [2291,  377,  158,  ...,    0,    0,    0]])

----------------

In [8]:
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(token) 
sequences = tokenizer.texts_to_sequences(token) 
vocab = tokenizer.word_index
len(vocab)

8404

In [31]:
vocab1=dict(list(vocab.items())[1:2000]) 
vocab1['unk_idx']=1
vocab1['padding_idx']=0
vocab1

{'call': 2,
 'im': 3,
 'get': 4,
 'ur': 5,
 'dont': 6,
 'go': 7,
 'free': 8,
 'ok': 9,
 'ltgt': 10,
 'know': 11,
 'got': 12,
 'like': 13,
 'ill': 14,
 'good': 15,
 'come': 16,
 'day': 17,
 'time': 18,
 'love': 19,
 'want': 20,
 'send': 21,
 'text': 22,
 'going': 23,
 'one': 24,
 'need': 25,
 'txt': 26,
 'home': 27,
 'lor': 28,
 'see': 29,
 'sorry': 30,
 'r': 31,
 'still': 32,
 'stop': 33,
 'back': 34,
 'n': 35,
 'reply': 36,
 'today': 37,
 'mobile': 38,
 'tell': 39,
 'new': 40,
 'well': 41,
 'later': 42,
 'hi': 43,
 'think': 44,
 'da': 45,
 'please': 46,
 'take': 47,
 'phone': 48,
 'cant': 49,
 'week': 50,
 'claim': 51,
 'much': 52,
 'night': 53,
 'dear': 54,
 'oh': 55,
 'great': 56,
 'hey': 57,
 'pls': 58,
 'na': 59,
 'happy': 60,
 'hope': 61,
 'make': 62,
 'thats': 63,
 'way': 64,
 'give': 65,
 'work': 66,
 'wat': 67,
 'wan': 68,
 'number': 69,
 'prize': 70,
 'right': 71,
 'yes': 72,
 'say': 73,
 'already': 74,
 'tomorrow': 75,
 'ask': 76,
 'really': 77,
 'yeah': 78,
 'said': 79,
 'e

In [45]:
from keras.preprocessing import sequence
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X1)
tokenized_x = tokenizer.texts_to_sequences(X1)
X2 = sequence.pad_sequences(tokenized_x, maxlen=10)
X2

array([[  35,   56,  253, ...,   12, 3837,   67],
       [   0,    0,    0, ...,  340,    1, 1635],
       [ 176,   22, 1636, ..., 2574,  268, 2575],
       ...,
       [   0,    0,    0, ..., 1091, 8402, 8403],
       [ 255, 1143,  912, ...,  829,  117,    8],
       [   0,    0,    0, ..., 2290,  376,  157]])

In [14]:
import torch
from torchvision import transforms
from torch.autograd import Variable

In [15]:
def toTensor(data, vocab):
    
    tensor = torch.zeros(len(vocab))
    for w in data:
        index = vocab.get(w)
        if index!=None:
            tensor[index]+=1.
        else:
            index=vocab['unk_idx']
            tensor[index]+=1.
    
    return tensor

In [16]:
X3=torch.cat([Variable(toTensor(X2[i],vocab1)).view(1,-1)
           for i in range(0, len(X2))])

y = torch.cat([Variable(torch.LongTensor(np.array(y)))])

In [17]:
X3

tensor([[ 0., 10.,  0.,  ...,  0.,  0.,  0.],
        [ 0., 10.,  0.,  ...,  0.,  0.,  0.],
        [ 0., 10.,  0.,  ...,  0.,  0.,  0.],
        ...,
        [ 0., 10.,  0.,  ...,  0.,  0.,  0.],
        [ 0., 10.,  0.,  ...,  0.,  0.,  0.],
        [ 0., 10.,  0.,  ...,  0.,  0.,  0.]])

----------------

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X1, y, random_state=0, stratify=y,
                                                    test_size=0.2)

In [48]:
X_train

1257                                         also cbe pay
5461                            ok thk got u wan come wat
1612    rtking pro video club need help inforingtoneki...
2179                         popping ltgt ibuprofens help
2638                                    gobi arts college
                              ...                        
3297    message free welcome new improved sex dogging ...
1054    hiya comin bristol st week april les got rudi ...
245                 late said website didnt dont slippers
1235    opinion jada kusruthi lovable silent spl chara...
3361                            messages phone im holding
Name: text, Length: 4457, dtype: object

In [51]:
max_features = 2000
maxlen = 30

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)
tokenized_train = tokenizer.texts_to_sequences(X_train)
X_train = sequence.pad_sequences(tokenized_train, maxlen=maxlen)

tokenized_test = tokenizer.texts_to_sequences(X_test)
X_test = sequence.pad_sequences(tokenized_test, maxlen=maxlen)

In [52]:
X_train

array([[   0,    0,    0, ...,  105, 1195,  329],
       [   0,    0,    0, ...,   66,   16,   74],
       [   0,    0,    0, ...,  435,  925,  241],
       ...,
       [   0,    0,    0, ...,   78,    8, 1763],
       [   0,    0,    0, ...,  452,   57,   38],
       [   0,    0,    0, ...,   47,    3, 1413]])

In [64]:
batch_size = 32
epochs = 5
embed_size = 50

In [59]:
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM,Dropout,Bidirectional,GRU

#Defining Neural Network
model = Sequential()
#Non-trainable embeddidng layer
model.add(Embedding(max_features, output_dim=embed_size, input_length = maxlen, trainable=True))
#LSTM 
model.add(Bidirectional(LSTM(units=128, return_sequences = True)))
model.add(Bidirectional(GRU(units=32)))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [65]:
history = model.fit(X_train, y_train, batch_size = batch_size , 
                    validation_data = (X_test,y_test) , epochs = 20)



TypeError: Expected DataType for argument 'Tout' not torch.int64.