##NLP Assignment: Spam Filter

In [33]:
#Import necessary libs and datasets

In [74]:

import numpy as np
import pandas as pd
import urllib.request

urllib.request.urlretrieve("https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv", filename="spam.csv")
data = pd.read_csv('spam.csv', encoding='latin1')

In [75]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

In [76]:
del data['Unnamed: 2']
del data['Unnamed: 3']
del data['Unnamed: 4']

data['v1'] = data['v1'].replace(['ham','spam'],[0,1])
data['text'] = data['v2']
data['isSpam'] = data['v1']

del data['v1'], data['v2']

print(f'Data Shape: {data.shape}')
# imbalanced data
print(data['isSpam'].value_counts())
data.head()

Data Shape: (5572, 2)
0    4825
1     747
Name: isSpam, dtype: int64


Unnamed: 0,text,isSpam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [77]:
#train, test split

In [78]:
from sklearn.model_selection import train_test_split

X, y = data['text'], data['isSpam']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0,
                                                   stratify=y, test_size=0.1)

print(len(X_train), len(X_test))

5014 558


In [79]:
X.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: text, dtype: object

In [80]:
import re
import string
from string import digits, punctuation
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = nltk.corpus.stopwords.words('english')
porter = nltk.PorterStemmer()
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [81]:
##preprocessing

import re

def preprocess(string: str, *args, **kwargs) -> str:
    for i in range(len(X)):
      t=X[i]
      t=str(t)
      #cleaned = t.translate(str.maketrans('', '', punctuation))
      # remove digits/numbers
      #cleaned = cleaned.translate(str.maketrans('', '', digits))
      cleaned = re.sub(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b', 'emailaddr', t)
      cleaned = re.sub(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', 'httpaddr',
                     cleaned)
      cleaned = re.sub(r'£|\$', 'moneysymb', cleaned)
      cleaned = re.sub(
        r'\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b',
        'phonenumbr', cleaned)
      cleaned = re.sub(r'[^\w\d\s]', ' ', cleaned)
      cleaned = re.sub(r'\s+', ' ', cleaned)
      cleaned = re.sub(r'^\s+|\s+?$', '', cleaned.lower())
      X[i]=cleaned
      i+=1
    return X

In [82]:
X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: text, Length: 5572, dtype: object

In [83]:
preprocess(X) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in 2 a wkly comp to win fa cup fina...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
                              ...                        
5567    this is the 2nd time we have tried 2 contact u...
5568                 will ì_ b going to esplanade fr home
5569    pity was in mood for that so any other suggest...
5570    the guy did some bitching but i acted like i d...
5571                            rofl its true to its name
Name: text, Length: 5572, dtype: object

In [84]:
#Tokenize
import nltk
import keras
from keras.preprocessing.text import Tokenizer
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = nltk.corpus.stopwords.words('english')
porter = nltk.PorterStemmer()
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [93]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

def tokenize(string: str, *args, **kwargs) -> list:
    preprocess(X)
    global stop_words
    stop_words = set(stop_words)
    for i in range(len(X)):
        result=[]
        word_tokens=word_tokenize(X[i])
        for w in word_tokens:
            if w not in stop_words:
                result.append(w)
        X[i]=result
        i+=1
    return X

In [95]:
X_t=tokenize(X)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [97]:
import tensorflow
from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(X_t) 
sequences = tokenizer.texts_to_sequences(X_t) 
vocab = tokenizer.word_index

In [100]:
#Build Vocabulary

dictionary=dict(list(vocab.items())[:2000])
dictionary['unk_idx']=1
dictionary['padding_idx']=0
dictionary

{'u': 1,
 'call': 2,
 '2': 3,
 'httpaddr': 4,
 'phonenumbr': 5,
 'get': 6,
 'ur': 7,
 'gt': 8,
 'lt': 9,
 '4': 10,
 'ok': 11,
 'free': 12,
 'go': 13,
 'know': 14,
 'got': 15,
 'like': 16,
 'good': 17,
 'day': 18,
 'come': 19,
 'time': 20,
 'love': 21,
 'send': 22,
 'want': 23,
 'text': 24,
 'one': 25,
 'txt': 26,
 'going': 27,
 'need': 28,
 'home': 29,
 'r': 30,
 'lor': 31,
 'sorry': 32,
 'see': 33,
 'stop': 34,
 'today': 35,
 'still': 36,
 'back': 37,
 'n': 38,
 'da': 39,
 'reply': 40,
 'mobile': 41,
 'dont': 42,
 'k': 43,
 'tell': 44,
 'hi': 45,
 'new': 46,
 'later': 47,
 'take': 48,
 'think': 49,
 'please': 50,
 'pls': 51,
 'week': 52,
 'phone': 53,
 'ì_': 54,
 'dear': 55,
 '1': 56,
 'c': 57,
 'well': 58,
 'much': 59,
 'great': 60,
 'oh': 61,
 'night': 62,
 'claim': 63,
 'hey': 64,
 'hope': 65,
 'na': 66,
 'wat': 67,
 'msg': 68,
 'happy': 69,
 'make': 70,
 'way': 71,
 'yes': 72,
 'work': 73,
 'give': 74,
 'number': 75,
 'message': 76,
 'e': 77,
 'wan': 78,
 'say': 79,
 'prize': 80,
