In [76]:
import re
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
import random
from PIL import Image
import matplotlib.pyplot as plt
from autocorrect import spell

In [77]:
%matplotlib inline
pd.options.display.max_colwidth = -1 #does not truncate text

## Data Reading

In [78]:
data_dir = 'C:/Anaconda3/Scripts/'
file_name = 'spam.csv'

In [79]:
df = pd.read_csv(data_dir + file_name,encoding = 'ISO-8859-1')

In [80]:
df.head(10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives around here though",,,
5,spam,"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv",,,
6,ham,Even my brother is not like to speak with me. They treat me like aids patent.,,,
7,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune,,,
8,spam,WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.,,,
9,spam,Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030,,,


In [81]:
col = ['v1', 'v2']
df = df[col]

In [82]:
df.head(10)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"
5,spam,"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv"
6,ham,Even my brother is not like to speak with me. They treat me like aids patent.
7,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
8,spam,WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
9,spam,Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030


In [83]:
df.columns = ['class', 'msg']

In [84]:
df = df[pd.notnull(df['msg'])] #remove NULL messages

In [85]:
df['class'].factorize() #One not encoding or factorize

(array([0, 0, 1, ..., 0, 0, 0], dtype=int64),
 Index(['ham', 'spam'], dtype='object'))

In [86]:
df['class'] = df['class'].factorize()[0]

In [87]:
df.head()

Unnamed: 0,class,msg
0,0,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives around here though"


## Keep Alphabets

In [88]:
df['clean'] = df['msg'].apply(lambda x : re.sub('[^A-Za-z]', ' ', x)) #regex.substitute

In [89]:
df.head()

Unnamed: 0,class,msg,clean
0,0,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amore wat
1,0,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's,Free entry in a wkly comp to win FA Cup final tkts st May Text FA to to receive entry question std txt rate T C s apply over s
3,0,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say
4,0,"Nah I don't think he goes to usf, he lives around here though",Nah I don t think he goes to usf he lives around here though


## Convert Case

In [90]:
df['clean'] = df['clean'].apply(lambda x : x.lower()) 

## Remove Stop Words

In [91]:
stopwords_list = list(set(stopwords.words('english'))) #remove duplicates and put
def remove_stopwords (input_text):
    tokens = input_text.split()
    clean_tokens = [word for word in tokens if word not in stopwords_list]
    return ' '.join(clean_tokens)

In [92]:
df['clean'] = df['clean'].apply(remove_stopwords)

In [93]:
df.head()

Unnamed: 0,class,msg,clean
0,0,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",go jurong point crazy available bugis n great world la e buffet cine got amore wat
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's,free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply
3,0,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,0,"Nah I don't think he goes to usf, he lives around here though",nah think goes usf lives around though


## Stemming/Lammatizer

In [94]:
lemmatizer = WordNetLemmatizer()

def lemmatize(input_text):
    tokens = input_text.split()
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmas)

In [95]:
df['lemmas'] = df['clean'].apply(get_lemmas)

In [96]:
df.head()

Unnamed: 0,class,msg,clean,lemmas
0,0,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",go jurong point crazy available bugis n great world la e buffet cine got amore wat,go jurong point crazy available bugis n great world la e buffet cine got amore wat
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's,free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply,free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply
3,0,U dun say so early hor... U c already then say...,u dun say early hor u c already say,u dun say early hor u c already say
4,0,"Nah I don't think he goes to usf, he lives around here though",nah think goes usf lives around though,nah think go usf life around though


## Spell Correction

In [97]:
def spell_correct(input_text):
    tokens = input_text.split()
    correct = [spell(token) for token in tokens]
    return ' '.join(correct)

In [98]:
df['correct'] = df['lemmas'].apply(spell_correct)

In [None]:
df.head()

## Bag of Words Model

In [45]:
data = df['lemmas'].tolist() #change Lemmas with correct - very imp

In [51]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(max_features=1000)
x = vec.fit_transform(data).toarray()

In [52]:
y = df.iloc[:, 0]

In [53]:
y.head()

0    0
1    0
2    1
3    0
4    0
Name: class, dtype: int64

In [54]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y)

In [55]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(x_train, y_train)

GaussianNB(priors=None)

In [56]:
y_pred = classifier.predict(x_test)

In [58]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

accuracy = accuracy_score(y_test, y_pred)

In [59]:
print(accuracy)

0.7968413496051687


## TF-IDF

In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(sublinear_tf=True,
                        norm='l2',
                        encoding='latin-1',
                        ngram_range=(1,2),
                        stop_words='english')
tfidf_vec = tfidf.fit_transform(data).toarray()

In [64]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=.2)

In [65]:
classifier = GaussianNB()
classifier.fit(x_train, y_train)

GaussianNB(priors=None)

In [66]:
y_pred = classifier.predict(x_test)

In [67]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

accuracy = accuracy_score(y_test, y_pred)

In [68]:
print(accuracy)

0.7739910313901345


In [71]:
df = pd.DataFrame(list(zip(y,y_pred,df['msg'])),
                  columns = ['actual','pred', 'msg'])

In [72]:
df[(df['pred'] == df['actual']) & (df['pred'] == 1).head(20)]

Unnamed: 0,actual,pred,msg
5,1,1,"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv"
8,1,1,WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
19,1,1,"England v Macedonia - dont miss the goals/team news. Txt ur national team to 87077 eg ENGLAND to 87077 Try:WALES, SCOTLAND 4txt/Ì¼1.20 POBOXox36504W45WQ 16+"
