In [7]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

porter = PorterStemmer()

In [16]:
df = pd.read_csv('spam.csv', encoding='ISO-8859-1')

In [17]:
df = df[['v1','v2']]
df = df.rename(columns={'v1':'class', 'v2':'sms'})
df.head()

Unnamed: 0,class,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [18]:
#check for null values
df.isnull().sum()

class    0
sms      0
dtype: int64

## Preprocessing Data

In [19]:
def cleanText(text):
    text = text.lower()
    text = re.sub('[^\w\s]','',text)
    txt=[]
    for w in text.split():
        stemWord = porter.stem(w)
        txt.append(stemWord)
    txt = ' '.join(txt)
    return txt

In [20]:
#clean text
df['sms'] = df['sms'].apply(cleanText)
df.head()

Unnamed: 0,class,sms
0,ham,go until jurong point crazi avail onli in bugi...
1,ham,ok lar joke wif u oni
2,spam,free entri in 2 a wkli comp to win fa cup fina...
3,ham,u dun say so earli hor u c alreadi then say
4,ham,nah i dont think he goe to usf he live around ...


In [21]:
df

Unnamed: 0,class,sms
0,ham,go until jurong point crazi avail onli in bugi...
1,ham,ok lar joke wif u oni
2,spam,free entri in 2 a wkli comp to win fa cup fina...
3,ham,u dun say so earli hor u c alreadi then say
4,ham,nah i dont think he goe to usf he live around ...
...,...,...
5567,spam,thi is the 2nd time we have tri 2 contact u u ...
5568,ham,will ì_ b go to esplanad fr home
5569,ham,piti wa in mood for that soani other suggest
5570,ham,the guy did some bitch but i act like id be in...


## Features Extraction 

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

In [22]:
# Features Extraction by Binary Encoding
ve = CountVectorizer(binary=True)
featuresByBinary =ve.fit_transform(df['sms'])
pd.DataFrame(featuresByBinary.toarray(),columns=ve.vocabulary_.keys())

Unnamed: 0,go,until,jurong,point,crazi,avail,onli,in,bugi,great,...,dental,nmde,dump,heap,salesman,å750,087187272008,now1,piti,soani
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Features Extraction by Counting
vec = CountVectorizer()
featuresByCounting = vec.fit_transform(df['sms'])
pd.DataFrame(featuresByCounting.toarray(),columns=vec.vocabulary_.keys())

Unnamed: 0,go,until,jurong,point,crazi,avail,onli,in,bugi,great,...,dental,nmde,dump,heap,salesman,å750,087187272008,now1,piti,soani
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Features Extraction by TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
tf_vec = TfidfVectorizer()
featuresByTfIDF= tf_vec.fit_transform(df['sms'])
pd.DataFrame(featuresByTfIDF.toarray(),columns=tf_vec.vocabulary_.keys())

Unnamed: 0,go,until,jurong,point,crazi,avail,onli,in,bugi,great,...,dental,nmde,dump,heap,salesman,å750,087187272008,now1,piti,soani
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.366692,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
#splitting data to training data and testing data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(featuresByCounting, df['class'], test_size=0.2)

In [26]:
print(x_train.shape)
print(x_test.shape)

(4457, 8156)
(1115, 8156)


In [27]:
#Training data
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(x_train, y_train)

MultinomialNB()

In [28]:
#Testing data
import numpy as np
predictedOutput = model.predict(x_test)
#  print(np.mean(predictedOutput == y_test))
# print(predictedOutput)


In [29]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictedOutput)

0.979372197309417

In [30]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,predictedOutput))

[[960   9]
 [ 14 132]]


##### Creating a function to convert the text in lowercase, remove the extra space, special chr., ulr and links.

In [44]:
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    return text

In [45]:
def output_label(n):
    if n == 0:
        return "Spam"
    elif n == 1:
        return "Not Spam"

In [46]:
def manual_testing(news):
    testing_news = {"text":[news]}
    new_def_test = pd.DataFrame(testing_news)
    new_def_test["text"] = new_def_test["text"].apply(wordopt) 
    new_x_test = new_def_test["text"]
    new_xv_test = vectorization.transform(new_x_test)
    pred_LR = LR.predict(new_xv_test)
    pred_DT = DT.predict(new_xv_test)
    pred_GBC = GBC.predict(new_xv_test)
    pred_RFC = RFC.predict(new_xv_test)

    return print("\n\nLR Prediction: {} \nDT Prediction: {} \nGBC Prediction: {} \nRFC Prediction: {}".format(output_lable(pred_LR[0]), 
                                                                                                              output_lable(pred_DT[0]), 
                                                                                                              output_lable(pred_GBC[0]), 
                                                                                                              output_lable(pred_RFC[0])))

In [47]:
manual_testing("it's free to you")

NameError: name 'vectorization' is not defined