## Import Libraries

In [1]:
import glob
import io
import os
import random
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from wordcloud import WordCloud


In [2]:
# Location of the spam dataset CSV. Set the SPAM_CSV_PATH environment variable
# to point at the file on another machine; when it is unset, the original
# hard-coded path is used so existing behaviour is unchanged.
DATA_PATH = os.environ.get('SPAM_CSV_PATH', '/home/aniruddha/Downloads/spam.csv')

# The file is read with the ISO-8859-1 codec rather than pandas' default.
df = pd.read_csv(DATA_PATH, encoding='ISO-8859-1')

In [3]:
# Peek at the first rows of the raw file to see which columns it contains.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Keep only the label column (v1) and the message text column (v2);
# the remaining "Unnamed" columns are dropped.
col = ['v1', 'v2']
df = df[col]
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Rename the columns positionally: v1 -> class (label), v2 -> msg (message text).
df.columns = ['class','msg']
df.head(10)

Unnamed: 0,class,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [6]:
# Show any rows whose message text is missing (an empty result means none).
df[df['msg'].isnull()]

Unnamed: 0,class,msg


In [7]:
#df = df[pd.notnull(df['msg'])]

In [9]:
#df['class'].factorize()[0]

In [10]:
# Replace the text labels with integer codes; the codes are element [0] of
# the (codes, uniques) pair returned by factorize.
df['class'] = pd.factorize(df['class'])[0]
df.head()

Unnamed: 0,class,msg
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


## Keep Alphabetic Characters Only

In [11]:
import re

def keep_alphabete(input_text):
    """Replace every character that is not an ASCII letter with a space.

    The substitution is one-for-one, so digits, punctuation and existing
    whitespace each become a single space and the length of the text is
    unchanged.
    """
    return re.sub('[^A-Za-z]', ' ', input_text)

# Build the 'clean' column from the raw message text, letters only.
df['clean'] = df['msg'].map(keep_alphabete)

In [12]:
# Compare the original 'msg' text with the new letters-only 'clean' column.
df.head()

Unnamed: 0,class,msg,clean
0,0,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only ...
1,0,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say
4,0,"Nah I don't think he goes to usf, he lives aro...",Nah I don t think he goes to usf he lives aro...


## Convert Case 

In [13]:
def convert_case(input_text):
    """Return ``input_text`` converted to lower case."""
    lowered = input_text.lower()
    return lowered
# Lower-case the cleaned text in place.
df['clean'] = df['clean'].map(convert_case)

In [14]:
# Confirm that the 'clean' column is now lower-case.
df.head()

Unnamed: 0,class,msg,clean
0,0,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only ...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup fina...
3,0,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah i don t think he goes to usf he lives aro...


## Remove Stopwords

In [15]:
# De-duplicated English stopwords from the NLTK corpus, stored as a list.
stopwords_list = [*set(stopwords.words('english'))]

In [16]:
def remove_stopwords(input_text):
    """Drop every whitespace-separated token that appears in ``stopwords_list``.

    The surviving tokens are returned joined by single spaces, so any original
    runs of whitespace are collapsed.
    """
    kept = [token for token in input_text.split() if token not in stopwords_list]
    return ' '.join(kept)


# Strip stopwords from the lower-cased text.
df['clean'] = df['clean'].map(remove_stopwords)

In [17]:
# Inspect the 'clean' column after stopword removal.
df.head()

Unnamed: 0,class,msg,clean
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
3,0,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though


## Remove Small Words

In [18]:
def remove_small_words(input_string, min_length=3):
    """Drop whitespace-separated tokens shorter than ``min_length`` characters.

    Parameters
    ----------
    input_string : str
        Text to filter; it is split on any whitespace.
    min_length : int, optional
        Smallest token length that is kept. Defaults to 3, which matches the
        previously hard-coded threshold, so existing one-argument calls
        behave exactly as before.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (an empty string when
        no token is long enough).
    """
    return ' '.join(
        word for word in input_string.split() if len(word) >= min_length
    )

# Discard tokens shorter than three characters from the cleaned text.
df['clean'] = df['clean'].map(remove_small_words)

In [19]:
# Inspect the 'clean' column after dropping short tokens.
df.head()

Unnamed: 0,class,msg,clean
0,0,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis great world...
1,0,Ok lar... Joking wif u oni...,lar joking wif oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win cup final tkts may te...
3,0,U dun say so early hor... U c already then say...,dun say early hor already say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though


## Stemming

In [20]:
# Single shared Porter stemmer instance, used by stemming() for every message.
stemmer = PorterStemmer()

def stemming(input_text):
    """Stem each whitespace-separated token with the module-level ``stemmer``.

    Returns the stemmed tokens joined by single spaces.
    """
    return ' '.join(stemmer.stem(word) for word in input_text.split())

# Store the stemmed form of the cleaned text in its own column.
df['stem'] = df['clean'].map(stemming)

## Lemmatization 

In [21]:
# Single shared WordNet lemmatizer instance, used by lemmatize() for every message.
lemmatizer = WordNetLemmatizer()

def lemmatize(input_text):
    """Lemmatize each whitespace-separated token with the module-level ``lemmatizer``.

    Every token is passed to ``lemmatizer.lemmatize`` without a part-of-speech
    argument; the results are returned joined by single spaces.
    """
    lemmas = []
    for word in input_text.split():
        lemmas.append(lemmatizer.lemmatize(word))
    return ' '.join(lemmas)

# Store the lemmatized form of the cleaned text in its own column.
df['lemmaa'] = df['clean'].map(lemmatize)

In [22]:
# Compare the 'clean', 'stem' and 'lemmaa' columns side by side.
df.head()

Unnamed: 0,class,msg,clean,stem,lemmaa
0,0,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis great world...,jurong point crazi avail bugi great world buff...,jurong point crazy available bugis great world...
1,0,Ok lar... Joking wif u oni...,lar joking wif oni,lar joke wif oni,lar joking wif oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win cup final tkts may te...,free entri wkli comp win cup final tkt may tex...,free entry wkly comp win cup final tkts may te...
3,0,U dun say so early hor... U c already then say...,dun say early hor already say,dun say earli hor alreadi say,dun say early hor already say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though,nah think goe usf live around though,nah think go usf life around though


## CountVectorizer 

In [23]:
# One stemmed document string per message, as a plain Python list.
corpus = list(df['stem'])

In [24]:
# Alias for the stemmed corpus (same list object, not a copy); this is what gets vectorized.
data = corpus

In [25]:
# CountVectorizer: bag-of-words counts, with the vocabulary capped via max_features=3000.
# NOTE(review): the vocabulary is fitted on the full corpus before the
# train/test split further down, so test-set words influence the features.
# Consider fitting on the training portion only.

vec = CountVectorizer(max_features=3000)
# .toarray() converts the result to a dense array (one row per document).
x = vec.fit_transform(data).toarray()   #tdm (every row is document)

In [26]:
# vec.get_feature_names()
# Target vector: the first column of df, which holds the factorized 'class' labels.
y = df.iloc[:,0]

In [27]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation. A fixed random_state makes the
# shuffle, and therefore the reported accuracy, reproducible across kernel
# restarts; without it every run yields a different split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

In [28]:
from sklearn.linear_model import LogisticRegression

In [29]:
# Logistic regression with the library's default hyper-parameters, fitted on
# the training split. The trailing fit() expression is what renders the
# estimator repr as this cell's output.
classifier = LogisticRegression()
classifier.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [30]:
# Predicted class codes for the held-out test rows.
y_pred = classifier.predict(x_test)

In [31]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, accuracy_score

# Evaluate the predictions against the true test labels.
# NOTE(review): cm and cr are computed here but not displayed in the cells
# visible in this notebook; only accuracy is printed.
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

accuracy = accuracy_score(y_test,y_pred)

In [32]:
# Report the accuracy score computed on the test split.
print('Accuracy:',accuracy)

Accuracy: 0.9730861244019139
