# import the libraries 

In [69]:
import numpy as np 
import pandas as pd 
import nltk
from nltk.corpus import stopwords
import string

In [70]:
#read the csv file
df = pd.read_csv('spam.csv',encoding = "ISO-8859-1")
df.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [71]:
df.shape

(5572, 5)

In [72]:
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [73]:
#remove duplicated
df.drop_duplicates(inplace = True)

In [74]:
#see number of raw after remove
df.shape

(5169, 5)

# Data pre Processing

In [75]:
df.isnull().sum()

v1               0
v2               0
Unnamed: 2    5126
Unnamed: 3    5159
Unnamed: 4    5164
dtype: int64

In [76]:
df.drop(columns=["Unnamed: 2","Unnamed: 3","Unnamed: 4"],inplace=True)
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [77]:
df.isnull().sum()

v1    0
v2    0
dtype: int64

In [78]:
#feature encoding
df['v1']=df['v1'].map({'spam':1,'ham':0})
df

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [None]:
def process_text(text):
    
    #1 Remove Punctuationa
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    
    #2 Remove Stop Words
    clean_words = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    
    #3 Return a list of clean words
    return clean_words

In [79]:
df['v2'].head().apply(process_text)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: v2, dtype: object

# select target variable and split for train and test

In [82]:
from sklearn.feature_extraction.text import CountVectorizer
x = CountVectorizer(analyzer=process_text).fit_transform(df['v2'])
x

<5169x11304 sparse matrix of type '<class 'numpy.int64'>'
	with 45872 stored elements in Compressed Sparse Row format>

In [83]:
y=df['v1']
y

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: v1, Length: 5169, dtype: int64

In [84]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [85]:
x.shape

(5169, 11304)

In [86]:
#Naive bayes classifier
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(x_train, y_train)

In [89]:
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
#evalute on the training
pred = classifier.predict(x_train)
print(classification_report(y_train ,pred ))
print('Confusion Matrix: \n',confusion_matrix(y_train,pred))
print()
print('Accuracy: ', accuracy_score(y_train,pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3627
           1       0.98      0.98      0.98       508

    accuracy                           1.00      4135
   macro avg       0.99      0.99      0.99      4135
weighted avg       1.00      1.00      1.00      4135

Confusion Matrix: 
 [[3619    8]
 [   9  499]]

Accuracy:  0.9958887545344619


In [91]:
#evalute on testing
pred = classifier.predict(x_test)
print(classification_report(y_test ,pred ))
print('Confusion Matrix: \n', confusion_matrix(y_test,pred))
print()
print('Accuracy: ', accuracy_score(y_test,pred))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       889
           1       0.83      0.94      0.88       145

    accuracy                           0.97      1034
   macro avg       0.91      0.96      0.93      1034
weighted avg       0.97      0.97      0.97      1034

Confusion Matrix: 
 [[861  28]
 [  8 137]]

Accuracy:  0.965183752417795


In [92]:
#Logistic regression classifier
from sklearn.linear_model import LogisticRegression
LR_classifier = LogisticRegression(random_state=42)
LR_classifier.fit(x_train, y_train)

In [94]:
#evalute on the training
pred = LR_classifier.predict(x_train)
print(classification_report(y_train ,pred ))
print('Confusion Matrix: \n',confusion_matrix(y_train,pred))
print()
print('Accuracy: ', accuracy_score(y_train,pred))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      3627
           1       1.00      0.96      0.98       508

    accuracy                           1.00      4135
   macro avg       1.00      0.98      0.99      4135
weighted avg       1.00      1.00      1.00      4135

Confusion Matrix: 
 [[3627    0]
 [  20  488]]

Accuracy:  0.9951632406287787


In [95]:
#evalute on testing
pred = LR_classifier.predict(x_test)
print(classification_report(y_test ,pred ))
print('Confusion Matrix: \n', confusion_matrix(y_test,pred))
print()
print('Accuracy: ', accuracy_score(y_test,pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98       889
           1       0.98      0.81      0.89       145

    accuracy                           0.97      1034
   macro avg       0.97      0.91      0.94      1034
weighted avg       0.97      0.97      0.97      1034

Confusion Matrix: 
 [[886   3]
 [ 27 118]]

Accuracy:  0.9709864603481625


In [98]:
#support vector machine classifier
from sklearn import svm
svm_classifier = svm.SVC()
svm_classifier.fit(x_train, y_train)

In [99]:
#evalute on the training
pred = classifier.predict(x_train)
print(classification_report(y_train ,pred ))
print('Confusion Matrix: \n',confusion_matrix(y_train,pred))
print()
print('Accuracy: ', accuracy_score(y_train,pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3627
           1       0.98      0.98      0.98       508

    accuracy                           1.00      4135
   macro avg       0.99      0.99      0.99      4135
weighted avg       1.00      1.00      1.00      4135

Confusion Matrix: 
 [[3619    8]
 [   9  499]]

Accuracy:  0.9958887545344619


In [100]:
#evalute on testing
pred = classifier.predict(x_test)
print(classification_report(y_test ,pred ))
print('Confusion Matrix: \n', confusion_matrix(y_test,pred))
print()
print('Accuracy: ', accuracy_score(y_test,pred))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       889
           1       0.83      0.94      0.88       145

    accuracy                           0.97      1034
   macro avg       0.91      0.96      0.93      1034
weighted avg       0.97      0.97      0.97      1034

Confusion Matrix: 
 [[861  28]
 [  8 137]]

Accuracy:  0.965183752417795
