In [85]:
#importing necessary libraries
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [86]:
# Data Path
data_file='./spam.csv'

In [87]:
#To find encoding pattern of the file
import chardet
with open(data_file, 'rb') as rawdata:
    result = chardet.detect(rawdata.read(100000))
result

{'encoding': 'Windows-1252', 'confidence': 0.7272080023536335, 'language': ''}

In [88]:
# Importing File
data_frame=pd.read_csv(data_file,encoding='Windows-1252')

In [89]:
# Understanding the dimensions of data
data_frame.shape

(5572, 5)

In [90]:
# Understanding the Data Variables
data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [91]:
# List the columns of dataset
data_frame.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [92]:
# Show the top 10 Rows of data
data_frame.head(10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


##### * Column v2 contains texts of emails and v1 contains whether those emails are spam or not

In [93]:
# Checking for any Duplicated Entries
data_frame.duplicated().sum()

np.int64(403)

In [94]:
# Dropping the duplicate values
data_frame.drop_duplicates(keep = 'first', inplace = True)
data_frame.shape

(5169, 5)

In [95]:
#Detecting missing values
data_frame.isna().sum()

v1               0
v2               0
Unnamed: 2    5126
Unnamed: 3    5159
Unnamed: 4    5164
dtype: int64

In [96]:
#Dropping unwanted Columns from data
data_frame.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'], inplace=True )
data_frame.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [97]:
# Finding the number of spam and non-spam sms in dataset
data_frame['v1'].value_counts()

v1
ham     4516
spam     653
Name: count, dtype: int64

In [98]:
# Printing examples of ham messages
print("Some examples of ham messages:")
print(data_frame[data_frame['v1'] == 'ham']['v2'].head())

Some examples of ham messages:
0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
6    Even my brother is not like to speak with me. ...
Name: v2, dtype: object


In [99]:
# Printing examples of spam messages
print(" Some examples of spam messages:")
print(data_frame[data_frame['v1'] == 'spam']['v2'].head())

 Some examples of spam messages:
2     Free entry in 2 a wkly comp to win FA Cup fina...
5     FreeMsg Hey there darling it's been 3 week's n...
8     WINNER!! As a valued network customer you have...
9     Had your mobile 11 months or more? U R entitle...
11    SIX chances to win CASH! From 100 to 20,000 po...
Name: v2, dtype: object


In [100]:
# Labelling Spam sms as 1 and Non-Spam sms as 0
data_frame['v1'] = np.where(data_frame['v1'] == 'spam',1,0)
data_frame

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [101]:
from sklearn.model_selection import train_test_split

In [102]:
#First step in building the model is to identify the Feature(Input) variables and Target (Output) variable
#Features are the sms content
#Target represents the labels for spam and non-spam sms
features = data_frame['v2']
target = data_frame['v1']

In [103]:
# Splitting data for training the model and testing the model
# train size taken as 0.8
X_train, X_test, y_train, y_test = train_test_split(features, target, train_size = .8)
# Dimensions of Train and Test Data sets
print('Train set of features: ', X_train.shape)
print('Test set of features: ', X_test.shape)
print('Target for train: ', y_train.shape)
print('Target for test: ', y_test.shape)

Train set of features:  (4135,)
Test set of features:  (1034,)
Target for train:  (4135,)
Target for test:  (1034,)


In [104]:
from sklearn.feature_extraction.text import CountVectorizer

In [105]:
#CountVectorizer transform text into a vector on the basis of the frequency/count of each word that occurs in entire text.
#Covert each word in Train dataset into vectors for using them in further text analysis.
cv=CountVectorizer()
X_train_vector=cv.fit_transform(X_train.values)
X_train_vector.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [106]:
#Coverting each word in Test dataset into vectors for using them in further text analysis.
X_test_vector=cv.transform(X_test.values)
X_test_vector.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [107]:
# Here Multinomial Naive Bayes model which is a supervised learning classification
#Used for the analysis of the categorical text data.
from sklearn.naive_bayes import MultinomialNB
my_model_NB=MultinomialNB()
# Fitting the model in train data set ie the MultinomialNB Model should learn from the Train Data
my_model_NB.fit(X_train_vector, y_train)

In [108]:
# Predicting whether the mails in Test Dataset are spam or non-spam using our model from Feature Test values
y_pred_NB = my_model_NB.predict(X_test_vector)
y_pred_NB

array([1, 0, 0, ..., 0, 0, 0])

In [109]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [110]:
# find Accurancy Score = (TP+TN)/(TP+TN+FP+FN)
accuracy_score(y_test, y_pred_NB)

0.9806576402321083

In [111]:
# find Precision Score= TP/(TP+FP)
precision_score(y_test, y_pred_NB)

np.float64(0.9621212121212122)

In [112]:
# Recall = TP/(TP+FN)
# Recall gives the percentage of positives well predicted by our model.
recall_score(y_test, y_pred_NB)

np.float64(0.8943661971830986)

In [113]:
# f1_score= (Precition * Recall )/ (Precition + Recall)
f1_score(y_test, y_pred_NB)

np.float64(0.927007299270073)

In [114]:
from sklearn.svm import SVC
my_model_SVC= SVC()
# Fitting the model in train data set ie SVC Model should learn from the on Train Data
my_model_SVC.fit(X_train_vector, y_train)

In [115]:
# Predicting whether the sms in Test Dataset are spam or non-spam using our model from Feature Test values
y_pred_SVC = my_model_SVC.predict(X_test_vector)
y_pred_SVC

array([1, 0, 0, ..., 0, 1, 0])

In [116]:
# find Accurancy Score = (TP+TN)/(TP+TN+FP+FN)
accuracy_score(y_test, y_pred_SVC)

0.971953578336557

In [117]:
# find Precision Score= TP/(TP+FP)
precision_score(y_test, y_pred_SVC)

np.float64(0.991304347826087)

In [118]:
# Recall = TP/(TP+FN)
# Recall gives the percentage of positives well predicted by our model.
recall_score(y_test, y_pred_SVC)

np.float64(0.8028169014084507)

In [119]:
# f1_score= (Precition * Recall )/ (Precition + Recall)
f1_score(y_test, y_pred_SVC)

np.float64(0.8871595330739299)

In [120]:
# Learn the Logistic Regression Model on train data
from sklearn.linear_model import LogisticRegression
my_model_LR=LogisticRegression()
# Fitting the model in train data set ie the LogisticRegression should learn from the Train Data
my_model_LR.fit(X_train_vector, y_train)

In [121]:
#predicting using LogisticRegression
y_pred_LR = my_model_LR.predict(X_test_vector)
y_pred_LR

array([1, 0, 0, ..., 0, 1, 0])

In [122]:
# find Accurancy Score = (TP+TN)/(TP+TN+FP+FN)
accuracy_score(y_test, y_pred_LR)

0.9671179883945842

In [123]:
# find Precision Score= TP/(TP+FP)
precision_score(y_test, y_pred_LR)

np.float64(0.9576271186440678)

In [124]:
# Recall = TP/(TP+FN)
# Recall gives the percentage of positives well predicted by our model.
recall_score(y_test, y_pred_LR)

np.float64(0.795774647887324)

In [125]:
# f1_score= (Precition * Recall )/ (Precition + Recall)
f1_score(y_test, y_pred_LR)

np.float64(0.8692307692307693)

In [126]:
from joblib import dump, load
dump(my_model_SVC, 'spam_svc.joblib')
dump(my_model_NB, 'spam_nb.joblib')
dump(my_model_LR, 'spam_lr.joblib')

['spam_lr.joblib']