# **SPAM Classification**

Checking the spam classification of Email whether mail is spam or ham using machine learning techniques.

using dataset from uci repository: https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip

In [7]:
# Importing required libraries for spam classification
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split  


In [10]:
# reading dataset

message = pd.read_csv('SMSSpamCollection', sep='\t', names=['label','message'], encoding='latin1')

In [11]:
message

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ã¼ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [12]:
# getting the info of the dataset
message.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [18]:
# checking and fitlering out with missing values
data = message.where((pd.notnull(message)), '')

In [17]:
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [20]:
# counting the values spam and ham categories
data['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [21]:
# checking the shape
data.shape

(5572, 2)

In [22]:
# changing the replacing spam and ham with onehot encoding
data['label'] = data['label'].map({"spam":0,"ham":1})

In [24]:
# now lets see data once again
data

Unnamed: 0,label,message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will Ã¼ b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


In [26]:
# taking the variables x and y (dependent and independent) with message and label

X = data['message']
y = data['label']

In [27]:
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: label, Length: 5572, dtype: int64

In [28]:
# spliting the dataset Train and Test

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state=3)

In [32]:
# checking the shape of each Train Test Split dataset

print(X_train.shape,
      X_test.shape, 
      y_train.shape,
      y_test.shape)

(4457,) (1115,) (4457,) (1115,)


In [33]:
# tranformaing categorical to numerical dataset

categorical_to_numerical = TfidfVectorizer(min_df=1,stop_words='english', lowercase=True)

In [35]:
# new Xtrain dataset and check
new_Xtrain = categorical_to_numerical.fit_transform(X_train)
new_Xtrain

<4457x7470 sparse matrix of type '<class 'numpy.float64'>'
	with 34950 stored elements in Compressed Sparse Row format>

In [37]:
# new Xtest dataset and check
new_Xtest = categorical_to_numerical.transform(X_test)
new_Xtest

<1115x7470 sparse matrix of type '<class 'numpy.float64'>'
	with 7713 stored elements in Compressed Sparse Row format>

In [38]:
# Preparing the model

lr = LogisticRegression()

lr.fit(new_Xtrain,y_train)

In [42]:
# score of the model
lr.score(new_Xtrain,y_train)

0.9679156383217411

In [43]:
# prediction of the model based of trainig 
y_pred = lr.predict(new_Xtest)
y_pred

array([0, 1, 1, ..., 1, 1, 1])

In [44]:
# checking the shape to new_Xtest dataset
new_Xtest.shape

(1115, 7470)

In [47]:
# predicting the value of specific sets no. data
lr.predict(new_Xtest[1110])

array([1])

In [49]:
# Model Evaluation
# using confusion matric True (+ve, -ve), False (+ve, -ve)

from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_test, y_pred))

[[116  39]
 [  0 960]]


In [60]:
result = {}

In [61]:
result['LogisticRegression'] = lr.score(new_Xtest, y_test)

In [62]:
result['LogisticRegression']

0.9650224215246637

In [67]:
# Preparing all the Classification Alogrithms

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# assigning the vairables

lr = LogisticRegression()
rfc = RandomForestClassifier(n_estimators=50, max_depth=5,random_state=1)
dtc = DecisionTreeClassifier()
# nb = GaussianNB()
# knbc = KNeighborsClassifier()
nn = MLPClassifier(hidden_layer_sizes=(18,), random_state=1, max_iter= 1000)

# fit the models

lr.fit(new_Xtrain, y_train)
rfc.fit(new_Xtrain, y_train)
dtc.fit(new_Xtrain, y_train)
# nb.fit(new_Xtrain, y_train)
# knbc.fit(new_Xtrian, y_train)
nn.fit(new_Xtrain, y_train)

# results preparation

result['LogisticRegression'] = lr.score(new_Xtest,y_test)
result['RandomForestClassifier'] = lr.score(new_Xtest, y_test)
result['DecisionTreeClassifier'] = dtc.score(new_Xtest, y_test)
# result['GaussianNB'] = nb.score(new_Xtest, y_test)
# result['KNeighborsClassifier'] = knbc.score(new_Xtest, y_test)
result['MLPClassifier'] = nn.score(new_Xtest,y_test)

In [69]:
# Tabluating the above result 
# import library tabulate
from tabulate import tabulate

# result in all on dictionary
result = dict(sorted(result.items(), key=lambda x:x[1],reverse=True))
# Print out all the results
print(tabulate(result.items(), headers=['NAME','VALUE'], tablefmt='grid'))

+------------------------+----------+
| NAME                   |    VALUE |
| MLPClassifier          | 0.98565  |
+------------------------+----------+
| LogisticRegression     | 0.965022 |
+------------------------+----------+
| RandomForestClassifier | 0.965022 |
+------------------------+----------+
| DecisionTreeClassifier | 0.964126 |
+------------------------+----------+


From the above result it's seen that neural network has better accuracy 98% in terms of classification than the mother having 96% 