In [1]:
#import all the libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the raw message dataset from the working directory and preview its first rows
raw_mail = pd.read_csv(
    "spam.csv",
    encoding="utf-8",
)
raw_mail.head(n=10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [3]:
# Swap every missing value for an empty string; non-null cells are kept as they are
mail_data = raw_mail.where(raw_mail.notna(), other='')

In [4]:
# Preview the cleaned frame to confirm the missing values are now blank
mail_data.head(n=10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [5]:
# (rows, columns) of the cleaned frame
mail_data.shape

(5572, 5)

In [6]:
# Label encoding: add a numeric 'Category' column derived from the text label in 'v1'
# (spam -> 0, ham -> 1). Rows whose 'v1' is neither value are left unset.
# NOTE(review): in the cells visible here the training labels are taken from 'v1',
# not from this column - confirm whether 'Category' is meant to be the target.
for label, code in {'spam': 0, 'ham': 1}.items():
    mail_data.loc[mail_data['v1'] == label, 'Category'] = code

In [7]:
# Check that the new 'Category' column sits alongside the original columns
mail_data.head(n=10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,Category
0,ham,"Go until jurong point, crazy.. Available only ...",,,,1.0
1,ham,Ok lar... Joking wif u oni...,,,,1.0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,0.0
3,ham,U dun say so early hor... U c already then say...,,,,1.0
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,,1.0
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,,0.0
6,ham,Even my brother is not like to speak with me. ...,,,,1.0
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,,1.0
8,spam,WINNER!! As a valued network customer you have...,,,,0.0
9,spam,Had your mobile 11 months or more? U R entitle...,,,,0.0


In [8]:
# Separate the message text (X, column 'v2') from the label (Y, column 'v1')
X, Y = mail_data['v2'], mail_data['v1']

In [9]:
# Peek at the first few message texts
X.head(n=10)

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: v2, dtype: object

In [10]:
# Peek at the first few labels
Y.head(n=10)

0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: v1, dtype: object

In [11]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_Train, X_test, Y_Train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [12]:
# Report the size of each piece of the train/test split
for split_name, split_part in (
    ("X_train", X_Train),
    ("Y_train", Y_Train),
    ("X_test", X_test),
    ("Y_test", Y_test),
):
    print(split_name + " original shape", split_part.shape)

X_train original shape (4457,)
Y_train original shape (4457,)
X_test original shape (1115,)
Y_test original shape (1115,)


In [13]:
# Feature extraction
# Create the TF-IDF vectorizer (default settings) that turns message text into
# numeric feature vectors usable as input to the logistic regression.
feature_extraction = TfidfVectorizer()

In [14]:
# Fit the vectorizer on the training texts only, then vectorize them.
X_train_feature = feature_extraction.fit_transform(X_Train)
# Reuse the already-fitted vectorizer for the test texts (transform only, no refit).
X_test_feature = feature_extraction.transform(X_test)

In [15]:
# Dump the vectorized training data; the output lists (row, feature index) pairs with their weight
print(X_train_feature)

  (0, 741)	0.28307455118083463
  (0, 3360)	0.1327523238442287
  (0, 4108)	0.21196015023008544
  (0, 4908)	0.12973225055917514
  (0, 3042)	0.26300555749396887
  (0, 946)	0.1151770452043338
  (0, 7464)	0.19261204588580316
  (0, 4431)	0.3421657916670175
  (0, 6805)	0.17846848640014898
  (0, 6873)	0.15216101010779184
  (0, 3497)	0.28307455118083463
  (0, 2178)	0.33952544349598
  (0, 3235)	0.3869898904365042
  (0, 3365)	0.22753426247664568
  (0, 1032)	0.1361275560580212
  (0, 7720)	0.1765620792792692
  (0, 3491)	0.19174855251416806
  (0, 4655)	0.2558426236041184
  (1, 4190)	0.3725861907992424
  (1, 7099)	0.42172200036894236
  (1, 6620)	0.46707907862382136
  (1, 3646)	0.2020333473623602
  (1, 6645)	0.553594666958471
  (1, 7704)	0.3433404875792393
  (2, 954)	0.4257390912308466
  :	:
  (4455, 7402)	0.15130978849620824
  (4455, 6316)	0.16857953235415776
  (4455, 6850)	0.14840315498751144
  (4455, 1592)	0.11611242093594701
  (4455, 6991)	0.1743411711262433
  (4455, 4647)	0.1711510506728716
  (44

In [16]:
# Logistic regression classifier, created with its default settings
model = LogisticRegression()

In [17]:
# Train the classifier on the vectorized training texts and their labels
model.fit(X_train_feature,Y_Train)

In [18]:
# Evaluate the trained model
# Prediction on the training data, scored against the true training labels
prediction_on_Training_Data = model.predict(X_train_feature)
accuracy_on_training_data = accuracy_score(
    y_true=Y_Train,
    y_pred=prediction_on_Training_Data,
)

In [19]:
# Show the training accuracy as a percentage
print("Accuracy for Training : ",accuracy_on_training_data * 100)

Accuracy for Training :  97.39735247924612


In [20]:
# Predict on the held-out test data and score against the true test labels
prediction_on_Test_Data = model.predict(X_test_feature)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_Test_Data,
)

In [21]:
# Show the test accuracy as a percentage.
# The label previously read "Training" although the value printed is the test-set accuracy.
print("Accuracy for Test : ",accuracy_on_test_data * 100)

Accuracy for Training :  97.57847533632287


In [22]:
#  Building a Predictive System
input_mail = ["As a valued customer, I am pleased to advise you that following recent review of your Mob No. you are awarded with a £1500 Bonus Prize, call 09066364589"]

# Convert the text to feature vectors with the vectorizer fitted on the training data
input_data_feature = feature_extraction.transform(input_mail)

# Making Prediction
prediction = model.predict(input_data_feature)

print(prediction)

# The model is trained on the string labels from column 'v1', so predict() returns
# 'ham' or 'spam'. The earlier check against [1] could never match, which made every
# message (ham included) get reported as spam. Compare with the string label instead.
if prediction[0] == 'ham':
    print("This is the Ham Mail.")
else:
    print("This is the Spam Mail.")

['spam']
This is the Spam Mail.
