#Spam Mail Prediction using Machine Learning 


In [2]:
#Importing Libraries

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#Data Collection & Preprocessing

In [3]:
#Reading the mail dataset from its CSV file into a pandas dataframe
#The location is kept in one named constant so it is easy to spot and change
DATA_PATH = '/content/drive/MyDrive/ML Datasets/mail_data.csv'
raw_data = pd.read_csv(DATA_PATH)

In [4]:
#Previewing the dataframe exactly as it was loaded from the CSV file
raw_data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
#Handling missing data: every cell that holds a value is kept as is,
#and every missing (null) cell is replaced with an empty string
has_value = raw_data.notna()
data = raw_data.where(has_value, '')

In [9]:
#Showing the first 5 rows of the dataframe after the missing values were replaced
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
#Checking number of rows & columns ; shape is a (rows, columns) tuple
data.shape

(5572, 2)

#Label Encoding

In [11]:
#Encoding the Category column in place : "spam" becomes 0 , "ham" becomes 1
#Both row masks are built first, then each group of rows is overwritten with its code
is_spam = data['Category'] == 'spam'
is_ham = data['Category'] == 'ham'

data.loc[is_spam, 'Category'] = 0
data.loc[is_ham, 'Category'] = 1

#Splitting features & label

In [12]:
#x holds the message texts (features) , y holds the encoded category (label)
x, y = data['Message'], data['Category']

In [13]:
#Printing the message texts that are used as features
print (x)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [14]:
#Printing the encoded labels (0 = spam , 1 = ham)
print (y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


#Splitting data into train & test data

In [16]:
#Holding out 20% of the rows as test data ; the fixed random_state keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

In [17]:
#Printing the shape of every piece of the split, in the order x_train, x_test, y_train, y_test
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))

(4457,) (1115,) (4457,) (1115,)


#Feature Extraction

In [19]:
#Transforming text values into numbers (TF-IDF features)
#  min_df = 1            -> keep every term that occurs in at least one message
#  stop_words = 'english'-> drop common English stop words
#  lowercase = True      -> must be the boolean True ; the string 'True' only worked because
#                           it is truthy, and scikit-learn releases that validate constructor
#                           parameters reject a string here
f = TfidfVectorizer(min_df = 1, stop_words = 'english', lowercase = True)

#The vocabulary and IDF weights are learned from the training messages only ;
#the test messages are transformed with that already-fitted vectorizer
x_train_features = f.fit_transform(x_train)
x_test_features = f.transform(x_test)

#Converting y_train & y_test values into numbers (the labels were stored with object dtype)
y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [21]:
#Inspecting the feature matrix produced by fit_transform on the training messages
x_train_features

<4457x7431 sparse matrix of type '<class 'numpy.float64'>'
	with 34775 stored elements in Compressed Sparse Row format>

In [23]:
#Inspecting the feature matrix produced by transform on the test messages
x_test_features

<1115x7431 sparse matrix of type '<class 'numpy.float64'>'
	with 7687 stored elements in Compressed Sparse Row format>

#Model Training

Logistic Regression

In [24]:
#Creating a logistic regression classifier ; no arguments are passed, so library defaults apply

reg = LogisticRegression()

In [25]:
#Fitting the model on the training feature matrix and the integer training labels

reg.fit(x_train_features , y_train)

LogisticRegression()

#Model Evaluation

In [26]:
#Accuracy on the training data : predict the rows the model was fitted on,
#then compare those predictions with the true training labels
train_data_prediction = reg.predict(x_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=y_train,
    y_pred=train_data_prediction,
)
print(accuracy_on_training_data)

0.9670181736594121


In [27]:
#Accuracy on the test data : predict the held-out rows,
#then compare those predictions with the true test labels
test_data_prediction = reg.predict(x_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=y_test,
    y_pred=test_data_prediction,
)
print(accuracy_on_test_data)

0.9659192825112107


#Building Prediction System

In [32]:
#Sample message to classify
#It is named input_mail (not input) so that Python's built-in input() function is not shadowed
input_mail = ["URGENT! You have won a 1 week FREE membership in our £100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18"]

#Convert text into numbers with the vectorizer that was fitted on the training messages

input_data = f.transform(input_mail)

#Finding prediction value 

output = reg.predict(input_data)
print(output)

#The labels were encoded as 1 = ham (not spam) and 0 = spam
if output[0] == 1:
    print("Not a Spam mail")
else:
    print("Spam mail")

[0]
Spam mail
