<a href="https://colab.research.google.com/github/ajinkya3112/Machine-Learning-Projects/blob/main/Spam_Mail_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing the Dependencies

In [None]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
import nltk
# Download the NLTK 'stopwords' corpus that stopwords.words('english') reads from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Show the English stop-word list that the stemming step filters out.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Data Pre-Processing

In [None]:
# Read the mail CSV into a pandas DataFrame.
# NOTE(review): absolute path under /content — presumably the Colab upload
# location; confirm before running in another environment.
MAIL_DATA_PATH = '/content/mail_data.csv'
mail_dataset = pd.read_csv(MAIL_DATA_PATH)

In [None]:
# Preview the first rows to check the Category / Message columns loaded as expected.
mail_dataset.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# (rows, columns) of the loaded dataset.
mail_dataset.shape

(5572, 2)

In [None]:
# Summary of both text columns: count, number of unique values, most frequent value and its frequency.
mail_dataset.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [None]:
# Count missing values per column.
mail_dataset.isnull().sum()

Unnamed: 0,0
Category,0
Message,0


In [None]:
# Column dtypes, non-null counts and memory usage.
mail_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [None]:
# Separating the data (message text) and the label (Category).
# `columns=` already names the axis, so the extra `axis=1` argument is dropped.
X = mail_dataset.drop(columns='Category')
Y = mail_dataset['Category']

In [None]:
# Inspect the feature frame (raw message text only).
print(X)

                                                Message
0     Go until jurong point, crazy.. Available only ...
1                         Ok lar... Joking wif u oni...
2     Free entry in 2 a wkly comp to win FA Cup fina...
3     U dun say so early hor... U c already then say...
4     Nah I don't think he goes to usf, he lives aro...
...                                                 ...
5567  This is the 2nd time we have tried 2 contact u...
5568               Will ü b going to esplanade fr home?
5569  Pity, * was in mood for that. So...any other s...
5570  The guy did some bitching but I acted like i'd...
5571                         Rofl. Its true to its name

[5572 rows x 1 columns]


In [None]:
# Inspect the label series (ham / spam).
print(Y)

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: Category, Length: 5572, dtype: object


Stemming:

Stemming is the process of reducing a word to its root form (its stem).

Example:

acting, acted, acts --> act

In [None]:
import re

In [None]:
# Single PorterStemmer instance reused by stemming() for every message.
port_stem = PorterStemmer()

In [None]:
# Build the stop-word lookup once. Previously stopwords.words('english') was
# evaluated for every token, and membership in the returned list is a linear scan.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

def stemming(Message):
  """Normalise one message into a space-joined string of stemmed tokens.

  Steps: replace every non-letter character with a space, lowercase,
  split on whitespace, drop English stop words, stem each remaining
  token with `port_stem`, and join the stems with single spaces.

  Parameters
  ----------
  Message : str
      Raw message text.

  Returns
  -------
  str
      The cleaned, stemmed message (empty string if no tokens remain).
  """
  # Keep letters only; digits and punctuation become token separators.
  tokens = re.sub('[^a-zA-Z]', ' ', Message).lower().split()
  stems = [port_stem.stem(word) for word in tokens if word not in _ENGLISH_STOPWORDS]
  return ' '.join(stems)

In [None]:
# Replace each raw message with its stemmed form.
# NOTE(review): this overwrites the original column in place, so re-running the
# cell feeds already-stemmed text back through stemming(); reload the CSV first.
mail_dataset['Message'] = mail_dataset['Message'].apply(stemming)

In [None]:
# Inspect the stemmed messages.
print(mail_dataset['Message'])

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri wkli comp win fa cup final tkt st m...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
                              ...                        
5567    nd time tri contact u u pound prize claim easi...
5568                                b go esplanad fr home
5569                                    piti mood suggest
5570    guy bitch act like interest buy someth els nex...
5571                                       rofl true name
Name: Message, Length: 5572, dtype: object


In [None]:
# Separate the stemmed messages (features) from the category labels, as arrays.
# This rebinds X and Y, replacing the DataFrame/Series versions created earlier.
X = mail_dataset['Message'].values
Y = mail_dataset['Category'].values

In [None]:
# Inspect the feature and label arrays.
print(X)
print(Y)

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat'
 'ok lar joke wif u oni'
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli'
 ... 'piti mood suggest'
 'guy bitch act like interest buy someth els next week gave us free'
 'rofl true name']
['ham' 'ham' 'spam' ... 'ham' 'ham' 'ham']


In [None]:
# One entry per message; X is still a 1-D array of strings at this point.
X.shape

(5572,)

In [None]:
# Convert the stemmed text into TF-IDF feature vectors (sparse matrix).
# NOTE(review): the vocabulary and IDF weights are learned from every message,
# including rows that later end up in the test split, so the reported test
# accuracy is slightly optimistic. Consider splitting first and fitting on the
# training messages only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [None]:
# Inspect the sparse TF-IDF matrix: (row, feature index) followed by the weight.
print(X)

  (0, 190)	0.3522946643655987
  (0, 379)	0.26350491969128115
  (0, 736)	0.33630333732147566
  (0, 738)	0.29761995607435426
  (0, 964)	0.29761995607435426
  (0, 1169)	0.27282796669086984
  (0, 2171)	0.14066343975170745
  (0, 2208)	0.1649859743034801
  (0, 2245)	0.19460776670194488
  (0, 2827)	0.3522946643655987
  (0, 2932)	0.28506031120996994
  (0, 4091)	0.24055424511726686
  (0, 5957)	0.19460776670194488
  (0, 6135)	0.23616756554565888
  (1, 2794)	0.4745440766926726
  (1, 2960)	0.4218982744467187
  (1, 3760)	0.2809319560263009
  (1, 3785)	0.564793662023427
  (1, 6056)	0.44597659211687757
  (2, 262)	0.18752116579572622
  (2, 1058)	0.2181159425903744
  (2, 1220)	0.22327647280120547
  (2, 1673)	0.3983526060107063
  (2, 1791)	0.52682621884254
  (2, 1890)	0.18841663063918468
  :	:
  (5567, 5520)	0.20176693864555295
  (5567, 5644)	0.23763296461255506
  (5568, 1704)	0.6652366917601374
  (5568, 1996)	0.5740672391289212
  (5568, 2171)	0.29597505521175127
  (5568, 2457)	0.37457404553349233
  (55

Splitting the data into training and test sets

In [None]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the ham/spam
# ratio the same in both splits; the fixed seed makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [None]:
# Sanity check: full, training and test matrix shapes.
print(X.shape, X_train.shape, X_test.shape)

(5572, 6296) (4457, 6296) (1115, 6296)


Training the model: Logistic Regression

In [None]:
# Logistic regression classifier created with scikit-learn's default settings.
model = LogisticRegression()

In [None]:
# Fit the classifier on the training split.
model.fit(X_train, Y_train)

Evaluation

Accuracy Score

In [None]:
# accuracy score on the training data
# accuracy_score's signature is (y_true, y_pred), so the ground-truth labels go first.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
# Report the fraction of training messages classified correctly.
print('Accuracy score of the training data : ' , training_data_accuracy)

Accuracy score of the training data :  0.9715054969710568


In [None]:
# accuracy score on the test data
# accuracy_score's signature is (y_true, y_pred), so the ground-truth labels go first.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
# Report the fraction of held-out test messages classified correctly.
print('Accuracy score of the test data : ' , test_data_accuracy)

Accuracy score of the test data :  0.9650224215246637


Making a Predictive System

In [None]:
# Classify one already-vectorised row taken from the test split.
sample_index = 1113
X_new = X_test[sample_index]

prediction = model.predict(X_new)
print(prediction)

# predict() returns an array of label strings; translate the single label
# into a human-readable verdict.
verdict = 'This mail is real' if prediction[0] == 'ham' else 'This mail is spam'
print(verdict)

['spam']
This mail is spam


In [None]:
# NOTE(review): the prediction cell above classified row 1113 of X_test, but this
# looks up the true label of row 500 — confirm which index was meant for the comparison.
Y_test[500]

'ham'

In [None]:
# Inspect the true labels of the test split.
print(Y_test)

['ham' 'ham' 'ham' ... 'ham' 'spam' 'ham']


In [None]:
# Number of labels in the test split.
Y_test.shape

(1115,)