In [1]:
# Spam Mail Prediction Using Machine Learning 

##### Importing Libraries

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.utils import resample


##### Data Collection & Pre-Processing

In [5]:
# loading the data from csv file to a pandas Dataframe
raw_mail_data = pd.read_csv('spam_updated.csv')

In [6]:
print(raw_mail_data)

        v1                                                 v2 Unnamed: 2  \
0      ham  Go until jurong point, crazy.. Available only ...        NaN   
1      ham                      Ok lar... Joking wif u oni...        NaN   
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3      ham  U dun say so early hor... U c already then say...        NaN   
4      ham  Nah I don't think he goes to usf, he lives aro...        NaN   
...    ...                                                ...        ...   
8037  spam  Survey Scam: Take this survey and win free gif...        NaN   
8038  spam  Fake Promotion: Get an iPhone 15 for just $99!...        NaN   
8039  spam  Lottery Scam: You have won $1,000,000! Click t...        NaN   
8040  spam  Phishing: Your bank account is at risk! Login ...        NaN   
8041  spam  Subscription Scam: Your Netflix account will b...        NaN   

     Unnamed: 3 Unnamed: 4  
0           NaN        NaN  
1           NaN        NaN  


In [7]:
# replace the null values with a null string
mail_data = raw_mail_data.where((pd.notnull(raw_mail_data)),'')

In [8]:
# printing the first 5 rows of the dataframe
mail_data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [9]:
# checking the number of rows and columns in the dataframe
mail_data.shape

(8042, 5)

##### Label Encoding

In [11]:
# label spam mail as 0;  ham mail as 1;

mail_data.loc[mail_data['v1'] == 'spam', 'v1',] = 0
mail_data.loc[mail_data['v1'] == 'ham', 'v1',] = 1

In [12]:
# separating the data as texts and label

X = mail_data['v2']

Y = mail_data['v1']

In [13]:
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
8037    Survey Scam: Take this survey and win free gif...
8038    Fake Promotion: Get an iPhone 15 for just $99!...
8039    Lottery Scam: You have won $1,000,000! Click t...
8040    Phishing: Your bank account is at risk! Login ...
8041    Subscription Scam: Your Netflix account will b...
Name: v2, Length: 8042, dtype: object


In [14]:
print(Y)

0       1
1       1
2       0
3       1
4       1
       ..
8037    0
8038    0
8039    0
8040    0
8041    0
Name: v1, Length: 8042, dtype: object


##### Splitting the data into training data & test data

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3)

In [17]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(8042,)
(6433,)
(1609,)


##### Feature Extraction

In [19]:
# transform the text data to feature vectors that can be used as input to the Logistic regression

feature_extraction = TfidfVectorizer(min_df = 1, stop_words='english', lowercase=True)

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# convert Y_train and Y_test values as integers

Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [20]:
print(X_train)

6245    Lottery Scam: You have won $1,000,000! Click t...
5069    5p 4 alfie Moon's Children in need song on ur ...
2629    Hey there! Glad u r better now. I hear u treat...
439       But i have to. I like to have love and arrange.
1103    Aiyah sorry lor... I watch tv watch until i fo...
                              ...                        
7096    Bitcoin Scam: Invest $100 today and get $10,00...
1667    So now my dad is gonna call after he gets out ...
3321    Ok darlin i supose it was ok i just worry too ...
1688                     Nan sonathaya soladha. Why boss?
5994    Subscription Scam: Your Netflix account will b...
Name: v2, Length: 6433, dtype: object


In [21]:
print(X_train_features)

  (0, 1756)	0.3543737603961457
  (0, 1777)	0.38037573059889046
  (0, 1)	0.6338451422925622
  (0, 7277)	0.35552060708643435
  (0, 5708)	0.23476758758633037
  (0, 4064)	0.38270504943358796
  (1, 5231)	0.20454328916484657
  (1, 72)	0.20454328916484657
  (1, 7422)	0.19088888490644867
  (1, 5089)	0.19465010062703003
  (1, 5084)	0.17397636739358255
  (1, 4628)	0.20454328916484657
  (1, 630)	0.17071825413911432
  (1, 1674)	0.6136298674945397
  (1, 6701)	0.15305492030513007
  (1, 6824)	0.11979082044654871
  (1, 4126)	0.20454328916484657
  (1, 6517)	0.12528276692480406
  (1, 4377)	0.16186350338822963
  (1, 6925)	0.20808081704567133
  (1, 6076)	0.1821861571093143
  (1, 4556)	0.11941543450168127
  (1, 1721)	0.18475691208921355
  (1, 4417)	0.18475691208921355
  (1, 893)	0.20454328916484657
  :	:
  (6429, 5315)	0.4288021882328694
  (6429, 2088)	0.38988083340956
  (6429, 3074)	0.35105478599979373
  (6429, 7292)	0.2444102173276999
  (6430, 1150)	0.3868850251554947
  (6430, 6376)	0.3868850251554947
  

### Training Model

##### Logistic Regression

In [24]:
model = LogisticRegression()

In [25]:
# training the Logistic Regression model with the training data
model.fit(X_train_features, Y_train)

### Evaluating The Trained Model

In [27]:
# prediction on training data

prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)

In [28]:
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9805689413959272


In [29]:
# prediction on test data

prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)

In [30]:
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9776258545680547


### Building A Predictive System

In [32]:
input_mail = ["Lottery Scam: You have won $1,000,000! C"]

# convert text to feature vectors
input_data_features = feature_extraction.transform(input_mail)

# making prediction

prediction = model.predict(input_data_features)
print(prediction)


if (prediction[0]==1):
  print('Ham mail')

else:
  print('Spam mail')
  

[0]
Spam mail


In [33]:
import joblib

# Save the trained TF-IDF vectorizer
joblib.dump(feature_extraction, "vectorizer.pkl")

# Save the trained Logistic Regression model
joblib.dump(model, "spam_model.pkl")

print("Model and vectorizer saved successfully!")



Model and vectorizer saved successfully!
