# Spam Mail Prediction Using Machine Learning

## Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import pickle

## Data Loading

In [2]:
# Load the raw mail dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("mail_data.csv")

In [3]:
# Preview the first rows to confirm the file was parsed into Category / Message columns.
df.head()

Unnamed: 0,Category,Message
0,not spam,"Go until jurong point, crazy.. Available only ..."
1,not spam,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,not spam,U dun say so early hor... U c already then say...
4,not spam,"Nah I don't think he goes to usf, he lives aro..."


#### The dataset has been converted from a csv file into a pandas dataframe

## Data Preprocessing

### Analyse the number of rows and columns

In [4]:
# (rows, columns) of the loaded frame.
df.shape

(5572, 2)

#### Number of rows : 5572
#### Number of columns : 2

### Check for null values

In [5]:
# Count the missing values in each column.
df.isnull().sum()

Category    0
Message     0
dtype: int64

#### As we can see there are no null values

### Data Encoding

In [6]:
# Swap the textual labels for integer codes. The fitted encoder is kept in `le`
# so the codes can be mapped back to the original label strings later.
le = LabelEncoder()
encoded_category = le.fit_transform(df["Category"])
df["Category"] = encoded_category

In [7]:
# Preview again to confirm that Category now holds the integer codes.
df.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


#### Spam -> 1
#### Not Spam -> 0

### Separating the data as text and label

In [8]:
# Features are the raw message text; the target is the encoded spam label.
X, y = df["Message"], df["Category"]

In [9]:
# Inspect the feature series (raw message text).
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [10]:
# Inspect the target series (encoded labels).
print(y)

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: Category, Length: 5572, dtype: int32


## Train/Test Split

In [11]:
# Hold out 20% of the messages for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3,
)

### Checking the split

In [12]:
# Number of training samples.
len(X_train)

4457

In [13]:
# Number of held-out test samples.
len(X_test)

1115

#### As we can clearly see, the data has been split. 80% i.e. 4457 -> Training Data & 20% i.e. 1115 -> Testing Data

## Feature Extraction

### Transforming textual data to vectors

In [14]:
# TF-IDF vectoriser: lower-cases the text, drops English stop words and keeps
# every term that appears in at least one document (min_df=1).
v = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    min_df=1,
)

In [15]:
# Learn the vocabulary and IDF weights from the training messages only, then
# reuse the fitted vectoriser on the test messages so that no test-set
# information leaks into the features.
X_train_features = v.fit_transform(X_train)
X_test_features = v.transform(X_test)

### Checking the transformation

In [16]:
# Print the sparse training matrix: each line is (row, column) followed by the stored TF-IDF weight.
print(X_train_features)

  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.509272536051008
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587518
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.23633754072626942
  (4, 5497)	0.15743785051118356
  :	:
  (4454, 4602)	0.2669765732445391
  (4454, 3142)	0.32014451677763156
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (4

In [17]:
# (training messages, vocabulary terms) of the training feature matrix.
X_train_features.shape

(4457, 7431)

In [18]:
# Print the sparse test matrix in the same (row, column) weight format.
print(X_test_features)

  (0, 7271)	0.1940327008179069
  (0, 6920)	0.20571591693537986
  (0, 5373)	0.2365698724638063
  (0, 5213)	0.1988547357502182
  (0, 4386)	0.18353336340308998
  (0, 1549)	0.2646498848307188
  (0, 1405)	0.3176863938914351
  (0, 1361)	0.25132445289897426
  (0, 1082)	0.2451068436245027
  (0, 1041)	0.28016206931555726
  (0, 405)	0.2381316303003606
  (0, 306)	0.23975986557206702
  (0, 20)	0.30668032384591537
  (0, 14)	0.26797874471323896
  (0, 9)	0.2852706805264544
  (0, 1)	0.2381316303003606
  (1, 7368)	0.29957800964520975
  (1, 6732)	0.42473488678029325
  (1, 6588)	0.3298937975962767
  (1, 6507)	0.26731535902873493
  (1, 6214)	0.3621564482127515
  (1, 4729)	0.22965776503163893
  (1, 4418)	0.3457696891316818
  (1, 3491)	0.496093956101028
  (2, 7205)	0.22341717215670331
  :	:
  (1110, 3167)	0.5718357066163949
  (1111, 7353)	0.4991205841293424
  (1111, 6787)	0.40050175714278885
  (1111, 6033)	0.4714849709283488
  (1111, 3227)	0.44384935772735523
  (1111, 2440)	0.4137350055985486
  (1112, 7071)

In [19]:
# The column count must match the training matrix, because the vectoriser fitted on the training data is reused.
X_test_features.shape

(1115, 7431)

## Train the Model

### Logistic Regression

In [20]:
# Model 1: Logistic Regression with scikit-learn's default hyper-parameters.
model_1 = LogisticRegression()

In [21]:
# Fit the logistic regression on the TF-IDF training features.
model_1.fit(X_train_features, y_train)

### Decision Tree

In [22]:
# Model 2: Decision Tree with default hyper-parameters.
# scikit-learn randomly permutes the candidate features at every split, so
# without a fixed seed two runs can grow different trees and report different
# accuracies. Pin the seed (same value as the train/test split) so the
# evaluation is reproducible under Restart & Run All.
model_2 = DecisionTreeClassifier(random_state = 3)

In [23]:
# Fit the decision tree on the same TF-IDF training features.
model_2.fit(X_train_features, y_train)

## Model Evaluation

### Logistic Regression

### Accuracy on Training Data

In [24]:
# Score the logistic regression on the data it was fitted on.
training_pred = model_1.predict(X_train_features)
training_accuracy = accuracy_score(y_true=y_train, y_pred=training_pred)

# Report as a percentage rounded to two decimals.
training_accuracy_pct = round(training_accuracy * 100, 2)
print(f"Accuracy score on Training Data : {training_accuracy_pct}%")

Accuracy score on Training Data : 96.77%


### Accuracy on Testing Data

In [25]:
# Score the logistic regression on the held-out test messages.
testing_pred = model_1.predict(X_test_features)
testing_accuracy = accuracy_score(y_true=y_test, y_pred=testing_pred)

# Report as a percentage rounded to two decimals.
testing_accuracy_pct = round(testing_accuracy * 100, 2)
print(f"Accuracy score on Testing Data : {testing_accuracy_pct}%")

Accuracy score on Testing Data : 96.68%


#### There is not much difference in the accuracy score of both training and testing data (No overfitting/underfitting problem)

### Decision Tree

### Accuracy on Training Data

In [26]:
# Score the decision tree on the data it was fitted on.
# Note: this overwrites the training_pred / training_accuracy values computed
# for the logistic regression.
training_pred = model_2.predict(X_train_features)
training_accuracy = accuracy_score(y_true=y_train, y_pred=training_pred)

# Report as a percentage rounded to two decimals.
training_accuracy_pct = round(training_accuracy * 100, 2)
print(f"Accuracy score on Training Data : {training_accuracy_pct}%")

Accuracy score on Training Data : 100.0%


### Accuracy on Testing Data

In [27]:
# Score the decision tree on the held-out test messages.
# Note: this overwrites the testing_pred / testing_accuracy values computed
# for the logistic regression.
testing_pred = model_2.predict(X_test_features)
testing_accuracy = accuracy_score(y_true=y_test, y_pred=testing_pred)

# Report as a percentage rounded to two decimals.
testing_accuracy_pct = round(testing_accuracy * 100, 2)
print(f"Accuracy score on Testing Data : {testing_accuracy_pct}%")

Accuracy score on Testing Data : 96.68%


#### There is a difference of 3.32 percentage points between the training (100.0%) and testing (96.68%) accuracy; together with the perfect training score, this suggests the Decision Tree may be overfitting the training data

### Observations :

#### Model 1 is likely the better option because it shows a good balance between training and testing performance, indicating that it generalizes well to new data.

#### Model 2 reaches the same testing accuracy as Model 1 (96.68%), but it might suffer from overfitting (due to the 100% training accuracy).

### Result : 

#### Model 1 is more reliable due to its better generalization: it matches Model 2's testing accuracy without Model 2's gap between training and testing accuracy. Model 1 is therefore expected to perform well in real-world scenarios and is less prone to overfitting.

## Dynamic Prediction Module

In [28]:
# Classify a single message typed in by the user.
# The text is vectorised with the TF-IDF vectoriser already fitted on the
# training data (`v`) and scored with model_1 (Logistic Regression): that is
# the model the notebook selects as more reliable and the one it saves for
# deployment, so the interactive check should exercise the same model.
input_data = input("Enter email : ")

# transform() expects an iterable of documents, so wrap the single string in a list.
user_input = [input_data]

input_data_vector = v.transform(user_input)

result = model_1.predict(input_data_vector)

# predict() returns one label per input document; inspect the single element
# explicitly rather than relying on the truthiness of a one-element array.
# Label 1 is spam, label 0 is not spam.
if result[0] == 1:
    print("The email is SPAM!")
else:
    print("The email is NOT SPAM!")

Enter email :  This is a free course


The email is NOT SPAM!


## Model Saving & Deployment

In [30]:
# Persist the fitted Logistic Regression model for the deployment app.
# The context manager guarantees the file is flushed and closed even if
# pickling fails; a bare open() inside the call leaves the handle dangling.
# NOTE(review): only the classifier is saved here. Turning raw text into
# features at inference time also needs the fitted TfidfVectorizer `v` —
# confirm the deployment app has access to it.
with open("Spam_Mail_Prediction_Model.pkl", "wb") as model_file:
    pickle.dump(model_1, model_file)

#### Streamlit is used for model deployment