# Spam Mail Prediction
In this notebook we will predict whether the emails are spam or not.

In [155]:
# Importing Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

Data Collections and Pre-Processing

In [156]:
raw_mail_data = pd.read_csv('spam_ham_dataset.csv')

In [157]:
print(raw_mail_data)

      Unnamed: 0 label                                               text  \
0            605   ham  Subject: enron methanol ; meter # : 988291\r\n...   
1           2349   ham  Subject: hpl nom for january 9 , 2001\r\n( see...   
2           3624   ham  Subject: neon retreat\r\nho ho ho , we ' re ar...   
3           4685  spam  Subject: photoshop , windows , office . cheap ...   
4           2030   ham  Subject: re : indian springs\r\nthis deal is t...   
...          ...   ...                                                ...   
5166        1518   ham  Subject: put the 10 on the ft\r\nthe transport...   
5167         404   ham  Subject: 3 / 4 / 2000 and following noms\r\nhp...   
5168        2933   ham  Subject: calpine daily gas nomination\r\n>\r\n...   
5169        1409   ham  Subject: industrial worksheets for august 2000...   
5170        4807  spam  Subject: important online banking alert\r\ndea...   

      label_num  
0             0  
1             0  
2             0  
3  

In [158]:
raw_mail_data.isnull().sum()

Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64

In [159]:
raw_mail_data.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


We don't need to do Label Encoding because the data in the label_num column is already numeric.

In [160]:
raw_mail_data = raw_mail_data.drop(columns=['Unnamed: 0','label'],axis=1)

In [161]:
raw_mail_data.head()

Unnamed: 0,text,label_num
0,Subject: enron methanol ; meter # : 988291\r\n...,0
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,"Subject: photoshop , windows , office . cheap ...",1
4,Subject: re : indian springs\r\nthis deal is t...,0


In [162]:
raw_mail_data.shape

(5171, 2)

Separating the data as texts and label

In [163]:
X = raw_mail_data['text']
y = raw_mail_data['label_num']

In [164]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=45)

Feature Extraction

In [165]:
# transform the text data to feature vectors that can be used as input to the Logistic Regression
feature_extraction = TfidfVectorizer(min_df = 1, stop_words='english', lowercase=True)

In [166]:
# feature extraction for X_train and X_test values

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# convert y_train and y_test as integers

y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [167]:
print(X_train_features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 254792 stored elements and shape (3878, 42826)>
  Coords	Values
  (0, 37147)	0.037231427156838456
  (0, 3906)	0.3190802366753676
  (0, 13247)	0.09661944114122817
  (0, 33935)	0.20485224730761595
  (0, 13428)	0.2988696207551212
  (0, 757)	0.3039841920381989
  (0, 13342)	0.11549655384148319
  (0, 6052)	0.283874308648593
  (0, 16982)	0.2015406054612089
  (0, 993)	0.1635848903649882
  (0, 27981)	0.10975730403431751
  (0, 32861)	0.19063227667073904
  (0, 16731)	0.2265636157522422
  (0, 17824)	0.24406252163795453
  (0, 5997)	0.2591585662751233
  (0, 26557)	0.10709446329898491
  (0, 17933)	0.18268060811290454
  (0, 2497)	0.2781773332762133
  (0, 27002)	0.12190951469930222
  (0, 38270)	0.08469849910984577
  (0, 22821)	0.1730595720945472
  (0, 3706)	0.3190802366753676
  (1, 37147)	0.018792654718719708
  (1, 13342)	0.058297170516772724
  (1, 25190)	0.07906233077095051
  :	:
  (3875, 16433)	0.19732504455750235
  (3875, 5458)	0.18245452

Training the Model

Logistic Regression

In [168]:
model_log = LogisticRegression() 


In [169]:
model_log.fit(X_train_features,y_train) 

In [170]:
# prediction on training data

prediction_on_training_data = model_log.predict(X_train_features)
accuracy_on_training_data = accuracy_score(y_train,prediction_on_training_data)

In [171]:
print("Accuracy Score: ", accuracy_on_training_data)

Accuracy Score:  0.9966477565755544


Building a Predictive System

In [174]:
input_mail = ["You are receiving this email because you joined the Coca Cola Icecek Talent Community on 3/19/25. You will receive these messages every 7 day(s). Your Job Alert matched the following jobs at careerscci.com."]
# Convert text to feature vectors
input_data_features = feature_extraction.transform(input_mail)

# Making Predictions
prediction = model_log.predict(input_data_features)
print(prediction)

# Splitting results
if (prediction[0]==0):
    print("Ham mail")
else:
    print("Spam mail")

[0]
Ham mail
