In [6]:
#importing libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [47]:
# Read the labelled messages into a DataFrame (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="mail_data.csv")

In [25]:
# Preview the first five rows. Run this right after loading to see the raw labels:
# the label-encoding cell further down rewrites Category in place.
df.head(n=5)

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
# Count missing values per column.
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [29]:
# Count rows whose Message is the literal text "null", which isna() would not flag.
df.loc[df["Message"].eq("null")].count()

Category    0
Message     0
dtype: int64

In [48]:
# Encode the target in place: ham mails become 1 and spam mails become 0.
df.loc[df["Category"].eq("ham"), "Category"] = 1
df.loc[df["Category"].eq("spam"), "Category"] = 0

In [49]:
# X holds the message text (model input), Y holds the encoded label (target).
X, Y = df["Message"], df["Category"]

In [31]:
# First five messages.
X.head(n=5)

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: Message, dtype: object

In [50]:
# First five labels (1 = ham, 0 = spam).
Y.head(n=5)

0    1
1    1
2    0
3    1
4    1
Name: Category, dtype: object

In [51]:
# Split the data into training and testing sets: 80% train, 20% test.
# The fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=80,
)

In [52]:
# Display the training messages; the index values are the shuffled row labels kept from df.
X_train

3534                               Sorry, I'll call later
2578                Hey whats up? U sleeping all morning?
214                               Yup... How ü noe leh...
292     Haf u found him? I feel so stupid da v cam was...
1713    o turns out i had stereo love on mi phone unde...
                              ...                        
522                            Shall i come to get pickle
2259    Sad story of a Man - Last week was my b'day. M...
2982    7 wonders in My WORLD 7th You 6th Ur style 5th...
5308    I'm coming back on Thursday. Yay. Is it gonna ...
1199    Al he does is moan at me if n e thin goes wron...
Name: Message, Length: 4457, dtype: object

In [53]:
# First five training labels.
Y_train.head(n=5)

3534    1
2578    1
214     1
292     1
1713    1
Name: Category, dtype: object

In [54]:
# The label column still has object dtype after encoding, so cast both splits to integers.
Y_train, Y_test = Y_train.astype("int"), Y_test.astype("int")

In [55]:
# Feature extraction using TF-IDF: English stop words are dropped and text is lowercased.
# `lowercase` must be a real boolean: the string "True" only worked by being truthy, and
# scikit-learn releases that validate parameter types reject it when the vectorizer is fitted.
feature_ext = TfidfVectorizer(min_df=1, stop_words="english", lowercase=True)

In [56]:
# Learn the vocabulary and IDF weights from the training messages only, then reuse the
# fitted vectorizer on the test messages so no test information leaks into the features.
X_train_features = feature_ext.fit_transform(raw_documents=X_train)
X_test_features = feature_ext.transform(raw_documents=X_test)

In [59]:
# Print the sparse training matrix; each printed entry is "(row, column)  weight".
print(X_train_features)

  (0, 3915)	0.6094882587220327
  (0, 4045)	0.52281582270503
  (0, 6144)	0.5959762394670168
  (1, 4463)	0.43404183022239895
  (1, 6039)	0.5501359457582253
  (1, 7267)	0.5865913104172977
  (1, 3343)	0.4060403494048474
  (2, 3958)	0.5770420234118241
  (2, 4663)	0.6076975111771694
  (2, 7501)	0.5456429584681379
  (3, 7373)	0.3910135606511955
  (3, 1582)	0.5019851005653218
  (3, 2088)	0.3000130969348083
  (3, 6367)	0.44559202093502004
  (3, 2740)	0.3523891297958715
  (3, 3222)	0.4270610588026448
  (4, 895)	0.42025744552976463
  (4, 6958)	0.42025744552976463
  (4, 5038)	0.2422869895536978
  (4, 4346)	0.44077803171644436
  (4, 4113)	0.22612451546670512
  (4, 6292)	0.44077803171644436
  (4, 6880)	0.3851772717384851
  (5, 349)	0.264055054929898
  (5, 6378)	0.264055054929898
  :	:
  (4455, 7455)	0.3411207423483332
  (4455, 7457)	0.2377863541861799
  (4455, 1862)	0.25702345173534047
  (4455, 4445)	0.25599536742876106
  (4455, 3437)	0.30941076551340585
  (4455, 7369)	0.22910540516581354
  (4455, 9

In [60]:
# Logistic regression classifier; no arguments are passed, so the library defaults apply.
model = LogisticRegression()

In [61]:
# Fit the classifier on the TF-IDF features of the training messages.
model.fit(X=X_train_features, y=Y_train)

LogisticRegression()

In [62]:
# Evaluate the model on the data it was fitted on, for comparison with the test score.
prediction_on_train = model.predict(X=X_train_features)
accuracy_on_training = accuracy_score(y_true=Y_train, y_pred=prediction_on_train)
print(accuracy_on_training)

0.9663450751626654


In [64]:
# Report the training accuracy. The message used to say "test data" although the value
# printed is accuracy_on_training.
print(f"accuracy score on training data is {accuracy_on_training}")

accuracy score on test data is 0.9663450751626654


In [65]:
# Score the model on the held-out test messages.
prediction_on_test = model.predict(X=X_test_features)
accuracy_on_test = accuracy_score(y_true=Y_test, y_pred=prediction_on_test)

In [66]:
# Report the held-out accuracy.
print("accuracy score on test data is", accuracy_on_test)

accuracy score on test data is 0.9632286995515695


In [84]:
# Classify one message typed by the user with the fitted vectorizer and model.
test_message = input()  # getting email from user
# transform() expects an iterable of documents, so the single message is wrapped in a list.
test_message_list = [test_message]
featured_input_message = feature_ext.transform(test_message_list)  # text -> numeric features
predicting_label = model.predict(featured_input_message)  # one label per input row
test_message_list = []  # left empty afterwards, as this cell has always done

# Index the single prediction explicitly instead of relying on the truth value of a
# one-element array. Labels follow the training encoding: 1 = ham, 0 = spam.
final_prediction = "Ham" if predicting_label[0] == 1 else "Spam"
print(f"Your email is {final_prediction}")


URGENT! You have won a 1 week FREE membership in our £100,000 Prize Jackpot! Txt the word
Your email is Spam
