# Project: Email Spam Detection

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Dataset Description

Message Feature:
    The textual content of each email, i.e. the body of the message.
Category Feature:
    The class label assigned to each email: `ham` (legitimate) or `spam`.

In [2]:
# Load the labelled messages from the CSV file and display the resulting frame.
data = pd.read_csv('mail1.csv')
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
# Number of rows (messages) in the dataset.
len(data)

5572

In [4]:
# Summarise the columns: names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
# Encode the text labels numerically, in place: spam -> 0, ham -> 1.
for label, code in (('spam', 0), ('ham', 1)):
    data.loc[data["Category"] == label, 'Category'] = code

In [6]:
# Separate the model input (message text) from the target (encoded category).
x, y = data["Message"], data["Category"]

In [7]:
# Display the message texts used as model input.
x

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [8]:
# Display the encoded labels (0 = spam, 1 = ham).
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

# TRAINING AND TESTING

In [9]:
# Hold out 30% of the messages for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=4,
)

In [10]:
# Display the training messages.
x_train

1256    Just wait till end of march when el nino gets ...
4163    How's it going? Got any exciting karaoke type ...
1994                   Eh den sat u book e kb liao huh...
3587    I am hot n horny and willing I live local to y...
1598    URGENT! Your Mobile number has been awarded wi...
                              ...                        
3671                        Ok thanx... Take care then...
709     To review and KEEP the fantastic Nokia N-Gage ...
2487           I dont thnk its a wrong calling between us
174     Bloody hell, cant believe you forgot my surnam...
1146    Thank you, winner notified by sms. Good Luck! ...
Name: Message, Length: 3900, dtype: object

In [11]:
# Display the held-out test messages.
x_test

4004    somewhere out there beneath the pale moon ligh...
2276           Is that on the telly? No its Brdget Jones!
4498                                                   Ok
3755    Bloomberg -Message center +447797706009 Why wa...
111              What is the plural of the noun research?
                              ...                        
231     Get down in gandhipuram and walk to cross cut ...
2038                             Oh sorry please its over
3309    But i'm surprised she still can guess right lo...
3934                               Playin space poker, u?
3156                                                Ok...
Name: Message, Length: 1672, dtype: object

In [12]:
# Display the training labels (0 = spam, 1 = ham).
y_train

1256    1
4163    1
1994    1
3587    0
1598    0
       ..
3671    1
709     0
2487    1
174     1
1146    0
Name: Category, Length: 3900, dtype: object

In [13]:
# Display the held-out test labels (0 = spam, 1 = ham).
y_test

4004    1
2276    1
4498    1
3755    0
111     1
       ..
231     1
2038    1
3309    1
3934    1
3156    1
Name: Category, Length: 1672, dtype: object

In [14]:
# Convert the raw message text into numerical TF-IDF feature vectors,
# dropping English stop words.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english')

# Fit the vectorizer on the training messages only, then apply the same
# fitted transformation to the test messages.
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

# Cast the label Series to integers before fitting the classifier.
y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [15]:
# Print the TF-IDF features computed for the training messages.
print(x_train_features)

  (0, 4410)	0.24909367293271087
  (0, 2786)	0.3532549360877084
  (0, 4304)	0.4348669067683049
  (0, 2288)	0.4348669067683049
  (0, 3908)	0.3585975959370433
  (0, 2312)	0.3028887592062596
  (0, 6201)	0.3263476432164432
  (0, 6608)	0.2677403965759536
  (0, 3454)	0.19084888936848213
  (1, 3613)	0.2832563402788741
  (1, 2508)	0.25332700045998413
  (1, 2379)	0.26279544760276824
  (1, 2620)	0.30679987315481655
  (1, 4698)	0.23638965777502066
  (1, 1971)	0.34350149001336333
  (1, 4694)	0.29949706446131497
  (1, 745)	0.3157377886437559
  (1, 6387)	0.2752185650145473
  (1, 3475)	0.3272607658309224
  (1, 2409)	0.29332262768846595
  (1, 2855)	0.16951229995043515
  (1, 2832)	0.17755007521476196
  (2, 3154)	0.3830132347130307
  (2, 3658)	0.33936075831915996
  (2, 3483)	0.432440133554435
  :	:
  (3897, 2162)	0.3606334209782012
  (3898, 1169)	0.3623576524486404
  (3898, 1675)	0.3623576524486404
  (3898, 5958)	0.3623576524486404
  (3898, 5705)	0.34522541034807125
  (3898, 3030)	0.3159376490309816
  (3

In [16]:
# Train a logistic-regression classifier (default settings) on the TF-IDF
# features of the training messages.
model = LogisticRegression()
model.fit(x_train_features, y_train)

In [17]:
# Predictions on the training data.
prediction = model.predict(x_train_features)

# Keep the score under its own name. Binding the result to `accuracy_score`
# would replace the imported sklearn function with a float, so re-running
# this cell (or scoring another data set later) would raise a TypeError.
training_accuracy = accuracy_score(y_train, prediction)
print('Accuracy on training data:', training_accuracy)

Accuracy on training data: 0.9669230769230769


In [22]:
# Predict labels for the held-out test messages and display them.
predictions_on_testing_data = model.predict(x_test_features)
predictions_on_testing_data


array([1, 1, 1, ..., 1, 1, 1])

# testing the model

In [28]:
# Classify a single hand-written message with the fitted vectorizer and model.
input_mail = [
    "Hello, my love. What are you doing? Did you get to that interview today? Are you you happy? Are you being a good boy? Do you think of me?Are you missing me ?"
]
input_data_features = feature_extraction.transform(input_mail)
prediction = model.predict(input_data_features)
print(prediction)

# Label 1 is reported as ham; any other label is reported as spam.
print("Ham mail" if prediction[0] == 1 else "Spam mail")

[1]
Ham mail
