In [45]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [46]:
df = pd.read_csv('spam.csv', encoding='latin1')
print(df.head())

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [47]:
data = df.where((pd.notnull(df)), '')

In [48]:
data.head(15)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [49]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5578 entries, 0 to 5577
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5578 non-null   object
 1   v2          5578 non-null   object
 2   Unnamed: 2  5578 non-null   object
 3   Unnamed: 3  5578 non-null   object
 4   Unnamed: 4  5578 non-null   object
dtypes: object(5)
memory usage: 218.0+ KB


In [50]:
data.shape

(5578, 5)

In [51]:
data.loc[data['v1'] == 'spam', 'v1'] = 0
data.loc[data['v1'] == 'ham', 'v1'] = 1

In [52]:
X = data['v2']  # The messages
Y = data['v1']

In [53]:
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5573    Congratulations! You've won $1000! Click here ...
5574            Important: Your account has been updated.
5575           Buy one, get one free! Limited time offer!
5576    Meeting at 3 PM, please confirm your availabil...
5577           Earn money online, no investment required!
Name: v2, Length: 5578, dtype: object


In [54]:
print(Y)

0       1
1       1
2       0
3       1
4       1
       ..
5573    0
5574    1
5575    0
5576    1
5577    0
Name: v1, Length: 5578, dtype: object


In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state = 3)

In [55]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(5578,)
(4457,)
(1115,)


In [56]:
print(Y.shape)
print(Y_train.shape)
print(Y_test.shape)

(5578,)
(4457,)
(1115,)


In [57]:
feature_extraction = TfidfVectorizer(min_df = 1, stop_words = 'english', lowercase=True)
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)
 
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [58]:
print(X_train)

3075    Mum, hope you are having a great day. Hoping t...
1787                           Yes:)sura in sun tv.:)lol.
1614    Me sef dey laugh you. Meanwhile how's my darli...
4304                Yo come over carlos will be here soon
3266                    Ok then i come n pick u at engin?
                              ...                        
789                          Gud mrng dear hav a nice day
968             Are you willing to go for aptitude class.
1667    So now my dad is gonna call after he gets out ...
3321    Ok darlin i supose it was ok i just worry too ...
1688                     Nan sonathaya soladha. Why boss?
Name: v2, Length: 4457, dtype: object


In [59]:
print(X_train_features)

  (0, 4513)	0.2909649098524696
  (0, 3380)	0.21807195185332803
  (0, 3262)	0.25877035357606315
  (0, 3136)	0.440116181574609
  (0, 2122)	0.38613577623520473
  (0, 3386)	0.3219352588930141
  (0, 6599)	0.20296878731699391
  (0, 4296)	0.3891385935794867
  (0, 3979)	0.2410582143632299
  (0, 741)	0.3219352588930141
  (1, 7443)	0.35056971070320353
  (1, 6442)	0.5652509076654626
  (1, 6417)	0.4769136859540388
  (1, 6872)	0.4306015894277422
  (1, 4061)	0.380431198316959
  (2, 5825)	0.4917598465723273
  (2, 2226)	0.413484525934624
  (2, 3917)	0.40088501350982736
  (2, 2109)	0.42972812260098503
  (2, 933)	0.4917598465723273
  (3, 7453)	0.5202633571003087
  (3, 1842)	0.3708680641487708
  (3, 1599)	0.5927091854194291
  (3, 6140)	0.4903863168693604
  (4, 1842)	0.36051481797205776
  :	:
  (4452, 4636)	0.4030918768627523
  (4453, 1762)	0.45610005640082985
  (4453, 7273)	0.5787739591782677
  (4453, 999)	0.6760129013031282
  (4454, 5370)	0.42618909997886
  (4454, 7346)	0.31166263834107377
  (4454, 1049

create logistic regression to train the model

In [60]:
model = LogisticRegression()

In [61]:
model.fit(X_train_features, Y_train)

In [62]:
prediction_on_training_data= model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)

In [63]:
print('acc on training data :' , accuracy_on_training_data)

acc on training data : 0.9661207089970832


In [64]:
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test,prediction_on_test_data)


In [65]:
print('acc on test data : ', accuracy_on_test_data)

acc on test data :  0.9623318385650225


In [66]:
input_email = [input("Enter the email content: ")]
print(f"Input email: {input_email}")

try:
    input_data_features = feature_extraction.transform(input_email)
    prediction = model.predict(input_data_features)
    print(f"Prediction result: {prediction}")

    if prediction[0] == 1:
        print("Ham mail")
    else:
        print("Spam mail")
except Exception as e:
    print(f"An error occurred: {e}")


Input email: ['Enter the email content: Win a free iPhone! Click here! ']
Prediction result: [0]
Spam mail
