In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
df=pd.read_csv("neww.csv")
print(df)

                subject                                               body  \
0      Congratulations!  You have been selected to receive a $1000 gift...   
1       Earn money fast                  Make $500 a day from home easily!   
2         Weekly report  Please review the attached weekly performance ...   
3         Limited offer               Get 50% off all products today only!   
4                Lunch?                 Want to grab lunch together later?   
..                  ...                                                ...   
196     Earn money fast                  Make $500 a day from home easily!   
197        Your invoice  Attached is the invoice for your recent purchase.   
198     Earn money fast                  Make $500 a day from home easily!   
199    Project deadline  Reminder: the deadline for project submission ...   
200  Win a free iPhone!                Click here to claim your prize now!   

    label  
0    spam  
1    spam  
2     ham  
3    spam  
4  

In [3]:
data=df.where((pd.notnull(df)),'')

In [4]:
print(data.head())

            subject                                               body label
0  Congratulations!  You have been selected to receive a $1000 gift...  spam
1   Earn money fast                  Make $500 a day from home easily!  spam
2     Weekly report  Please review the attached weekly performance ...   ham
3     Limited offer               Get 50% off all products today only!  spam
4            Lunch?                 Want to grab lunch together later?   ham


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201 entries, 0 to 200
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   subject  201 non-null    object
 1   body     201 non-null    object
 2   label    201 non-null    object
dtypes: object(3)
memory usage: 4.8+ KB


In [6]:
data.loc[data["label"]=='spam',"label",]=0
data.loc[data["label"]=='ham',"label",]=1


In [7]:
x=data["subject"]
y=data["label"]

In [8]:
print(x)

0        Congratulations!
1         Earn money fast
2           Weekly report
3           Limited offer
4                  Lunch?
              ...        
196       Earn money fast
197          Your invoice
198       Earn money fast
199      Project deadline
200    Win a free iPhone!
Name: subject, Length: 201, dtype: object


In [9]:
print(y)

0      0
1      0
2      1
3      0
4      1
      ..
196    0
197    1
198    0
199    1
200    0
Name: label, Length: 201, dtype: object


In [10]:
x_test,x_train,y_test,y_train=train_test_split(x,y,test_size=0.2,random_state=42)

In [11]:
print(x.shape)
print(x_train.shape)
print(x_test.shape) 


(201,)
(41,)
(160,)


In [12]:
print(y.shape)
print(y_train.shape)
print(y_test.shape)

(201,)
(41,)
(160,)


In [13]:
feature_extractor=TfidfVectorizer(min_df=1,stop_words='english',lowercase=True)

x_train_features=feature_extractor.fit_transform(x_train)
x_test_features=feature_extractor.transform(x_test)

y_train=y_train.astype('int')
y_test=y_test.astype('int')

In [14]:
print(x_train_features)
print(x_test_features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 75 stored elements and shape (41, 19)>
  Coords	Values
  (0, 10)	0.7071067811865475
  (0, 14)	0.7071067811865475
  (1, 17)	0.7071067811865475
  (1, 16)	0.7071067811865475
  (2, 12)	0.7071067811865476
  (2, 0)	0.7071067811865476
  (3, 17)	0.7071067811865475
  (3, 16)	0.7071067811865475
  (4, 12)	0.7071067811865476
  (4, 0)	0.7071067811865476
  (5, 8)	1.0
  (6, 3)	1.0
  (7, 15)	0.7071067811865475
  (7, 4)	0.7071067811865475
  (8, 8)	1.0
  (9, 18)	0.5773502691896258
  (9, 7)	0.5773502691896258
  (9, 9)	0.5773502691896258
  (10, 15)	0.7071067811865475
  (10, 4)	0.7071067811865475
  (11, 8)	1.0
  (12, 18)	0.5773502691896258
  (12, 7)	0.5773502691896258
  (12, 9)	0.5773502691896258
  (13, 18)	0.5773502691896258
  :	:
  (27, 10)	0.7071067811865475
  (27, 14)	0.7071067811865475
  (28, 11)	1.0
  (29, 5)	0.5773502691896258
  (29, 13)	0.5773502691896258
  (29, 6)	0.5773502691896258
  (30, 11)	1.0
  (31, 11)	1.0
  (32, 12)	0.70710678118

In [15]:
model=LogisticRegression()
model.fit(x_train_features,y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [16]:
Prediction_on_training_data=model.predict(x_train_features)
accuracy_on_training_data=accuracy_score(y_train,Prediction_on_training_data)

In [17]:
print("accuracy_on_training_data:",accuracy_on_training_data)

accuracy_on_training_data: 1.0


In [18]:
prediction_on_test=model.predict(x_test_features)

In [19]:
accuracy_on_test_data=accuracy_score(y_test,prediction_on_test)

In [20]:
print("accuracy on test data:" ,accuracy_on_test_data)

accuracy on test data: 1.0


In [24]:
input=["Congratulations!!! You’ve WON a $1,000 Gift Card 🎁 "]
new=feature_extractor.transform(input)
pred=model.predict(new)
print(pred)
if(pred[0]==0):
    print("SPAM MAIL")
elif(pred[0]==1):
    print("HAM MAIL")    


[0]
SPAM MAIL
