<a href="https://colab.research.google.com/github/art-lmk/Email-Spam-Filter-app-Using-TfidfVectorizer/blob/main/Email_Spam_Filter_using_TF_IDF_Vectorization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#import necessary dependencies and libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import warnings

warnings. filterwarnings('ignore')

In [2]:
# Load the labelled messages from spam.csv in the working directory.
df = pd.read_csv('spam.csv')

# Only the last expression of a cell is rendered automatically, so the
# preview needs an explicit display() call to actually show up.
display(df.head(10))

# (rows, columns) of the raw dataset.
df.shape


(5572, 2)

In [3]:
# Exploratory Data Analysis (EDA)

# Per-column count of missing values. display() is needed because only the
# last expression of a cell is rendered automatically.
display(df.isnull().sum())

# Drop exact duplicate rows by rebinding the name instead of mutating in place.
df = df.drop_duplicates()

# Label encoding: spam -> 0, not spam (ham) -> 1.
# The capitalised spellings are accepted as well, and values that are already
# 0/1 match no key, so re-running this cell leaves the column unchanged.
df['Category'] = df['Category'].replace(
    {'spam': 0, 'Spam': 0, 'ham': 1, 'Not Spam': 1}
)
df.head(10)

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
5,0,FreeMsg Hey there darling it's been 3 week's n...
6,1,Even my brother is not like to speak with me. ...
7,1,As per your request 'Melle Melle (Oru Minnamin...
8,0,WINNER!! As a valued network customer you have...
9,0,Had your mobile 11 months or more? U R entitle...


In [4]:
# Feature engineering: the message text is the model input, the encoded category the target.
x = df['Message']
y = df['Category']

# Train-test split: hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

# Report how many messages ended up in each split.
print(x_train.shape, x_test.shape, sep='\n')

(4125,)
(1032,)


In [7]:
# Convert the raw text into numerical TF-IDF features that can be used as model input.
f = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Fit the vectorizer on the training messages and transform them in one step ...
x_train_count = f.fit_transform(x_train)
# ... then only transform the test messages: the vectorizer is never fitted on test data.
x_test_count = f.transform(x_test)


In [9]:
# Cast both label Series to integers (1 or 0).
y_train, y_test = y_train.astype('int'), y_test.astype('int')

# Print the sparse TF-IDF matrix built from the training messages.
print(x_train_count)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 31535 stored elements and shape (4125, 7431)>
  Coords	Values
  (0, 2400)	0.4689535663823655
  (0, 1247)	0.5538832733861689
  (0, 6605)	0.4898673616987752
  (0, 6692)	0.48303813512243965
  (1, 6492)	0.5755914257195885
  (1, 5859)	0.5964494866231046
  (1, 1592)	0.5594126567616489
  (2, 5786)	0.13178759818933938
  (2, 4038)	0.24488128414489752
  (2, 6925)	0.1186988586975311
  (2, 4089)	0.2196593047164816
  (2, 4522)	0.2363176943466334
  (2, 3695)	0.21568440262445418
  (2, 3252)	0.18176623831152225
  (2, 6828)	0.13848562282513652
  (2, 4084)	0.2644704778405301
  (2, 4519)	0.24488128414489752
  (2, 4520)	0.24488128414489752
  (2, 4393)	0.24488128414489752
  (2, 798)	0.24488128414489752
  (2, 2564)	0.19505111090227498
  (2, 25)	0.24488128414489752
  (2, 682)	0.22967525805125708
  (2, 7355)	0.21568440262445418
  (2, 5064)	0.22967525805125708
  :	:
  (4120, 2101)	0.3080717396234338
  (4120, 3921)	0.3922048767024034
  (4120, 3181)	0

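The output above shows each message represented by numbers instead of text.
Each `(row, column)` entry is the TF-IDF weight that `TfidfVectorizer` assigned to one term (the column) in one message (the row).
Machine-learning models operate on numbers, so this conversion is required before training.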

In [10]:
# Train the model: fit a logistic regression on the TF-IDF training features.
model = LogisticRegression()
model.fit(X=x_train_count, y=y_train)

In [11]:
# Model evaluation on the training data (the same rows the model was fitted on).
predicted_data = model.predict(x_train_count)
accuracy = accuracy_score(y_true=y_train, y_pred=predicted_data)
print(accuracy)

0.961939393939394


The accuracy score on the training data is about 96%.

In [13]:
# Model evaluation on the held-out test data.
predicted_data = model.predict(x_test_count)
accuracy = accuracy_score(y_true=y_test, y_pred=predicted_data)
print(accuracy)

0.9544573643410853


The accuracy score on the test data is about 95%. Because this is close to the training accuracy (about 96%), the model does not appear to be overfitting.
The model performs well on both the training and the test data.

In [16]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.49.1-py3-none-any.whl.metadata (9.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.49.1-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m65.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m97.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.49.1


In [21]:
import streamlit as st

def predict(Message):
  """Classify a single message with the notebook's fitted vectorizer and model.

  Args:
    Message: The message text to classify.

  Returns:
    The label predicted by `model` for the message; the calling code
    treats 0 as spam and any other value as not spam.
  """
  # The vectorizer works on a collection of documents, so wrap the one message in a list.
  features = f.transform([Message])
  # One input row was passed in, so take the first (only) predicted label.
  return model.predict(features)[0]

# Streamlit UI: a title, a text box for the message and a button that triggers the classification.
# NOTE(review): Streamlit normally renders only when the script is launched with
# `streamlit run`; confirm how this cell is meant to be executed.
st.header('Spam Detection')
message = st.text_input('ENTER THE MESSAGE HERE')

if st.button('PREDICT'):
  result = predict(message)
  # A prediction of 0 is reported as spam; anything else as not spam.
  st.header('SPAM' if result == 0 else 'NOT SPAM')

