In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay

In [None]:
# Load the dataset from the CSV file, decoding its bytes as Latin-1
raw_dataset = pd.read_csv(
    filepath_or_buffer="spam.csv",
    encoding="latin1",
)

In [None]:
# Preview the first 10 records of the raw dataset
raw_dataset.head(10)

In [None]:
# Drop the unwanted columns.
# The result is re-assigned rather than dropped with inplace=True, and
# errors='ignore' skips columns that are already gone, so this cell can be
# re-run without raising KeyError. (`axis=1` was redundant with `columns=`.)
raw_dataset = raw_dataset.drop(
    columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
    errors='ignore',
)
raw_dataset.head(10)

In [None]:
# Shape of the dataset as a (rows, columns) tuple
raw_dataset.shape

In [None]:
# Count how many rows carry each distinct value in column 'v1'
raw_dataset['v1'].value_counts()

In [None]:
# Count the null values in each column
raw_dataset.isnull().sum()

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes)
raw_dataset.info()

In [None]:
# Descriptive statistics for the dataset's columns
raw_dataset.describe()

In [None]:
# Build the working frame: keep every non-null cell, replace null cells with ''
is_present = raw_dataset.notna()
mail_data = raw_dataset.where(is_present, other='')

In [None]:
# Rename the columns: v1 -> Category, v2 -> Message.
# The renamed frame is re-assigned instead of using inplace=True, which brings
# no benefit and makes state harder to follow across cells.
mail_data = mail_data.rename(columns={"v1": "Category", "v2": "Message"})

In [None]:
# Preview the first 5 rows of the working frame
mail_data.head(5)

In [None]:
# Convert ham mail to 1 and spam mail to 0.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on mail_data["Category"] is chained assignment, which pandas warns about and
# which leaves mail_data unchanged once Copy-on-Write is in effect.
mail_data["Category"] = mail_data["Category"].replace({"ham": 1, "spam": 0})

In [None]:
# Number of distinct values in the Category column
mail_data["Category"].nunique()

In [None]:
# Split the frame into the message text (X) and its label (Y)
X, Y = mail_data["Message"], mail_data["Category"]

In [None]:
# Split the dataset into training and test data: 20% of the rows are held out
# for testing, and random_state is fixed to 42.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Shapes of the full, training and test message sets (in that order)
for message_set in (X, X_train, X_test):
    print(message_set.shape)

In [None]:
# Shapes of the full, training and test label sets (in that order)
for label_set in (Y, Y_train, Y_test):
    print(label_set.shape)

In [None]:
# Features Extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# lowercase must be the boolean True: the string 'True' only behaved the same
# by being truthy, and scikit-learn's parameter validation rejects it.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# Fit the vectorizer on the training messages only, then reuse the fitted
# vectorizer to transform the test messages.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Convert the Y train and Y test labels to integers
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [None]:
# Print the extracted features of the test set
# (pass X_train_features instead to inspect the training set)
print(X_test_features)

In [None]:
# Fit a logistic-regression classifier (default settings) on the training features
model = LogisticRegression()
model.fit(X=X_train_features, y=Y_train)

In [None]:
# Evaluate the model on the training data
prediction_on_training_data = model.predict(X_train_features)
# accuracy_score is documented as (y_true, y_pred): pass the true labels first
accurracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)
print(f"The accuaracy on the training Data Set : {accurracy_on_training_data}")

In [None]:
# Evaluate the model on the held-out test data
prediction_on_test_data = model.predict(X_test_features)
# accuracy_score is documented as (y_true, y_pred): pass the true labels first
accurracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)
print(f"The accuaracy on the test Data Set : {accurracy_on_test_data}")

In [None]:
# Classify a single example message with the fitted vectorizer and model
input_mail = [" Hi Good Morning"]

# Turn the text into features using the already-fitted extractor
input_data = feature_extraction.transform(input_mail)

# Predict, show the raw prediction, then report it in words:
# label 1 means ham (not spam); any other label is reported as spam
input_predict = model.predict(input_data)
print(input_predict)
print("It is not a spam mail" if input_predict[0] == 1 else "It is a spam mail")

#### Term Frequency: Term frequency (TF) is a numerical statistic that indicates how often a term (word) appears in a document.

##### The term frequency of a term t in a document d is calculated using the formula:

#### TF(t, d) = (Number of times term t appears in document d) / (Total number of terms in document d)

#### IDF(t) = log_e(Total number of documents / Number of documents with term t in it)

In [None]:
# Display the illustration image.
# matplotlib.pyplot is already imported as `plt` in the notebook's import cell,
# so the duplicate import is dropped; the explicit Figure/Axes interface is
# used instead of the implicit pyplot state machine.
image = plt.imread('1_ALj6IYuGFr2PObkT5o1-rQ.webp')
fig, ax = plt.subplots()
ax.imshow(image)
ax.axis('off')  # Turn off axis
plt.show()