In [58]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


In [59]:
# Load the raw spam/ham message dataset into a DataFrame and preview the first rows.
# encoding='latin1' is passed explicitly - presumably the file is not valid UTF-8 (TODO confirm).
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs
# where spam.csv exists at exactly this location.
dataset = pd.read_csv(r'D:\email-spam-detector\spam.csv', encoding='latin1')
print(dataset.head())

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [60]:
# (rows, columns) of the raw frame, still including the 'Unnamed: 2..4' columns
dataset.shape

(5572, 5)

In [61]:
# Keep only the label and message columns, under descriptive names.
# Renaming first and then selecting by the *new* names keeps this cell re-runnable:
# rename() ignores mapping keys that are not present, so running the cell a second
# time is a no-op instead of a KeyError on 'v1'/'v2'.
dataset = dataset.rename(columns={'v1': 'label', 'v2': 'message'})[['label', 'message']]
print(dataset.head())

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [62]:
# Confirm that only the two kept columns ('label', 'message') remain
dataset.shape

(5572, 2)

In [63]:
!pip install nltk





[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [64]:
import nltk
# Fetch the NLTK 'stopwords' corpus that stopwords.words('english') reads in the
# preprocessing cell. When the package is already present, download() just reports
# that it is up to date and returns True.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sajee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [65]:
# Data preprocessing: imports and shared objects used by clean_text below
import string   # Provides string.punctuation (the ASCII punctuation characters)
import re   # Regular expressions for string matching and substitution
from nltk.corpus import stopwords   # Access to the list of common English stop words ('the', 'is', ...)
from nltk.stem import WordNetLemmatizer # Lemmatization; as called in clean_text (no part-of-speech tag) e.g. 'goes' -> 'go', 'lives' -> 'life'
# A set, so the `word not in stop_words` check in clean_text is a hash lookup
stop_words = set(stopwords.words('english'))
# Single lemmatizer instance shared by every clean_text call
lemmatizer = WordNetLemmatizer()
# Define the cleaning function
def clean_text(text):
    """Normalise a raw message into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase, replace ASCII punctuation with spaces, replace
    runs of digits with spaces, collapse whitespace, drop English stop words and
    lemmatize the remaining words.

    Args:
        text: The raw message. Any non-string value (for example ``None`` or the
            float NaN that pandas uses for an empty cell) is treated as an
            empty message.

    Returns:
        The cleaned message as a single string; ``""`` if nothing survives
        cleaning or the input was not a string.
    """
    if not isinstance(text, str):
        # Without this guard a missing message crashes on .lower() with
        # AttributeError and aborts the whole .apply() over the column.
        return ""
    text = text.lower()  # convert to lowercase
    # Punctuation becomes a space (not ""), so "a,b" splits into two tokens
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    text = re.sub(r'\d+', ' ', text)  # remove digits
    text = re.sub(r'\s+', ' ', text)  # collapse runs of whitespace
    # Stop words are filtered before lemmatizing; lemmatize() is called without a
    # part-of-speech tag.
    return ' '.join(
        lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words
    )

In [66]:
# Fetch the NLTK 'wordnet' corpus for the WordNetLemmatizer used inside clean_text;
# run this before clean_text is applied to the data in the next cell.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sajee\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [67]:
# Apply the cleaning function to the 'message' column only; 'label' is left untouched.
# NOTE: the raw text is overwritten in place, so the original messages are no longer
# available in `dataset` after this cell runs.
dataset['message'] = dataset['message'].apply(clean_text)
print(dataset.head())

  label                                            message
0   ham  go jurong point crazy available bugis n great ...
1   ham                            ok lar joking wif u oni
2  spam  free entry wkly comp win fa cup final tkts st ...
3   ham                u dun say early hor u c already say
4   ham                nah think go usf life around though


In [68]:
# Label encoding: convert the string labels into integers (ham = 0, spam = 1)
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
# Overwrites the 'label' column with the integer codes; label_encoder keeps the fitted mapping
dataset['label'] = label_encoder.fit_transform(dataset['label'])
print(dataset.head())

   label                                            message
0      0  go jurong point crazy available bugis n great ...
1      0                            ok lar joking wif u oni
2      1  free entry wkly comp win fa cup final tkts st ...
3      0                u dun say early hor u c already say
4      0                nah think go usf life around though


In [69]:
!pip install seaborn





[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# Train and evaluate a spam classifier on the cleaned messages
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Features are the cleaned message text, targets the integer-encoded labels
X = dataset['message']
y = dataset['label']

# Hold out 20% of the rows for testing; stratify on the label and fix the seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# TF-IDF is fitted on the training split only and then reused to transform the test split
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Logistic regression; class_weight='balanced' is used since the dataset is mostly ham.
# (MultinomialNB from sklearn.naive_bayes is an alternative classifier for this step.)
model = LogisticRegression(class_weight='balanced').fit(X_train_vectorized, y_train)

# Score the held-out split
y_pred = model.predict(X_test_vectorized)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9766816143497757
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       966
           1       0.91      0.91      0.91       149

    accuracy                           0.98      1115
   macro avg       0.95      0.95      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Confusion Matrix:
 [[953  13]
 [ 13 136]]


In [71]:
import pickle

# Persist the trained model and the fitted vectorizer, one pickle file each.
# The vectorizer was fitted on text already passed through clean_text, so code that
# loads these files should clean its input the same way before calling transform().
for filename, artifact in (("model.pkl", model), ("vectorizer.pkl", vectorizer)):
    with open(filename, "wb") as f:
        pickle.dump(artifact, f)
