In [14]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import nltk

In [15]:

# Download NLTK resources
# 'stopwords' supplies the list read by stopwords.words('english') and
# 'wordnet' is the corpus WordNetLemmatizer looks words up in; both are used
# by preprocess_text. The second call is left as the cell's last expression,
# so its return value is what the notebook displays for this cell.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [16]:
# Load Dataset
# Read the CSV and replace missing cells (NaN) with empty strings in a single
# expression, so the frame is never observed in a half-cleaned state.
raw_data = pd.read_csv('/content/drive/MyDrive/mail_dataset.csv').fillna('')


In [18]:
# Text Preprocessing Function
# Holds a single (lemmatizer, stop_words) tuple once it has been built, so the
# NLTK corpora are loaded once instead of once per message.
_TEXT_RESOURCES = []


def _get_text_resources():
    """Return the shared (WordNetLemmatizer, stop-word frozenset) pair, building it on first use."""
    if not _TEXT_RESOURCES:
        # Build both objects before caching so a failed corpus lookup never
        # leaves a half-initialised entry behind.
        resources = (WordNetLemmatizer(), frozenset(stopwords.words('english')))
        _TEXT_RESOURCES.append(resources)
    return _TEXT_RESOURCES[0]


def preprocess_text(text):
    """Normalise one message for vectorisation.

    Every character outside a-z/A-Z is replaced by a space, the text is
    lower-cased and split on whitespace, English stop words are dropped and
    the remaining tokens are lemmatised with WordNet.

    Args:
        text: The message. Strings are processed as-is; ``None`` and float
            NaN are treated as an empty message; any other value is
            converted with ``str()`` first (the original raised ``TypeError``
            for all non-string input).

    Returns:
        The surviving lemmas joined by single spaces, or ``''`` when nothing
        is left.
    """
    # Missing values: None, or a float NaN (NaN is the only float != itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Clean text: remove special characters and numbers, convert to lowercase
    words = re.sub(r'[^a-zA-Z]', ' ', text).lower().split()
    if not words:
        # Nothing alphabetic survived; skip loading the NLTK resources.
        return ''

    lemmatizer, stop_words = _get_text_resources()
    return ' '.join(
        lemmatizer.lemmatize(word) for word in words if word not in stop_words
    )

In [19]:
# Keep untouched copies of the source columns the first time this cell runs.
# Every (re-)run then derives 'Message' and 'Category' from those copies, so
# the cell is idempotent: previously a second run re-processed already-cleaned
# text and, because no encoded label equals 'spam', turned every label into 1.
for column in ('Message', 'Category'):
    raw_column = f'{column}_raw'
    if raw_column not in raw_data.columns:
        raw_data[raw_column] = raw_data[column]

# Apply preprocessing
raw_data['Message'] = raw_data['Message_raw'].apply(preprocess_text)

# Label Encoding: 'spam' -> 0, every other value -> 1
raw_data['Category'] = raw_data['Category_raw'].apply(lambda x: 0 if x == 'spam' else 1)



In [20]:
# Splitting Data
# Features come from the 'Message' column, targets from 'Category' cast to int.
X, Y = raw_data['Message'], raw_data['Category'].astype(int)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)



In [21]:
# TF-IDF Vectorization with parameter tuning
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),     # build features from unigrams and bigrams
    stop_words='english',   # drop scikit-learn's built-in English stop words
    min_df=2,               # skip terms found in fewer than 2 documents
    max_df=0.9,             # skip terms found in more than 90% of documents
    max_features=5000,      # cap the vocabulary at 5000 terms
)

In [22]:
# Learn the vocabulary and IDF weights from the training messages only, then
# project the test messages with that fitted vectorizer (the tuple is
# evaluated left to right, so the fit happens before the test transform).
X_train_fv, X_test_fv = tfidf.fit_transform(X_train), tfidf.transform(X_test)

In [23]:
# Model Initialization and Hyperparameter Tuning
# Candidate settings: three values of C (inverse regularization strength)
# crossed with two solvers, i.e. 6 combinations.
param_grid = dict(
    C=[0.1, 1, 10],
    solver=['liblinear', 'lbfgs'],
)
model = LogisticRegression()

# 5-fold cross-validated search over the grid, ranked by accuracy.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
)
grid_search.fit(X_train_fv, Y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [24]:
# Best Model: the estimator GridSearchCV refit with the top-scoring parameters
best_model = grid_search.best_estimator_

# Model Evaluation on the held-out test features
predictions = best_model.predict(X_test_fv)
accuracy = accuracy_score(Y_test, predictions)
# Confusion Matrix: rows are true labels, columns are predicted labels
cm = confusion_matrix(Y_test, predictions)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n", classification_report(Y_test, predictions))
print("\nConfusion Matrix:\n", cm)


Accuracy: 98.39%

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.89      0.94       149
           1       0.98      1.00      0.99       966

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115


Confusion Matrix:
 [[132  17]
 [  1 965]]
