In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import re
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Text cleaning helpers
_ENGLISH_STOP_WORDS = None  # filled on first use so the NLTK corpus is read only once


def _english_stop_words():
    """Return the NLTK English stop-word list as a cached frozenset.

    Every stop word is passed through the same character filter that
    clean_text applies to its input. Without this, contractions in the list
    (e.g. "don't") could never match a token, because the apostrophe has
    already been stripped from the text ("dont") by the time tokens are compared.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(
            re.sub(r'[^a-z0-9\s]', '', word.lower())
            for word in stopwords.words('english')
        )
    return _ENGLISH_STOP_WORDS


def clean_text(text, stop_words=None):
    """Normalize a piece of text for vectorization.

    The text is lowercased, every character other than a-z, 0-9 and whitespace
    is removed, stop words are dropped, and the remaining tokens are joined
    with single spaces.

    Args:
        text: The raw text. None and float NaN (what pandas yields for an empty
            CSV cell) are treated as empty text; any other non-string value is
            converted with str() first.
        stop_words: Optional collection of lowercase words to remove, used
            exactly as given. When omitted, the cached NLTK English list is used.

    Returns:
        The cleaned text as a single space-separated string; '' when nothing
        survives cleaning.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    if stop_words is None:
        stop_words = _english_stop_words()

    # Lowercase, then drop everything except letters, digits and whitespace.
    text = re.sub(r'[^a-z0-9\s]', '', text.lower())
    # split() with no argument already collapses whitespace runs and trims both
    # ends, so no separate whitespace-normalization pass is required.
    tokens = [word for word in text.split() if word not in stop_words]
    return ' '.join(tokens)


In [None]:
# Load dataset (ensure the file 'emails.csv' is in your working directory)
dataset_path = 'emails.csv'
df = pd.read_csv(dataset_path, encoding='latin-1')

# Check dataset info and preview.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()
# Leaving the preview as the cell's last expression lets the notebook render it
# as a formatted table.
df.head()


In [None]:
# Apply text cleaning to the email content.
# An empty cell in the CSV is loaded by pandas as NaN (a float), which has no
# .lower() method; replace missing values with '' and coerce every value to str
# so the cleaning function always receives a string.
df['clean_text'] = df['Text'].fillna('').astype(str).apply(clean_text)


In [None]:
# Model input is the cleaned text; the label to predict is the 'Spam' column
X, y = df['clean_text'], df['Spam']

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions aligned across both partitions, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
# Cap on the TF-IDF vocabulary size
MAX_TFIDF_FEATURES = 5000
vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)

# The vectorizer is fitted on the training split only; the test split is
# transformed with that already-fitted vectorizer
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [None]:
# Fit a Multinomial Naive Bayes model on the TF-IDF training matrix and labels
nb_classifier = MultinomialNB()
nb_classifier.fit(X=X_train_tfidf, y=y_train)


In [None]:
# Predict labels for the held-out test matrix
y_pred = nb_classifier.predict(X_test_tfidf)

# Score the predictions against the true test labels with each metric in turn
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the headline metrics, followed by the per-class breakdown
print('Model Performance:')
for label, value in (
    ('Accuracy:', accuracy),
    ('Precision:', precision),
    ('Recall:', recall),
    ('F1-Score:', f1),
):
    print(label, value)
print('\nClassification Report:\n', classification_report(y_test, y_pred))


In [None]:
# Persist the fitted classifier and the fitted vectorizer, each to its own file,
# so both can be reloaded later
for artifact, filename in (
    (nb_classifier, 'spam_classifier.pkl'),
    (vectorizer, 'tfidf_vectorizer.pkl'),
):
    joblib.dump(artifact, filename)
print('Model and vectorizer saved successfully!')
