In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
from sklearn.model_selection import train_test_split as tts
from sklearn.feature_extraction.text import CountVectorizer as CV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report as cr
from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import accuracy_score as AS

# Load and preprocess data
df = pd.read_csv('email_spam.csv', encoding='latin1')
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

def preProcessing(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.strip()
    return text

df['message'] = df['message'].apply(preProcessing)

# Feature extraction
X = df['message']
y = df['label']
vectorizer = CV()
X = vectorizer.fit_transform(X)

# Train-test split
Xtrain, Xtest, ytrain, ytest = tts(X, y, test_size=0.2, random_state=42)

# Ensemble Voting Classifier
nb = MultinomialNB()
rf = RandomForestClassifier(n_estimators=100, random_state=42)
lr = LogisticRegression(max_iter=1000)

ensemble_model = VotingClassifier(estimators=[
    ('nb', nb), ('rf', rf), ('lr', lr)
], voting='hard')

ensemble_model.fit(Xtrain, ytrain)

# Evaluation
pred = ensemble_model.predict(Xtest)
print("Accuracy:", AS(ytest, pred))
print("Confusion Matrix:\n", cm(ytest, pred))
print("Classification Report:\n", cr(ytest, pred))

# Testing new input
new_inp = input("Enter a message to check:\n")
count = vectorizer.transform([new_inp])
res = ensemble_model.predict(count)
print("Spam\n" if res[0] == 1 else "Ham\n")


Accuracy: 0.9802690582959641
Confusion Matrix:
 [[965   0]
 [ 22 128]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       1.00      0.85      0.92       150

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115



Enter a message to check:
 Subject: Congratulations! You've won $1,000,000  Dear Winner,  Your email was randomly selected in the Global Email Lottery. You have won ONE MILLION DOLLARS!  To claim your prize, please reply with your full name, address, and phone number.  Act fast to receive your reward!  — Global Lottery Commission


Spam

