In [6]:
import pandas as pd


# Load the dataset
file_path = "spam_ham_dataset.csv"
df = pd.read_csv(file_path)

# Print the schema summary on its own line. DataFrame.info() writes to stdout
# and returns None, so pairing it with df.head() in a tuple makes the cell
# render "(None, <plain-text frame>)" instead of a proper table.
df.info()

# Keep the preview as the cell's last expression so Jupyter renders it richly
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


(None,
    Unnamed: 0 label                                               text  \
 0         605   ham  Subject: enron methanol ; meter # : 988291\r\n...   
 1        2349   ham  Subject: hpl nom for january 9 , 2001\r\n( see...   
 2        3624   ham  Subject: neon retreat\r\nho ho ho , we ' re ar...   
 3        4685  spam  Subject: photoshop , windows , office . cheap ...   
 4        2030   ham  Subject: re : indian springs\r\nthis deal is t...   
 
    label_num  
 0          0  
 1          0  
 2          0  
 3          1  
 4          0  )

In [7]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Download stopwords if not available
nltk.download('stopwords')

# Load the dataset
file_path ="spam_ham_dataset.csv"  # Path to uploaded dataset
df = pd.read_csv(file_path)

# Inspect dataset columns
print(df.head())

# Fail fast, with a readable message, when the columns used below are missing
# (raising instead of exit(), which would tear down the notebook kernel).
missing_columns = {'label', 'text'} - set(df.columns)
if missing_columns:
    raise ValueError(
        f"Check the column names. Missing expected columns: {sorted(missing_columns)}"
    )

# Convert labels to binary (0 for ham, 1 for spam)
label_map = {'ham': 0, 'spam': 1}
encoded_labels = df['label'].map(label_map)

# Series.map turns every value that is not a key of label_map into NaN; surface
# that here rather than letting NaN targets reach the classifier.
if encoded_labels.isna().any():
    unexpected = sorted(df.loc[encoded_labels.isna(), 'label'].astype(str).unique())
    raise ValueError(f"Unexpected values in 'label' column: {unexpected}")

df['label'] = encoded_labels

# Function to preprocess text
# Translation table that deletes every ASCII punctuation character; built once
# and reused for every email instead of being rebuilt on each call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

# Lazily filled cache so the NLTK stop-word list is read only once per session.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them on first use only."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text, stop_words=None):
    """Normalise one email body for vectorization.

    Steps, in order: lowercase the text, delete every run of digits, delete
    ASCII punctuation, split on whitespace, drop stop words, and re-join the
    remaining words with single spaces.

    Args:
        text: The raw text to clean (a ``str``).
        stop_words: Optional container of words to drop. When omitted, the
            NLTK English stop-word list is used (loaded once and cached).

    Returns:
        The cleaned text as a single space-separated string.
    """
    if stop_words is None:
        stop_words = _english_stop_words()

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(_PUNCTUATION_TABLE)  # Remove punctuation
    # NOTE(review): punctuation is stripped before the stop-word comparison, so
    # any stop word that itself contains punctuation can no longer match —
    # confirm whether that is intended.
    return " ".join(word for word in text.split() if word not in stop_words)

# Clean every email body and keep the result in its own column
df['clean_text'] = df['text'].apply(preprocess_text)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Learn the TF-IDF vocabulary from the training split only (capped at 5000
# features), then project the held-out split onto that same vocabulary
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit the Naïve Bayes classifier; fit() hands back the estimator itself
model = MultinomialNB().fit(X_train_tfidf, y_train)

# Score the held-out emails
y_pred = model.predict(X_test_tfidf)

# Report accuracy, per-class metrics and the confusion matrix
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


   Unnamed: 0 label                                               text  \
0         605   ham  Subject: enron methanol ; meter # : 988291\r\n...   
1        2349   ham  Subject: hpl nom for january 9 , 2001\r\n( see...   
2        3624   ham  Subject: neon retreat\r\nho ho ho , we ' re ar...   
3        4685  spam  Subject: photoshop , windows , office . cheap ...   
4        2030   ham  Subject: re : indian springs\r\nthis deal is t...   

   label_num  
0          0  
1          0  
2          0  
3          1  
4          0  


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yashw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model Accuracy: 0.9536

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97       742
           1       0.92      0.92      0.92       293

    accuracy                           0.95      1035
   macro avg       0.94      0.94      0.94      1035
weighted avg       0.95      0.95      0.95      1035


Confusion Matrix:
 [[717  25]
 [ 23 270]]


In [8]:
import joblib
# Persist the fitted classifier and the fitted TF-IDF vectorizer to the current
# working directory. Both files are needed later: new text has to be transformed
# by this same vectorizer before the classifier can score it.
joblib.dump(model, "spam_classifier.pkl")
# Last expression in the cell, so its return value is what the notebook displays.
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")


['tfidf_vectorizer.pkl']

In [9]:
import joblib

# Restore the persisted classifier and TF-IDF vectorizer from disk, in that order.
# Note that this rebinds the name `vectorizer` to the deserialised copy.
spam_model, vectorizer = (
    joblib.load(artifact_path)
    for artifact_path in ("spam_classifier.pkl", "tfidf_vectorizer.pkl")
)

# Function to classify a new email
def classify_email(email_text, threshold=50.0):
    """Score a single email with the loaded spam classifier.

    The text is first passed through ``preprocess_text`` so that it is
    normalised the same way as the data the vectorizer and model were fitted
    on; vectorizing the raw text would feed the model tokens in a different
    form from the ones it was trained with.

    Args:
        email_text: Raw text of the email to classify.
        threshold: Spam probability, in percent, above which the email is
            labelled "Spam". Defaults to 50.

    Returns:
        A dict with:
            "spam_probability": spam probability in percent, rounded to two
                decimals, as a plain Python float.
            "prediction": "Spam" if the probability exceeds ``threshold``,
                otherwise "Not Spam".

    Raises:
        ValueError: If the loaded model has no class labelled ``1`` (spam).
    """
    cleaned_text = preprocess_text(email_text)
    email_tfidf = vectorizer.transform([cleaned_text])  # Convert text to TF-IDF

    # Look the spam column up by label rather than assuming it sits at index 1;
    # predict_proba columns follow the order of model.classes_.
    spam_column = list(spam_model.classes_).index(1)

    # float() turns the NumPy scalar into a built-in float so the result prints
    # and serialises cleanly.
    spam_prob = float(spam_model.predict_proba(email_tfidf)[0][spam_column]) * 100
    return {
        "spam_probability": round(spam_prob, 2),
        "prediction": "Spam" if spam_prob > threshold else "Not Spam",
    }

# Test on a new email: a hand-written sample message is scored and the
# resulting dict (spam probability in percent plus the label) is printed.
email_text = "Congratulations! You won a free iPhone. Click here to claim."
print(classify_email(email_text))


{'spam_probability': 84.47, 'prediction': 'Spam'}


In [10]:
# Score one more hand-written message directly with the in-memory model
new_email = ["Dear all Here is the Result of fourth day matches first match played between kingfisher vs shadow strikers "]

# Clean the text with the same preprocess_text step that was applied to the
# training data before vectorizing, so the model sees tokens in the form it
# was trained on.
new_email_tfidf = vectorizer.transform([preprocess_text(text) for text in new_email])
prediction = model.predict(new_email_tfidf)
spam_prob = model.predict_proba(new_email_tfidf)[0][1] * 100  # Get spam probability

# A prediction of 1 means spam (labels were encoded as ham=0, spam=1)
print(f"Prediction: {'Spam' if prediction[0] else 'Not Spam'}")
print(f"Spam Probability: {spam_prob:.2f}%")


Prediction: Not Spam
Spam Probability: 36.34%
