In [13]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load your dataset
# Make sure your CSV has two columns: 'email_text' and 'label' (0 = not spam, 1 = spam)
data = pd.read_csv('/content/spam.csv')  # Replace with your dataset path

# Step 2: Separate features and labels
X = data['Message']
y = data['Category']

# Step 3: Convert text to numerical data using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
X_vec = vectorizer.fit_transform(X)

# Step 4: Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

# Step 5: Train the Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)

# Step 6: Evaluate the model
y_pred = model.predict(X_test)
print("Model Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Step 7: Define a function to predict if a new email is spam or not
def predict_email(text):
    # Convert input email text to TF-IDF vector
    text_vec = vectorizer.transform([text])
    # Predict label
    prediction = model.predict(text_vec)
    return "Spam" if prediction[0] == 'spam' else "Not Spam"

# Step 8: Test the prediction function
sample_email_1 = "Congratulations! You've won a $1000 gift card. Click here to claim now."
sample_email_2 = "Hi team, please find the meeting agenda attached."

print("\nSample Email 1 Prediction:", predict_email(sample_email_1))
print("Sample Email 2 Prediction:", predict_email(sample_email_2))

Model Accuracy: 0.9775784753363229

Classification Report:
               precision    recall  f1-score   support

         ham       0.97      1.00      0.99       966
        spam       1.00      0.83      0.91       149

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115


Sample Email 1 Prediction: Spam
Sample Email 2 Prediction: Not Spam


In [17]:
# Filter the dataset to get spam and ham emails
spam_emails = data[data['Category'] == 'spam']['Message']
ham_emails = data[data['Category'] == 'ham']['Message']

# Select a few of each to test
num_samples_per_category = 10 # You can adjust this number

sample_spam = spam_emails.sample(num_samples_per_category, random_state=42)
sample_ham = ham_emails.sample(num_samples_per_category, random_state=42)

# Combine the samples
combined_samples = pd.concat([sample_spam, sample_ham])

# Predict the category for each selected email
print("Predictions for a mix of spam and not spam emails:")
for email in combined_samples:
    prediction = predict_email(email)
    print(f"Email: {email[:50]}... - Prediction: {prediction}")

Predictions for a mix of spam and not spam emails:
Email: Summers finally here! Fancy a chat or flirt with s... - Prediction: Spam
Email: This is the 2nd time we have tried 2 contact u. U ... - Prediction: Spam
Email: Get ur 1st RINGTONE FREE NOW! Reply to this msg wi... - Prediction: Spam
Email: Ur cash-balance is currently 500 pounds - to maxim... - Prediction: Spam
Email: Last Chance! Claim ur £150 worth of discount vouch... - Prediction: Spam
Email: You can stop further club tones by replying "STOP ... - Prediction: Spam
Email: For ur chance to win a £250 cash every wk TXT: ACT... - Prediction: Spam
Email: WIN a year supply of CDs 4 a store of ur choice wo... - Prediction: Spam
Email: YOU HAVE WON! As a valued Vodafone customer our co... - Prediction: Spam
Email: sports fans - get the latest sports news str* 2 ur... - Prediction: Spam
Email: If i not meeting ü all rite then i'll go home lor.... - Prediction: Not Spam
Email: I.ll always be there, even if its just in spirit. ... - Pr