In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# 1. Load the Dataset
# The SMS Spam Collection is one publicly available dataset suited to this task.
# On Colab, upload the file (or fetch it from a URL) and load it, for example:
#   df = pd.read_csv('spam.csv', encoding='latin-1')  # adjust the encoding if needed
# A common format for that dataset has two columns: 'v1' for label and 'v2' for message.
# The small in-memory sample below lets the notebook run without any file.


data = {
    'label': ['ham', 'spam', 'ham', 'spam', 'ham', 'spam', 'ham', 'ham', 'spam', 'ham'],
    'message': [
        'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat?',
        'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C\'s apply 08452810075 fordetails',
        'U dun say so early hor... U c already then say...',
        'WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.',
        'Nah I don\'t think he goes to usf, he lives around here though',
        'Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030',
        'I\'m gonna be home soon and i don\'t want to talk about this stuff anymore tonight, k? I\'ve cried enough today.',
        'I HAVE A DATE ON SUNDAY WITH WILL!!',
        'SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days/wk, TsandCs apply Reply HL 4 info',
        'As per your request \'Melle Melle (Oru Minnaminunginte Nurungu Vettam)\' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune'
    ]
}

# 2. Preprocess the Data
# One chained expression: build the frame, rename the columns for clarity,
# then convert the labels to numerical format (0 for ham, 1 for spam).
df = (
    pd.DataFrame(data)
    .rename(columns={'label': 'Category', 'message': 'Message'})
    .assign(Category=lambda frame: frame['Category'].map({'ham': 0, 'spam': 1}))
)




In [None]:
# 3. Split Data into Training and Testing Sets
X = df['Message']
y = df['Category']
# stratify=y keeps the ham/spam proportions the same in both splits. Without it,
# a small or imbalanced dataset can produce a test set that contains only one
# class, which makes precision, recall and the confusion matrix uninformative.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 4. Text Vectorization (Feature Extraction)
# Use CountVectorizer to convert text messages into numerical feature vectors (word counts).
# This creates a "bag-of-words" model.
vectorizer = CountVectorizer()
# The vocabulary is learned from the training messages only.
X_train_vectors = vectorizer.fit_transform(X_train)
# Use transform, not fit_transform, on test data so the vocabulary learned
# above is reused rather than refit on the test messages.
X_test_vectors = vectorizer.transform(X_test)

# You can inspect the vocabulary and shape of the vectors
# print(vectorizer.get_feature_names_out())
# print(X_train_vectors.shape)


In [None]:
# 5. Train the Multinomial Naive Bayes Model
# alpha=1.0 applies Laplace smoothing. fit() returns the fitted estimator,
# so construction and training are chained into a single assignment.
mnb_classifier = MultinomialNB(alpha=1.0).fit(X_train_vectors, y_train)

# 6. Make Predictions
# Predicted labels for the held-out test vectors (0 for ham, 1 for spam).
y_pred = mnb_classifier.predict(X_test_vectors)

In [None]:
# 7. Evaluate the Model
# Precision, recall and F1 are undefined when there are no predicted (or no
# actual) positives, which is easy to hit on a very small test split.
# zero_division=0 makes the 0.0 fallback explicit instead of relying on the
# default behaviour, which reports 0 and emits an UndefinedMetricWarning.
print("--- Model Evaluation ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, zero_division=0):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, zero_division=0):.4f}")
print(f"F1-Score: {f1_score(y_test, y_pred, zero_division=0):.4f}")

In [None]:
# Confusion Matrix
# labels=[0, 1] pins the matrix to 2x2 in ham/spam order. Without it, the
# matrix only contains classes that occur in y_test or y_pred, so a test split
# with a single class would give a 1x1 matrix that no longer matches the
# hard-coded 'Ham'/'Spam' tick labels.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Explicit figure/axes so the heatmap and its labels target the same axes.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Ham', 'Spam'],
    yticklabels=['Ham', 'Spam'],
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# 8. Test with New Messages
def predict_spam(message):
    """Classify a single message and return "Spam" or "Ham".

    The message is wrapped in a one-element list, converted to a feature
    vector with the notebook-level ``vectorizer``, and passed to the
    notebook-level ``mnb_classifier``.

    Args:
        message: The text to classify.

    Returns:
        "Spam" if the predicted label equals 1, otherwise "Ham".
    """
    features = vectorizer.transform([message])
    predicted_label = mnb_classifier.predict(features)[0]
    if predicted_label == 1:
        return "Spam"
    return "Ham"

print("\n--- Testing with Custom Messages ---")
test_message1 = "Congratulations! You've won a free iPhone. Click this link now!"
test_message2 = "Hey, let's meet up for coffee tomorrow."
test_message3 = "Your bank account has been compromised. Verify your details immediately."
# The fourth message gets its own name so it does not overwrite test_message3.
test_message4 = "rob the bank."

# One loop replaces four copy-pasted print statements; the output lines and
# their order are the same.
for custom_message in (test_message1, test_message2, test_message3, test_message4):
    print(f"'{custom_message}' is classified as: {predict_spam(custom_message)}")