In [None]:
import pandas as pd # importing necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Load the dataset: keep only the first two CSV columns, skip the first line of
# the file (presumably its header row), and name the columns label / message.
df = pd.read_csv(
    "spam.csv",
    encoding='ISO-8859-1',
    usecols=[0, 1],
    names=['label', 'message'],
    skiprows=1,
)
print(df.head())

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [None]:
# Show the class balance, then encode the labels as integers (ham -> 0, spam -> 1).
print(df['label'].value_counts())

label_codes = {'ham': 0, 'spam': 1}

# Guard keeps this cell idempotent: df['label'] is overwritten in place, so
# re-running .map() on the already-encoded 0/1 values would turn every label
# into NaN (Series.map returns NaN for values missing from the mapping).
if not pd.api.types.is_numeric_dtype(df['label']):
    encoded_labels = df['label'].map(label_codes)
    # Fail loudly on unexpected label strings instead of passing NaN targets
    # on to the train/test split and the classifier.
    if encoded_labels.isna().any():
        unknown_labels = df.loc[encoded_labels.isna(), 'label'].unique().tolist()
        raise ValueError(f"Unexpected label values: {unknown_labels}")
    df['label'] = encoded_labels

label
ham     4825
spam     747
Name: count, dtype: int64


In [None]:
# Hold out 20% of the rows as a test set; random_state pins the shuffle seed.
messages, labels = df['message'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    messages,
    labels,
    test_size=0.2,
    random_state=42,
)

# Feature extraction and model training

In [None]:
# Converting text data to numerical data using CountVectorizer
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Training a Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train_counts, y_train)

# Making predictions
y_pred = model.predict(X_test_counts)

In [None]:
# Score the held-out predictions against the true labels.
accuracy = accuracy_score(y_test, y_pred)
# scikit-learn convention: rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", report)

Accuracy: 0.9838565022421525
Confusion Matrix:
 [[963   2]
 [ 16 134]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.89      0.94       150

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

Prediction for sample message: Spam


In [None]:
# Custom message 1: encode it with the fitted vectorizer and classify it.
sample_message = ["Congratulations! You have won a free ticket to Bahamas."]
sample_message_counts = vectorizer.transform(sample_message)
prediction = model.predict(sample_message_counts)

# Label 1 is spam (per the ham/spam encoding above); anything else prints as ham.
if prediction[0] == 1:
    verdict = "Spam"
else:
    verdict = "Ham"
print("Prediction for sample message:", verdict)

Prediction for sample message: Spam


In [None]:
# Custom message 2: encode it with the fitted vectorizer and classify it.
sample_message = ["Meet me at 5 in the auditorium."]
sample_message_counts = vectorizer.transform(sample_message)
prediction = model.predict(sample_message_counts)

# Index 1 ("Spam") is selected only when the predicted label equals 1;
# every other value falls back to index 0 ("Ham").
verdict_names = ("Ham", "Spam")
print("Prediction for sample message:", verdict_names[int(prediction[0] == 1)])


Prediction for sample message: Ham
