In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Read the SMS corpus from its raw GitHub URL. The file is parsed as
# tab-separated with no header row; the two column names are supplied here.
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_csv(
    url,
    sep='\t',
    header=None,
    names=['label', 'message'],
)

In [3]:
# Map labels to binary values (ham -> 0, spam -> 1).
# The guard keeps this cell safe to re-run: calling `.map` a second time on
# labels that are already 0/1 would silently turn every label into NaN.
label_to_id = {'ham': 0, 'spam': 1}
if not df['label'].isin(list(label_to_id.values())).all():
    encoded_labels = df['label'].map(label_to_id)
    # Any value outside the mapping comes back as NaN; fail loudly instead of
    # letting NaN labels flow into the train/test split and the classifier.
    unknown_labels = df.loc[encoded_labels.isna(), 'label'].unique()
    if len(unknown_labels) > 0:
        raise ValueError(f"Unexpected label values: {unknown_labels.tolist()}")
    df['label'] = encoded_labels.astype('int64')

In [4]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=0.3,
    random_state=42,
)

In [9]:
# Chain the three stages into a single estimator:
# token counts (CountVectorizer) -> TF-IDF weighting (TfidfTransformer)
# -> multinomial Naive Bayes classifier (MultinomialNB).
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
pipeline = Pipeline(steps=pipeline_steps)

In [6]:
# Fit every pipeline stage on the training messages and their labels.
pipeline.fit(X=X_train, y=y_train)

In [7]:
# Run the fitted pipeline over the held-out messages to get predicted labels.
y_pred = pipeline.predict(X=X_test)

In [8]:
# Compare the predictions with the true test labels using three metrics:
# overall accuracy, the confusion matrix, and the classification report.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
class_report = classification_report(y_true=y_test, y_pred=y_pred)

In [10]:
# Print the test-set accuracy held in `accuracy`.
print(f'Accuracy: {accuracy}')

Accuracy: 0.9629186602870813


In [11]:
# Print a heading line; the matrix itself is printed by a separate cell.
print('Confusion Matrix:')

Confusion Matrix:


In [12]:
# Print the confusion matrix built from y_test and y_pred.
print(conf_matrix)

[[1448    0]
 [  62  162]]


In [13]:
# Print a heading line; the report itself is printed by a separate cell.
print('Classification Report:')

Classification Report:


In [14]:
# Print the classification report text built from y_test and y_pred.
print(class_report)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1448
           1       1.00      0.72      0.84       224

    accuracy                           0.96      1672
   macro avg       0.98      0.86      0.91      1672
weighted avg       0.96      0.96      0.96      1672

