# 1.1 Spam Detection with NLTK


In [4]:
# Import libraries

import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [6]:
# Load and Preprocess the Dataset
#
# Produces `df` with a cleaned 'text' column (lowercased, tokenized,
# English stopwords removed) and defines `stop_words` for later reuse.

# Download the 'stopwords' resource
nltk.download('stopwords')

# Download the tokenizer models. Recent NLTK releases (3.9+) make
# word_tokenize look up 'punkt_tab'; older releases read 'punkt'.
# Fetching both keeps the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')

# Build the stopword set once; set membership keeps the per-token check cheap.
stop_words = set(stopwords.words('english'))


def _clean_text(text):
    """Lowercase `text`, tokenize it, drop English stopwords, and re-join.

    Returns a single string of the surviving tokens separated by one space.
    """
    tokens = word_tokenize(text.lower())
    return ' '.join(token for token in tokens if token not in stop_words)


# Load, drop incomplete rows, and clean the text in one chained expression so
# that re-running the cell always starts again from the file on disk.
# Only 'text' and 'label' are used downstream, so only those two columns
# decide whether a row is discarded.
df = (
    pd.read_csv('spam.csv')
    .dropna(subset=['text', 'label'])
    .assign(text=lambda frame: frame['text'].apply(_clean_text))
)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# Feature extraction
#
# Bag-of-words representation: every message becomes a row of token counts.
# NOTE(review): the vocabulary is learned from every row of df, including
# rows that the later train/test split assigns to the test set — confirm
# that this is acceptable for the reported scores.

# Target column
y = df['label']

# Count matrix built from the cleaned text
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['text'])


In [8]:
# Train-Test Split
#
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# partition the same from run to run.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [10]:
# Train a Naive Bayes Classifier
#
# Multinomial Naive Bayes with default settings, fitted on the training
# portion of the count matrix.

model = MultinomialNB()
model.fit(X=X_train, y=y_train)


In [11]:
# Evaluate the Model
#
# Predict on the held-out split, then print three summaries in order:
# overall accuracy, the per-class report, and the confusion matrix.

y_pred = model.predict(X_test)

# Overall accuracy, shown as a percentage with two decimals
accuracy_pct = accuracy_score(y_test, y_pred) * 100
print(f'Accuracy: {accuracy_pct:.2f}%')

# Per-class precision, recall, F1 and support
report = classification_report(y_test, y_pred)
print(report)

# Confusion matrix (rows: true labels, columns: predicted labels)
matrix = confusion_matrix(y_test, y_pred)
print(matrix)


Accuracy: 95.71%
              precision    recall  f1-score   support

           0       0.93      0.98      0.96       781
           1       0.99      0.93      0.96       896

    accuracy                           0.96      1677
   macro avg       0.96      0.96      0.96      1677
weighted avg       0.96      0.96      0.96      1677

[[769  12]
 [ 60 836]]
