# Data Loading & Text Preprocessing

In [2]:
# Import required libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download stopwords from NLTK
nltk.download('stopwords')

# 1. Load the dataset (Using SMS Spam Collection dataset)
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_csv(url, sep='\t', header=None, names=['label', 'message'])

# 2. Map target labels to numerical values (ham: 0, spam: 1)
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})

# Initialize stemmer and define English stop words
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# 3. Define the text preprocessing function
def clean_text(text):
    # Remove special characters, numbers, and punctuations
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Convert all text to lowercase
    text = text.lower()
    # Tokenize the text into individual words
    text = text.split()
    # Remove stop words and apply stemming
    text = [stemmer.stem(word) for word in text if word not in stop_words]
    # Join the processed words back into a single string
    return ' '.join(text)

# 4. Apply the preprocessing function to the dataset
print("Cleaning the text data. Please wait...")
df['clean_message'] = df['message'].apply(clean_text)

print("Data preprocessing completed successfully.")
print(df[['message', 'clean_message']].head())

                                             message  \
0  Go until jurong point, crazy.. Available only ...   
1                      Ok lar... Joking wif u oni...   
2  Free entry in 2 a wkly comp to win FA Cup fina...   
3  U dun say so early hor... U c already then say...   
4  Nah I don't think he goes to usf, he lives aro...   

                                       clean_message  
0  go jurong point crazi avail bugi n great world...  
1                              ok lar joke wif u oni  
2  free entri wkli comp win fa cup final tkt st m...  
3                u dun say earli hor u c alreadi say  
4               nah think goe usf live around though  


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 1. Feature Extraction using TF-IDF Vectorizer
# Limiting to top 3000 features for optimal performance
tfidf = TfidfVectorizer(max_features=3000)
X = tfidf.fit_transform(df['clean_message']).toarray()
y = df['label_num']

# 2. Split the dataset into Training (80%) and Testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Initialize and train the Naive Bayes Classifier
model = MultinomialNB()
model.fit(X_train, y_train)

# 4. Make predictions on the testing set
predictions = model.predict(X_test)

# 5. Evaluate the model's performance
print("Model Evaluation Metrics:\n")
print(f"Accuracy Score: {accuracy_score(y_test, predictions):.4f}\n")

print("Classification Report:")
print(classification_report(y_test, predictions, target_names=['Ham (Not Spam)', 'Spam']))

print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))

Model Evaluation Metrics:

Accuracy Score: 0.9812

Classification Report:
                precision    recall  f1-score   support

Ham (Not Spam)       0.98      1.00      0.99       966
          Spam       0.99      0.87      0.92       149

      accuracy                           0.98      1115
     macro avg       0.99      0.93      0.96      1115
  weighted avg       0.98      0.98      0.98      1115

Confusion Matrix:
[[965   1]
 [ 20 129]]
