In [None]:


# Step 1: Install dependencies (if not already available in Colab)
!pip install nltk

# Step 2: Import libraries
import pandas as pd
import numpy as np
import nltk
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Download stopwords
nltk.download("stopwords")
from nltk.corpus import stopwords

# Step 3: Load Dataset
# Dataset link: https://archive.ics.uci.edu/ml/datasets/sms+spam+collection
# You can also upload a CSV file in Colab
# The file is read as tab-separated with no header row, so the two column
# names ("label", "message") are supplied explicitly.
url = (
    "https://raw.githubusercontent.com/justmarkham/"
    "pycon-2016-tutorial/master/data/sms.tsv"
)
df = pd.read_table(url, header=None, names=["label", "message"])

# Quick sanity check of what was loaded: (rows, columns) and the first rows.
print("Dataset Shape:", df.shape)
print(df.head())

# Step 4: Preprocess Data
# Deletion table built once: every character in string.punctuation maps to None.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(msg):
    """Return *msg* lowercased with every ``string.punctuation`` character removed.

    Punctuation is deleted, not replaced by a space, so "you've" becomes
    "youve". Characters outside ``string.punctuation`` (letters, digits,
    whitespace, non-ASCII symbols) are kept as they are after lowercasing.

    Args:
        msg: The text to normalize; must be a ``str``.

    Returns:
        The normalized string.
    """
    return msg.lower().translate(_PUNCTUATION_TABLE)

# Normalize every message with clean_text; the cleaned text replaces the
# original "message" column.
df["message"] = df["message"].apply(clean_text)

# Step 5: Split Data
# Features are the cleaned messages, targets are the labels. 20% of the rows
# are held out for testing; the fixed random_state keeps the split repeatable.
X, y = df["message"], df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 6: Vectorize (TF-IDF)
# The messages have already been through clean_text, which deletes
# apostrophes, so contractions appear in the text as "dont", "youre", etc.
# NLTK's stop-word list spells contractions with apostrophes, which would
# never match those tokens. Add the clean_text spelling of every stop word
# (keeping the originals) so the filter agrees with the preprocessing.
nltk_stop_words = stopwords.words("english")
stop_words = sorted(
    set(nltk_stop_words) | {clean_text(word) for word in nltk_stop_words}
)
vectorizer = TfidfVectorizer(stop_words=stop_words)

# Learn the vocabulary and IDF weights from the training split only, then
# reuse that fitted vectorizer to encode the test split.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Step 7: Train Model (Naive Bayes)
# fit() returns the estimator itself, so construction and training are chained.
model = MultinomialNB().fit(X_train_vec, y_train)

# Step 8: Evaluate
# Predict labels for the held-out messages and compare with the true labels.
y_pred = model.predict(X_test_vec)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print("\n✅ Accuracy:", accuracy)
print("\n📊 Classification Report:\n", report)
print("\n📉 Confusion Matrix:\n", matrix)

# Step 9: Test Custom Messages
test_msgs = [
    "Congratulations! You've won a $1000 Walmart gift card. Click here to claim now!",
    "Hey, are we still meeting for lunch today?",
    "URGENT! Your account has been suspended. Verify immediately."
]

# The vectorizer was fitted on text that had been passed through clean_text,
# so new messages must get the same normalization before being encoded.
# Otherwise a word like "You've" is tokenized differently here than it was
# during training and its learned weight is not used.
cleaned_msgs = [clean_text(msg) for msg in test_msgs]

test_vec = vectorizer.transform(cleaned_msgs)
predictions = model.predict(test_vec)

# Report against the original, unmodified message text for readability.
for msg, pred in zip(test_msgs, predictions):
    print(f"\nMessage: {msg}\nPrediction: {pred}")