In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import string
import nltk

# Fetch the NLTK resources this script relies on: the 'punkt_tab' tokenizer
# data (used by word_tokenize) and the 'stopwords' corpus.
for resource_name in ('punkt_tab', 'stopwords'):
    nltk.download(resource_name)

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Filled on first use so the stopword corpus is read and the stemmer is built
# once, instead of on every call (preprocess_text runs once per row).
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return the shared (stop_words, stemmer) pair, creating it on first use."""
    if not _NLP_RESOURCES:
        _NLP_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _NLP_RESOURCES['stemmer'] = PorterStemmer()
    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['stemmer']


def preprocess_text(text):
    """
    Normalize a message into a space-separated string of stemmed tokens.

    Steps:
    1. Converting to lowercase
    2. Removing punctuation
    3. Tokenizing
    4. Removing stopwords
    5. Stemming

    Args:
        text: The raw message. ``None`` and NaN (as produced by pandas for
            missing cells) are treated as an empty message; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The processed tokens joined by single spaces, or ``''`` when nothing
        is left after cleaning.
    """
    # Missing values would otherwise fail on .lower(); NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Convert to lowercase, then remove punctuation
    text = text.lower().translate(_PUNCTUATION_TABLE)

    # Nothing to tokenize (empty or punctuation-only input)
    if not text.strip():
        return ''

    stop_words, stemmer = _get_nlp_resources()

    # NOTE(review): punctuation is stripped before this filter, so stopword
    # entries that contain an apostrophe (presumably e.g. "you've") can never
    # match their stripped form ("youve") -- confirm whether that is intended.
    stemmed = [
        stemmer.stem(token)
        for token in word_tokenize(text)
        if token not in stop_words
    ]

    return ' '.join(stemmed)

# Sample dataset (swap in your own data).
# Each row pairs a message with its label: 0 = ham, 1 = spam.
sample_rows = [
    ("Free money now!!! Click here to claim your prize", 1),
    ("Hi John, let's meet for lunch tomorrow", 0),
    ("You've won a free vacation! Claim now", 1),
    ("Meeting reminder: Project discussion at 3pm", 0),
    ("Urgent! Your account has been compromised", 1),
]
data = pd.DataFrame(sample_rows, columns=['text', 'label'])

# Store the cleaned form of every message in its own column
data['processed_text'] = data['text'].apply(preprocess_text)

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data['processed_text'], data['label'], test_size=0.2, random_state=42
)

# Fit the TF-IDF vocabulary (at most 5000 terms) on the training texts only,
# then encode the test texts with that same vocabulary.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Train a multinomial Naive Bayes model and label the held-out texts
classifier = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred = classifier.predict(X_test_tfidf)

# Evaluate model
print("Model Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("\nClassification Report:")
# With a very small test split a label can have no true or no predicted
# samples, which makes precision/recall undefined. zero_division=0 reports
# those scores as 0 (the same value the default shows) without emitting a
# warning for each of them.
print(classification_report(y_test, y_pred, zero_division=0))

# Example prediction function
def predict_spam(text):
    """Classify a raw message with the fitted vectorizer and classifier.

    The message is cleaned with preprocess_text, encoded by the module-level
    ``tfidf`` vectorizer and passed to the module-level ``classifier``.

    Returns:
        "SPAM" when the predicted label is 1, otherwise "HAM".
    """
    features = tfidf.transform([preprocess_text(text)])
    predicted_label = classifier.predict(features)[0]
    if predicted_label == 1:
        return "SPAM"
    return "HAM"

# Try the classifier on a few messages it has not seen
test_messages = [
    "Congratulations! You've won a million dollars!",
    "Hey, can we reschedule our meeting to tomorrow?",
    "Click this link to get free bitcoin now!!!",
]

print("\nPredictions for test messages:")
for message in test_messages:
    print(f"Message: {message}")
    verdict = predict_spam(message)
    # Trailing newline leaves a blank line between entries
    print(f"Prediction: {verdict}\n")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Model Evaluation:
Accuracy: 0.00

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       1.0
           1       0.00      0.00      0.00       0.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0


Predictions for test messages:
Message: Congratulations! You've won a million dollars!
Prediction: SPAM

Message: Hey, can we reschedule our meeting to tomorrow?
Prediction: SPAM

Message: Click this link to get free bitcoin now!!!
Prediction: SPAM



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
