In [20]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

# Fetch the NLTK stop-word corpus only when it is missing locally, so repeated
# runs do not contact the download server or print download messages.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Default location of the spam dataset (CSV file read with ISO-8859-1 encoding).
_DEFAULT_DATASET_URL = (
    "https://raw.githubusercontent.com/Apaulgithub/oibsip_taskno4/main/spam.csv"
)


def load_data(source=_DEFAULT_DATASET_URL, encoding='ISO-8859-1'):
    """Load the spam dataset and keep only its label and message columns.

    Args:
        source: Path, URL or file-like object accepted by ``pandas.read_csv``.
            Defaults to the hosted ``spam.csv`` file.
        encoding: Text encoding used to decode the CSV file.

    Returns:
        A DataFrame with exactly two columns, ``label`` and ``message``,
        taken from the first two columns of the file.

    Raises:
        ValueError: If the file has fewer than two columns.
    """
    df = pd.read_csv(source, encoding=encoding)
    if df.shape[1] < 2:
        raise ValueError(
            f"expected at least 2 columns (label, message), got {df.shape[1]}"
        )
    # Copy the slice so the renamed frame is independent of the full table.
    df = df.iloc[:, :2].copy()
    df.columns = ['label', 'message']
    return df

# Matches maximal runs of ASCII letters; text is lower-cased before matching.
_WORD_RE = re.compile(r'[a-z]+')

# Created on first use and then shared by every preprocess_text call.
_STOP_WORDS = None
_STEMMER = None


def _get_stop_words():
    """Return the English stop-word set, reading it from NLTK only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def _get_stemmer():
    """Return a shared PorterStemmer instance, creating it on first use."""
    global _STEMMER
    if _STEMMER is None:
        _STEMMER = PorterStemmer()
    return _STEMMER


def preprocess_text(text):
    """Normalise raw message text for vectorisation.

    The text is lower-cased, split into runs of ASCII letters (digits,
    punctuation and whitespace act as separators), English stop words are
    dropped, and the remaining words are Porter-stemmed.

    Args:
        text: The raw message. Non-string values (e.g. ``None`` or the NaN
            pandas uses for missing cells) are treated as empty text.

    Returns:
        The stemmed words joined by single spaces; ``''`` when nothing
        remains.
    """
    if not isinstance(text, str):
        return ''

    # Equivalent to replacing non-letters with spaces and splitting on
    # whitespace, but in a single pass.
    words = _WORD_RE.findall(text.lower())
    if not words:
        return ''

    stop_words = _get_stop_words()
    stem = _get_stemmer().stem
    return ' '.join(stem(word) for word in words if word not in stop_words)

def _train_pipeline(df):
    """Fit a TF-IDF + multinomial naive Bayes pipeline on an 80/20 split.

    Args:
        df: DataFrame with ``cleaned_message`` and ``label`` columns.

    Returns:
        A ``(pipeline, X_test, y_test)`` tuple: the fitted pipeline and the
        held-out messages and labels.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        df['cleaned_message'],
        df['label'],
        test_size=0.2,
        random_state=42,
    )
    pipeline = Pipeline([
        ('vectorizer', TfidfVectorizer(max_features=5000)),
        ('classifier', MultinomialNB(alpha=0.1)),
    ])
    pipeline.fit(X_train, y_train)
    return pipeline, X_test, y_test


def _print_prediction(pipeline, email_text):
    """Classify one message and print the label and class probabilities."""
    cleaned_text = preprocess_text(email_text)
    prediction = pipeline.predict([cleaned_text])[0]
    proba = pipeline.predict_proba([cleaned_text])[0]
    # Look probabilities up by class label rather than assuming which column
    # of predict_proba belongs to which class.
    proba_by_label = dict(zip(pipeline.classes_, proba))

    print("\n🔍 Results:")
    print(f"Prediction: {'SPAM 🚨' if prediction == 'spam' else 'HAM ✅'}")
    print(f"Confidence: {max(proba)*100:.2f}%")
    print(f"Spam Probability: {proba_by_label.get('spam', 0.0)*100:.2f}%")
    print(f"Ham Probability: {proba_by_label.get('ham', 0.0)*100:.2f}%")


def main(show_report=False):
    """Train the spam classifier and run an interactive prediction loop.

    Args:
        show_report: When true, print a classification report for the
            held-out test split before the interactive loop starts.

    The loop ends when the user types ``exit`` (case-insensitive, surrounding
    whitespace ignored) or when standard input is closed.
    """
    # Load and preprocess data
    df = load_data()
    df['cleaned_message'] = df['message'].apply(preprocess_text)

    pipeline, X_test, y_test = _train_pipeline(df)

    # Evaluate only on request, so no test-set prediction is computed and
    # then thrown away.
    if show_report:
        print("\n📊 Classification Report:")
        print(classification_report(y_test, pipeline.predict(X_test)))

    # Interactive prediction
    while True:
        print("\n✉️ Email Spam Detector (Type 'exit' to quit)")
        try:
            email_text = input("Enter email text: ")
        except EOFError:
            # stdin closed (piped input exhausted, Ctrl-D): leave quietly.
            break
        if email_text.strip().lower() == 'exit':
            break

        _print_prediction(pipeline, email_text)

# Start the interactive detector only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



✉️ Email Spam Detector (Type 'exit' to quit)
Enter email text: Hi Francis! When will we arrive at the airport?

🔍 Results:
Prediction: HAM ✅
Confidence: 81.04%
Spam Probability: 18.96%
Ham Probability: 81.04%

✉️ Email Spam Detector (Type 'exit' to quit)
Enter email text: exit
