In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Fetch the NLTK data packages this notebook relies on.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)
# Left as the cell's final bare expression so its return value is still displayed.
nltk.download('punkt')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Extra tokenizer data package; presumably required by nltk.word_tokenize (called in
# clean_text) on newer NLTK releases — TODO confirm against the installed NLTK version.
# nltk is already imported in the first cell, so this re-import is redundant but harmless.
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Read spam.csv, keep only the two columns of interest under descriptive names,
# and encode the label column as 0 for 'ham' and 1 for 'spam'.
df = (
    pd.read_csv("spam.csv", encoding='latin-1')[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
    .assign(label=lambda frame: frame['label'].map({'ham': 0, 'spam': 1}))
)

# Deletion table for every character in string.punctuation. A translate table is used
# instead of an unescaped regex character class: inside "[...]" the sequence "\]" is
# read as an escaped "]", so the original pattern never removed backslashes.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)

# Lazily-filled cache so the stop-word list and lemmatizer are built once,
# not once per token / once per message.
_TEXT_RESOURCES = {}


def _text_resources():
    """Return (stop_words, lemmatizer), creating them on first use."""
    if not _TEXT_RESOURCES:
        # frozenset gives constant-time membership tests in the filter below.
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def clean_text(text):
    """Normalize one message for bag-of-words style vectorization.

    Steps, in order: lowercase, delete all punctuation characters, delete digit
    runs, tokenize with nltk.word_tokenize, drop English stop words, lemmatize
    each remaining token, and re-join the tokens with single spaces.

    Args:
        text: The raw message as a str.

    Returns:
        The cleaned message as a single space-separated str (possibly empty).
    """
    stop_words, lemmatizer = _text_resources()

    text = text.lower()
    text = text.translate(_PUNCT_TABLE)
    text = re.sub(r"\d+", "", text)
    words = nltk.word_tokenize(text)
    # Filter first, then lemmatize — same order as before.
    return " ".join(lemmatizer.lemmatize(w) for w in words if w not in stop_words)

# Add a normalized copy of every message alongside the raw text.
df['cleaned_text'] = df['text'].map(clean_text)

# Text-to-feature encoders to compare, keyed by a short name used when printing results.
# Insertion order matters: later cells iterate this dict in order.
vectorizers = dict(
    bow=CountVectorizer(),
    tfidf=TfidfVectorizer(),
)

In [None]:
# Labels are identical for every vectorizer, so take them once.
y = df['label']

# Draw one row split up front, on positions rather than on feature matrices, so each
# vectorizer can learn its vocabulary / IDF weights from the training rows only.
# Fitting on all rows before splitting leaks test-set text into the features.
train_idx, test_idx = train_test_split(np.arange(len(df)), test_size=0.2, random_state=42)
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

for name, vectorizer in vectorizers.items():
    # Fit on training rows, then encode every row with that fitted vocabulary.
    # The full matrices X / X_cleaned are kept because later cells read them.
    vectorizer.fit(df['text'].iloc[train_idx])
    X = vectorizer.transform(df['text'])
    vectorizer.fit(df['cleaned_text'].iloc[train_idx])
    X_cleaned = vectorizer.transform(df['cleaned_text'])

    X_train, X_test = X[train_idx], X[test_idx]
    X_train_cleaned, X_test_cleaned = X_cleaned[train_idx], X_cleaned[test_idx]

    models = {
        "Naive Bayes": MultinomialNB(),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
        # use_label_encoder is no longer passed: XGBoost reported it as an unused
        # parameter on every fit.
        "XGBoost": XGBClassifier(eval_metric='logloss')
    }

    # Each model is refitted for each text variant; both variants share y_train / y_test
    # because they use the same row split.
    for model_name, model in models.items():
        for data, X_tr, X_te in [("Original", X_train, X_test), ("Cleaned", X_train_cleaned, X_test_cleaned)]:
            model.fit(X_tr, y_train)
            y_pred = model.predict(X_te)
            acc = accuracy_score(y_test, y_pred)
            print(f"{model_name} - {name.upper()} ({data}): {acc:.4f}")

Naive Bayes - BOW (Original): 0.9785
Naive Bayes - BOW (Cleaned): 0.9686
Random Forest - BOW (Original): 0.9749
Random Forest - BOW (Cleaned): 0.9731


Parameters: { "use_label_encoder" } are not used.



XGBoost - BOW (Original): 0.9776


Parameters: { "use_label_encoder" } are not used.



XGBoost - BOW (Cleaned): 0.9722
Naive Bayes - TFIDF (Original): 0.9623
Naive Bayes - TFIDF (Cleaned): 0.9650
Random Forest - TFIDF (Original): 0.9767
Random Forest - TFIDF (Cleaned): 0.9704


Parameters: { "use_label_encoder" } are not used.



XGBoost - TFIDF (Original): 0.9821


Parameters: { "use_label_encoder" } are not used.



XGBoost - TFIDF (Cleaned): 0.9740


In [None]:
# Soft-voting ensemble over the three base classifiers compared above.
ensemble = VotingClassifier(
    estimators=[
        ('nb', MultinomialNB()),
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
        # use_label_encoder is no longer passed: XGBoost reported it as an unused parameter.
        ('xgb', XGBClassifier(eval_metric='logloss'))
    ], voting='soft')

# Build the features here instead of reusing whatever X_cleaned / y the comparison
# loop happened to leave behind. That leftover matrix came from the last vectorizer
# in `vectorizers` (TF-IDF), so TF-IDF on the cleaned text is made explicit.
# The text is split first and the vectorizer is fitted on the training part only,
# so no test-set vocabulary or IDF statistics leak into training.
text_train, text_test, y_train, y_test = train_test_split(
    df['cleaned_text'], df['label'], test_size=0.2, random_state=42)
ensemble_vectorizer = TfidfVectorizer()
X_train = ensemble_vectorizer.fit_transform(text_train)
X_test = ensemble_vectorizer.transform(text_test)

ensemble.fit(X_train, y_train)
y_pred_ensemble = ensemble.predict(X_test)
ensemble_acc = accuracy_score(y_test, y_pred_ensemble)
print(f"Ensemble Model Accuracy: {ensemble_acc:.4f}")

Parameters: { "use_label_encoder" } are not used.



Ensemble Model Accuracy: 0.9740
