In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier

# Load the dataset
df = pd.read_csv('spam.csv', encoding='latin-1')

# Keep only relevant columns and rename them
df = df[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'text'})

# Prepare labels (convert ham/spam to 0/1)
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

df.head()


Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [4]:
# Simplified text cleaning function
def clean_text_simple(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)  # Remove punctuation and special characters
    words = text.split()
    words = [word for word in words if word not in ENGLISH_STOP_WORDS]  # Remove stopwords
    return ' '.join(words)

# Apply text cleaning
df['cleaned_text'] = df['text'].apply(clean_text_simple)

df.head()


Unnamed: 0,label,text,cleaned_text
0,0,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
3,0,U dun say so early hor... U c already then say...,u dun say early hor u c say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives


In [5]:
def train_models_with_bow(text_column):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df[text_column])
    y = df['label']

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train models
    models = {
        "Naive Bayes": MultinomialNB(),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
    }

    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        results[name] = accuracy

    return results

# Train with raw text (BoW)
results_raw = train_models_with_bow('text')

# Train with cleaned text (BoW)
results_cleaned = train_models_with_bow('cleaned_text')

print("Results with Raw Text (BoW):", results_raw)
print("Results with Cleaned Text (BoW):", results_cleaned)


Results with Raw Text (BoW): {'Naive Bayes': 0.97847533632287, 'Random Forest': 0.9748878923766816}
Results with Cleaned Text (BoW): {'Naive Bayes': 0.9704035874439462, 'Random Forest': 0.9713004484304932}


In [6]:
def train_models_with_tfidf(text_column):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(df[text_column])
    y = df['label']

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train models
    models = {
        "Naive Bayes": MultinomialNB(),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
    }

    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        results[name] = accuracy

    return results

# Train with raw text (TF-IDF)
results_raw_tfidf = train_models_with_tfidf('text')

# Train with cleaned text (TF-IDF)
results_cleaned_tfidf = train_models_with_tfidf('cleaned_text')

print("Results with Raw Text (TF-IDF):", results_raw_tfidf)
print("Results with Cleaned Text (TF-IDF):", results_cleaned_tfidf)


Results with Raw Text (TF-IDF): {'Naive Bayes': 0.9623318385650225, 'Random Forest': 0.9766816143497757}
Results with Cleaned Text (TF-IDF): {'Naive Bayes': 0.968609865470852, 'Random Forest': 0.968609865470852}


In [7]:
def train_ensemble_with_tfidf(text_column):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(df[text_column])
    y = df['label']

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define base models
    nb = MultinomialNB()
    rf = RandomForestClassifier(n_estimators=100, random_state=42)

    # Voting classifier (soft voting for probability-based averaging)
    ensemble = VotingClassifier(estimators=[('nb', nb), ('rf', rf)], voting='soft')

    # Train and evaluate
    ensemble.fit(X_train, y_train)
    y_pred = ensemble.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

# Train ensemble with raw text (TF-IDF)
ensemble_accuracy_raw = train_ensemble_with_tfidf('text')

# Train ensemble with cleaned text (TF-IDF)
ensemble_accuracy_cleaned = train_ensemble_with_tfidf('cleaned_text')

print("Ensemble Accuracy with Raw Text (TF-IDF):", ensemble_accuracy_raw)
print("Ensemble Accuracy with Cleaned Text (TF-IDF):", ensemble_accuracy_cleaned)


Ensemble Accuracy with Raw Text (TF-IDF): 0.9713004484304932
Ensemble Accuracy with Cleaned Text (TF-IDF): 0.968609865470852
