In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from sklearn.metrics import accuracy_score, classification_report

In [3]:
# 1. Load Dataset
# Read the raw CSV into a DataFrame, decoding the file as latin-1.
DATA_PATH = "spam.csv"
df = pd.read_csv(DATA_PATH, encoding="latin-1")

In [4]:
# Clean Dataset
# Normalize the header names FIRST so the column selection below still matches
# when the CSV headers differ in case or carry stray whitespace. (Normalizing
# after the selection would be a no-op for the lookup.)
df.columns = df.columns.str.lower().str.strip()

# Keep only the two columns used downstream and drop rows missing either one.
df = df[["label", "message"]].dropna(subset=["label", "message"])

In [5]:
# Encode target: spam=1, ham=0
label_codes = {"spam": 1, "ham": 0}

# Guard against re-running this cell: once the column is numeric, mapping it
# through the string-keyed dict again would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df["label"]):
    # Tolerate case / whitespace variants such as "Spam" or "ham ".
    normalized_labels = df["label"].astype(str).str.strip().str.lower()
    encoded_labels = normalized_labels.map(label_codes)

    # Fail loudly here instead of letting NaN targets reach the split/model.
    unmapped = encoded_labels.isna()
    if unmapped.any():
        unexpected = sorted(normalized_labels[unmapped].unique())
        raise ValueError(f"Unexpected label values: {unexpected}")

    df["label"] = encoded_labels.astype("int64")

In [6]:
# Features are the message text; the target is the encoded label column.
X, y = df["message"], df["label"]

In [7]:
# 2. Train-Test Split (Stratified)
# Hold out 20% of the rows for the final evaluation. Stratifying on y is
# requested so both splits follow the label distribution, and the fixed seed
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [8]:
# 3. Define Models (Text-Optimized)
# Candidate classifiers, built individually and then collected under the
# display name that is printed next to each cross-validation score.
logistic_regression = LogisticRegression(
    max_iter=300,
    class_weight="balanced",
    random_state=42,
)
naive_bayes = MultinomialNB()
linear_svm = LinearSVC(class_weight="balanced", random_state=42)

models = {
    "Logistic Regression": logistic_regression,
    "Multinomial Naive Bayes": naive_bayes,
    "Linear SVM": linear_svm,
}

In [9]:
# 4. Model Comparison (Cross-Validation)
# Each candidate is wrapped in its own TF-IDF pipeline and scored with 5-fold
# cross-validation on the training split only. Putting the vectorizer inside
# the pipeline means it is fitted as part of each cross-validation fit.

print("Model Comparison (CV Accuracy)\n")

for name, model in models.items():
    # Unigrams + bigrams, English stop words removed, capped at 5000 features.
    vectorizer = TfidfVectorizer(
        stop_words="english", ngram_range=(1, 2), max_features=5000
    )
    pipeline = Pipeline(steps=[("tfidf", vectorizer), ("model", model)])

    scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring="accuracy")

    print(f"{name} → Mean Accuracy: {scores.mean():.4f}")

Model Comparison (CV Accuracy)

Logistic Regression → Mean Accuracy: 0.9816
Multinomial Naive Bayes → Mean Accuracy: 0.9706
Linear SVM → Mean Accuracy: 0.9856


In [13]:
# 5. Final Evaluation (Best Model - Linear SVM)
# --------------------------------------------------
# TF-IDF settings: unigrams + bigrams, English stop words removed, capped at
# 5000 features, feeding a class-weighted LinearSVC.
final_vectorizer = TfidfVectorizer(
    stop_words="english", ngram_range=(1, 2), max_features=5000
)
final_classifier = LinearSVC(class_weight="balanced", random_state=42)

final_pipeline = Pipeline(
    steps=[("tfidf", final_vectorizer), ("model", final_classifier)]
)

In [14]:
# Fit the final pipeline (vectorizer + classifier) on the training split.
# The bare expression is left as the last line so the fitted estimator is
# rendered as this cell's output.
final_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('tfidf', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,verbose,0


In [15]:
# Score the fitted pipeline on the held-out test split.
y_pred = final_pipeline.predict(X_test)

print("\nFinal Model Evaluation (Linear SVM)\n")
print("Accuracy:", accuracy_score(y_test, y_pred))

# Print the report with its own print() call: passing it as a second argument
# makes print() insert a separator space in front of the report, which shifts
# the header row one column out of line with the data rows.
print("\nClassification Report:")
print(
    classification_report(
        y_test,
        y_pred,
        labels=[0, 1],
        target_names=["ham", "spam"],  # readable names for the 0/1 encoding
    )
)


Final Model Evaluation (Linear SVM)

Accuracy: 0.9829596412556054

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.97      0.90      0.93       149

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

