In [1]:
import pandas as pd
from io import StringIO

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

In [2]:
df = pd.read_csv("emails.csv")
print(df.shape)
print(df.head())

(5728, 2)
                                                text  spam
0  Subject: naturally irresistible your corporate...     1
1  Subject: the stock trading gunslinger  fanny i...     1
2  Subject: unbelievable new homes made easy  im ...     1
3  Subject: 4 color printing special  request add...     1
4  Subject: do not have money , get software cds ...     1


In [4]:
X = df['text']
y = df['spam']

vectorizer = CountVectorizer(stop_words='english')
X_vectorized = vectorizer.fit_transform(X)

In [5]:
log_model = LogisticRegression(max_iter=1000)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

In [6]:
StratifiedKFold(n_splits=5)

StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

In [7]:
log_model = LogisticRegression(max_iter=1000)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

In [8]:
# Check minimum samples per class
min_class_count = y.value_counts().min()

if min_class_count >= 2:
    n_splits = min(5, min_class_count)
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
else:
    kfold = None
    print("Not enough samples per class for cross-validation.")

In [9]:
if kfold is not None:
    log_scores = cross_val_score(log_model, X_vectorized, y, cv=kfold, scoring='accuracy')
    rf_scores = cross_val_score(rf_model, X_vectorized, y, cv=kfold, scoring='accuracy')

    print("Logistic Regression CV Accuracy:", log_scores)
    print("Random Forest CV Accuracy:", rf_scores)

Logistic Regression CV Accuracy: [0.9895288  0.9921466  0.98516579 0.99126638 0.9930131 ]
Random Forest CV Accuracy: [0.98342059 0.98691099 0.97207679 0.98427948 0.98689956]


In [10]:
if kfold is not None:
    print("Average Logistic Regression Accuracy:", log_scores.mean())
    print("Average Random Forest Accuracy:", rf_scores.mean())

    print("Logistic Regression Std Dev:", log_scores.std())
    print("Random Forest Std Dev:", rf_scores.std())

Average Logistic Regression Accuracy: 0.9902241325438016
Average Random Forest Accuracy: 0.9827174832529322
Logistic Regression Std Dev: 0.0027795957529365496
Random Forest Std Dev: 0.0054997071809778295
