In [1]:
from io import StringIO

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.pipeline import make_pipeline

In [2]:
# Inline CSV sample: a header row ("text", "spam") followed by one labelled message.
data = """"text","spam"
"Subject: naturally irresistible your corporate identity lt is really hard to recollect a company ... 100 % satisfaction guaranteed ...",1
"""

# Wrap the text in an in-memory file object so read_csv can parse it
# exactly as it would parse a file on disk.
csv_buffer = StringIO(data)
df = pd.read_csv(csv_buffer)

# Last expression of the cell: renders the parsed frame.
df

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1


In [3]:
# Features are the raw message text; the label is the `spam` column.
X, y = df['text'], df['spam']

# Bag-of-words token counts with scikit-learn's built-in English stop-word
# list removed. The vectorizer is fitted on every row of X here.
vectorizer = CountVectorizer(stop_words='english')
X_vectorized = vectorizer.fit_transform(X)

In [4]:
# The two classifiers being compared.
# The forest uses 100 trees and a fixed seed so repeated runs are reproducible.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
# max_iter caps the number of solver iterations for the logistic model.
log_model = LogisticRegression(
    max_iter=1000,
)

In [5]:
# Build a 5-fold stratified splitter purely to display its repr (and thereby
# its default `shuffle` / `random_state` settings). None of the cells visible
# here reuse this object; the splitter used for scoring is built later.
preview_splitter = StratifiedKFold(n_splits=5)
preview_splitter

StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

In [6]:
# Re-creates both classifiers as fresh, unfitted estimators with the same
# settings used when they were first defined earlier in the notebook.
# NOTE(review): this repeats the earlier definition cell verbatim and looks
# redundant — confirm whether it can be dropped.
log_model, rf_model = (
    LogisticRegression(max_iter=1000),
    RandomForestClassifier(n_estimators=100, random_state=42),
)

In [7]:
# Decide whether stratified cross-validation is feasible for these labels.
#
# Two conditions must hold:
#   * the rarest class has at least 2 samples, so at least 2 folds can each
#     receive one of them;
#   * there are at least 2 distinct classes — checking only the smallest
#     class count would accept a label column that contains a single class,
#     and LogisticRegression cannot be fit on single-class data.
#
# `kfold` is set to None when cross-validation must be skipped; later cells
# test for that before scoring.
class_counts = y.value_counts()
min_class_count = class_counts.min()
n_classes = len(class_counts)

if min_class_count >= 2 and n_classes >= 2:
    # Use up to 5 folds, but never more folds than the rarest class has samples.
    n_splits = min(5, min_class_count)
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
elif min_class_count >= 2:
    # Enough rows, but only one label value: nothing to stratify or classify.
    kfold = None
    print("Only one class present in the labels; skipping cross-validation.")
else:
    # Also reached for an empty label column (min() is NaN, so the
    # comparisons above are False).
    kfold = None
    print("Not enough samples per class for cross-validation.")


Not enough samples per class for cross-validation.


In [8]:
if kfold is not None:
    # Score each classifier inside a pipeline that starts from the raw text.
    # Scoring on a matrix vectorized from the full corpus would let the
    # held-out documents of every fold contribute to the vocabulary
    # (pre-processing leakage). With a pipeline, the vectorizer is refit on
    # the training portion of each fold only.
    #
    # cross_val_score clones the estimator it is given for every fold, so the
    # existing `vectorizer`, `log_model` and `rf_model` objects are not
    # modified by these calls.
    log_pipeline = make_pipeline(vectorizer, log_model)
    rf_pipeline = make_pipeline(vectorizer, rf_model)

    log_scores = cross_val_score(log_pipeline, X, y, cv=kfold, scoring='accuracy')
    rf_scores = cross_val_score(rf_pipeline, X, y, cv=kfold, scoring='accuracy')

    # One accuracy value per fold for each model.
    print("Logistic Regression CV Accuracy:", log_scores)
    print("Random Forest CV Accuracy:", rf_scores)

In [9]:
if kfold is not None:
    # Summarise the per-fold accuracies: both means first, then both
    # standard deviations, printed in that order.
    summary_lines = (
        ("Average Logistic Regression Accuracy:", log_scores.mean()),
        ("Average Random Forest Accuracy:", rf_scores.mean()),
        ("Logistic Regression Std Dev:", log_scores.std()),
        ("Random Forest Std Dev:", rf_scores.std()),
    )
    for caption, statistic in summary_lines:
        print(caption, statistic)