In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score, roc_auc_score

In [3]:
#1 Create a simple Dataset
data = {'email_length': [100, 200, 50, 300, 80, 500, 20, 400],
        'num_links': [1, 3, 0, 5, 0, 8, 0, 6],
        'is_spam': [0, 1, 0, 1, 0, 1, 0, 1]}

df = pd.DataFrame(data)

In [4]:
# 2. Split Data into Features (X) and Labels (y)

x = df[['email_length','num_links']]
y = df['is_spam']

In [5]:
# 3. Split Data into Training and Testing Sets

_train , x_test , y_train , y_test = train_test_split(x,y, test_size = 0.2, random_state = 42)

In [8]:
# 4. Feature Scaling
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# 5. Hyperparameter Tuning with GridSearchCV
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid_search.fit(x_train, y_train)
best_model = grid_search.best_estimator_

# 6. Cross-Validation
cross_val_scores = cross_val_score(best_model, x_train, y_train, cv=5)
print(f"Cross-Validation Accuracy: {cross_val_scores.mean():.2f}")

# 7. Make Predictions
predictions = best_model.predict(x_test)

# 8. Evaluate the Model
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
auc = roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])
conf_matrix = confusion_matrix(y_test, predictions)
classification_rep = classification_report(y_test, predictions)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"AUC-ROC: {auc:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_rep)


ValueError: n_splits=5 cannot be greater than the number of members in each class.