<a href="https://colab.research.google.com/github/amanjaiswalofficial/machine-learning-engineer-projects/blob/main/HOML/05_Model_selection_using_kfold_and_stratified_kfold.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


In [3]:
# Load the breast-cancer dataset. The full `data` object is kept (not just the
# arrays) because the final evaluation cell reads `data.target_names`.
data = load_breast_cancer()
X = data.data    # feature matrix
y = data.target  # integer class labels, counted per class below
print(f"Dataset shape: {X.shape}, Target classes: {np.bincount(y)}")

Dataset shape: (569, 30), Target classes: [212 357]


In [13]:

# Two 5-fold splitters, both shuffled with a fixed seed so the folds are repeatable.
# Each gets its own name so that neither one is silently overwritten.
stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
kfold_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Splitter passed to GridSearchCV further down. Plain KFold is the strategy this
# cell effectively selected before; assign `stratified_cv` here instead to keep
# the class proportions roughly equal in every fold.
cv = kfold_cv

In [14]:
# Candidate models and their hyper-parameter grids.
#   "model":  an unfitted estimator, used as the "classifier" step of the pipeline.
#   "params": the grid to search; keys carry the `classifier__` prefix so that
#             they address the pipeline step named "classifier".
model_configs = {
    "Logistic Regression": {
        "model": LogisticRegression(max_iter=1000),
        "params": {
            "classifier__C": [0.01, 0.1, 1, 10]
        }
    },
    "Random Forest": {
        # Seeded like the splitters and the train/test split, so CV scores and
        # the selected parameters are reproducible across runs.
        "model": RandomForestClassifier(random_state=42),
        "params": {
            "classifier__n_estimators": [50, 100],
            "classifier__max_depth": [None, 10, 20]
        }
    },
    "SVC": {
        "model": SVC(),
        "params": {
            "classifier__C": [0.1, 1, 10],
            "classifier__kernel": ["linear", "rbf"]
        }
    }
}

In [15]:
# Hold out 20% of the samples as a final test set. `stratify=y` keeps the class
# proportions the same in both parts; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Running record of the best candidate seen so far. The score starts at -inf
# (not 0) so that any real CV score, including 0.0, wins the first comparison.
best_model = None
best_score = float("-inf")
best_model_name = ""
best_params = None

# Grid search across all models
for model_name, config in model_configs.items():
    print(f"\n🔍 Running GridSearchCV for {model_name}")

    # The scaler lives inside the pipeline, so it is fitted together with the
    # classifier each time the pipeline is fitted during the search.
    pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("classifier", config["model"])
    ])

    grid = GridSearchCV(
        estimator=pipeline,
        param_grid=config["params"],
        scoring="accuracy",
        cv=cv,
        n_jobs=-1
    )

    grid.fit(X_train, y_train)

    print(f"✅ Best Accuracy (CV): {grid.best_score_:.4f}")
    print(f"🏆 Best Params: {grid.best_params_}")

    # Strict '>' keeps the earlier model on ties; a NaN score never compares
    # greater, so it can never be selected.
    if grid.best_score_ > best_score:
        best_model = grid.best_estimator_
        best_score = grid.best_score_
        best_model_name = model_name
        best_params = grid.best_params_

# Fail with a clear message instead of an AttributeError on `None.predict`.
if best_model is None:
    raise RuntimeError(
        "No model was selected: model_configs is empty or no candidate "
        "produced a valid cross-validation score."
    )

# Final Model Evaluation on Test Set
print(f"\n🎯 Using Best Model: {best_model_name}")
y_pred = best_model.predict(X_test)

print(f"\n✅ Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\n📊 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\n🧾 Classification Report:")
print(classification_report(y_test, y_pred, target_names=data.target_names))


🔍 Running GridSearchCV for Logistic Regression
✅ Best Accuracy (CV): 0.9802
🏆 Best Params: {'classifier__C': 0.1}

🔍 Running GridSearchCV for Random Forest
✅ Best Accuracy (CV): 0.9582
🏆 Best Params: {'classifier__max_depth': 20, 'classifier__n_estimators': 100}

🔍 Running GridSearchCV for SVC
✅ Best Accuracy (CV): 0.9736
🏆 Best Params: {'classifier__C': 0.1, 'classifier__kernel': 'linear'}

🎯 Using Best Model: Logistic Regression

✅ Test Accuracy: 0.9737

📊 Confusion Matrix:
[[40  2]
 [ 1 71]]

🧾 Classification Report:
              precision    recall  f1-score   support

   malignant       0.98      0.95      0.96        42
      benign       0.97      0.99      0.98        72

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

