In [8]:
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [9]:
def print_separator(title):
    """Print a section banner: a blank line, a 70-char rule, the title, another rule."""
    rule = "=" * 70
    # A single print call with a newline separator emits the same four lines
    # (leading blank line, rule, indented title, rule) as separate calls would.
    print("\n" + rule, f" {title}", rule, sep="\n")


In [10]:
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np

class LogisticRegressionL2(ClassifierMixin, BaseEstimator):
    """From-scratch binary logistic regression with an L2 penalty.

    The model is trained with full-batch gradient descent and follows the
    scikit-learn estimator conventions so it can be used with ``GridSearchCV``.
    Standardization is handled inside the model:

      - ``fit()`` computes the per-feature mean/std from the data it receives
        (i.e. the training fold under cross-validation);
      - ``predict()`` / ``predict_proba()`` transform with those same statistics.

    Parameters
    ----------
    learning_rate : float, default=0.01
        Step size of each gradient-descent update.
    n_iters : int, default=1000
        Number of gradient-descent iterations.
    lambda_param : float, default=0.1
        L2 penalty strength applied to the weights (never to the bias).
    standardize : bool, default=True
        Whether to z-score the features inside the model.

    Attributes
    ----------
    classes_ : ndarray of shape (2,)
        Sorted class labels seen in ``fit``; ``classes_[1]`` is the positive class.
    weights_ : ndarray of shape (n_features,)
        Learned coefficients.
    bias_ : float
        Learned intercept.
    mean_, std_ : ndarray of shape (n_features,) or None
        Standardization statistics (``None`` when fitted with ``standardize=False``).
    """

    def __init__(self, learning_rate=0.01, n_iters=1000, lambda_param=0.1, standardize=True):
        # Only hyper-parameters are stored here. Fitted attributes (trailing
        # underscore) are created in fit() so that scikit-learn's fitted-state
        # detection does not mistake a fresh instance for a trained one.
        self.learning_rate = learning_rate
        self.n_iters = n_iters
        self.lambda_param = lambda_param
        self.standardize = standardize

    def _sigmoid(self, z):
        """Logistic function; the input is clipped to [-500, 500] to avoid overflow in exp."""
        z = np.clip(z, -500, 500)
        return 1.0 / (1.0 + np.exp(-z))

    def _fit_standardizer(self, X):
        """Learn per-feature mean and standard deviation from ``X``."""
        self.mean_ = X.mean(axis=0)
        std = X.std(axis=0)
        # Constant features have zero std; dividing by 1 leaves them at 0 after centering.
        std[std == 0] = 1.0
        self.std_ = std

    def _transform_standardizer(self, X):
        """Apply the learned z-score transform."""
        return (X - self.mean_) / self.std_

    def _preprocess_X(self, X):
        """Convert ``X`` to a float array and standardize it if statistics are available."""
        X = np.asarray(X, dtype=float)
        if self.standardize and getattr(self, "mean_", None) is not None:
            X = self._transform_standardizer(X)
        return X

    def _check_fitted(self):
        """Raise ``NotFittedError`` when the model is used before ``fit``."""
        # Local import keeps the class self-contained within its notebook cell.
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, ("weights_", "bias_", "classes_"))

    def fit(self, X, y):
        """Fit the model with batch gradient descent.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
            Binary labels; any two distinct values are accepted.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``y`` does not contain exactly two distinct classes.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y).ravel()

        self.classes_ = np.unique(y)
        if self.classes_.size != 2:
            raise ValueError(
                "LogisticRegressionL2 is a binary classifier and needs exactly "
                f"2 classes in y, got {self.classes_.size}."
            )
        # Encode the larger label as 1.0 and the smaller as 0.0; for 0/1 labels
        # this is the same as casting y to float.
        y_bin = (y == self.classes_[1]).astype(float)

        if self.standardize:
            self._fit_standardizer(X)
            X = self._transform_standardizer(X)
        else:
            # Drop statistics from any earlier fit so they cannot be reused by accident.
            self.mean_ = None
            self.std_ = None

        n_samples, n_features = X.shape
        self.weights_ = np.zeros(n_features, dtype=float)
        self.bias_ = 0.0

        lr = float(self.learning_rate)
        lam = float(self.lambda_param)

        for _ in range(int(self.n_iters)):
            residual = self._sigmoid(X @ self.weights_ + self.bias_) - y_bin

            # The L2 term is applied to the weights only (the bias is not regularized).
            dw = (1.0 / n_samples) * (X.T @ residual + 2.0 * lam * self.weights_)
            db = (1.0 / n_samples) * np.sum(residual)

            self.weights_ -= lr * dw
            self.bias_ -= lr * db

        return self

    def predict_proba(self, X):
        """Return class probabilities as an ``(n_samples, 2)`` array.

        Column 0 is the probability of ``classes_[0]`` and column 1 that of
        ``classes_[1]``.
        """
        self._check_fitted()
        X = self._preprocess_X(X)
        p1 = self._sigmoid(X @ self.weights_ + self.bias_)
        return np.vstack([1.0 - p1, p1]).T

    def predict(self, X):
        """Predict class labels, choosing ``classes_[1]`` when its probability exceeds 0.5."""
        positive = (self.predict_proba(X)[:, 1] > 0.5).astype(int)
        return self.classes_[positive]

In [11]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    roc_curve,
    precision_recall_curve
)

# -----------------------------
# METRIC COMPUTATION
# -----------------------------
def compute_metrics(y_true, y_pred, y_proba=None):
    """Compute classification metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    y_proba : array-like or None, default=None
        Predicted scores. Either a 1-D array of positive-class scores or a
        2-D array whose column 1 holds the positive-class scores. Any
        array-like (list, DataFrame, ndarray) is accepted.

    Returns
    -------
    dict
        Keys ``"Accuracy"``, ``"Precision"``, ``"Recall"``, ``"F1-Score"`` and
        ``"ROC-AUC"``; the last one is ``None`` when ``y_proba`` is not given.
    """
    metrics = {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred),
        "Recall": recall_score(y_true, y_pred),
        "F1-Score": f1_score(y_true, y_pred),
    }

    if y_proba is None:
        metrics["ROC-AUC"] = None
        return metrics

    # Coerce first so plain lists / DataFrames work too (they lack ndarray
    # slicing semantics or the ``ndim`` attribute).
    scores = np.asarray(y_proba)
    pos_proba = scores[:, 1] if scores.ndim > 1 else scores
    metrics["ROC-AUC"] = roc_auc_score(y_true, pos_proba)
    return metrics


def print_metrics_table(metrics, title="Results"):
    """Print a metrics dict as an aligned two-column text table.

    Each line shows the metric name left-padded to 12 characters followed by
    its value with four decimals, or the text ``None`` for missing metrics.
    """
    print(f"\n{title}:")
    print("-" * 40)
    for name, value in metrics.items():
        rendered = "None" if value is None else f"{value:.4f}"
        print(f"{name:12s}: {rendered}")


# -----------------------------
# CONFUSION MATRIX HEATMAP
# -----------------------------
def plot_confusion_matrix_heatmap(y_true, y_pred, title="Confusion Matrix"):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as an annotated heatmap.

    Rows are actual classes and columns are predicted classes. The class
    labels are the sorted union of the values found in ``y_true`` and
    ``y_pred``, and the same labels are used for both the matrix and the axis
    ticks, so the two always agree (also when only one class is present or
    when the labels are not 0/1).
    """
    labels = np.unique(
        np.concatenate([np.asarray(y_true).ravel(), np.asarray(y_pred).ravel()])
    )
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    fig, ax = plt.subplots(figsize=(5, 4))
    im = ax.imshow(cm, interpolation="nearest", cmap=plt.cm.Blues)
    ax.figure.colorbar(im, ax=ax)

    tick_positions = np.arange(len(labels))
    tick_text = [str(label) for label in labels]
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xticklabels(tick_text)
    ax.set_yticklabels(tick_text)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title(title)

    # Cells darker than half the maximum count get white text for contrast.
    thresh = cm.max() / 2.0
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(
                j, i, format(cm[i, j], "d"),
                ha="center", va="center",
                color="white" if cm[i, j] > thresh else "black",
            )

    plt.tight_layout()
    plt.show()


# -----------------------------
# ROC CURVE
# -----------------------------
def plot_roc_curve_model(y_true, y_proba, title="ROC Curve"):
    """Plot the ROC curve with its AUC in the legend, plus the chance diagonal.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_proba : array-like
        Either 1-D positive-class scores or a 2-D array whose column 1 holds
        them. Any array-like (list, DataFrame, ndarray) is accepted.
    title : str, default="ROC Curve"
        Figure title.
    """
    # Coerce first so inputs without ndarray semantics (e.g. lists) work too.
    scores = np.asarray(y_proba)
    pos_proba = scores[:, 1] if scores.ndim > 1 else scores

    fpr, tpr, _ = roc_curve(y_true, pos_proba)
    auc = roc_auc_score(y_true, pos_proba)

    fig, ax = plt.subplots(figsize=(5, 4))
    ax.plot(fpr, tpr, label=f"AUC = {auc:.3f}")
    ax.plot([0, 1], [0, 1], "k--", label="Random")
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title(title)
    ax.legend(loc="lower right")
    ax.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()


# -----------------------------
# PRECISION–RECALL CURVE
# -----------------------------
def plot_precision_recall_curve_model(y_true, y_proba, title="Precision-Recall Curve"):
    """Plot precision (y-axis) against recall (x-axis) over all score thresholds.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_proba : array-like
        Either 1-D positive-class scores or a 2-D array whose column 1 holds
        them. Any array-like (list, DataFrame, ndarray) is accepted.
    title : str, default="Precision-Recall Curve"
        Figure title.
    """
    # Coerce first so inputs without ndarray semantics (e.g. lists) work too.
    scores = np.asarray(y_proba)
    pos_proba = scores[:, 1] if scores.ndim > 1 else scores

    precision, recall, _ = precision_recall_curve(y_true, pos_proba)

    fig, ax = plt.subplots(figsize=(5, 4))
    ax.plot(recall, precision)
    ax.set_xlabel("Recall")
    ax.set_ylabel("Precision")
    ax.set_title(title)
    ax.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()


# -----------------------------
# METRICS BAR CHART
# -----------------------------
def plot_metrics_bar(metrics, title="Model Performance"):
    """Show the available metrics as a bar chart with the value printed above each bar.

    Metrics whose value is ``None`` are left out. The y-axis is fixed to
    [0, 1.05] so the value labels have room above a bar of height 1.
    """
    available = {key: score for key, score in metrics.items() if score is not None}

    _, ax = plt.subplots(figsize=(6, 4))
    rects = ax.bar(list(available.keys()), list(available.values()))

    ax.set_title(title)
    ax.set_ylabel("Score")
    ax.set_ylim(0, 1.05)
    ax.grid(axis="y", alpha=0.3)

    # Annotate each bar with its height, centred horizontally just above the top.
    for rect in rects:
        height = rect.get_height()
        x_center = rect.get_x() + rect.get_width() / 2
        ax.text(x_center, height + 0.01, f"{height:.3f}", ha="center", va="bottom")

    plt.tight_layout()
    plt.show()


In [12]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def _grid_search_and_score(estimator, param_grid, cv, X_train, y_train, X_test, y_test,
                           dataset_name, impl, log_label):
    """Tune ``estimator`` with GridSearchCV, score the best model on the test split, log it.

    Returns one summary-table row (a dict) for the given implementation.
    """
    search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        cv=cv,
        scoring="accuracy",
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    best_model = search.best_estimator_
    test_metrics = compute_metrics(
        y_test,
        best_model.predict(X_test),
        best_model.predict_proba(X_test),
    )

    # Captions are left-padded to 19 characters so the colons line up for
    # both the "Custom" and the "Sklearn" log lines.
    print(f"{log_label + ' best CV Acc':<19}: {search.best_score_:.4f}")
    print(f"{log_label + ' best params':<19}: {search.best_params_}")
    print(f"{log_label + ' test  Acc':<19}: {test_metrics['Accuracy']:.4f}")

    return {
        "Dataset": dataset_name,
        "Model": "LogReg",
        "Impl": impl,
        "CV_Acc": float(search.best_score_),
        "Test_Acc": float(test_metrics["Accuracy"]),
        "Precision": float(test_metrics["Precision"]),
        "Recall": float(test_metrics["Recall"]),
        "F1": float(test_metrics["F1-Score"]),
        "ROC-AUC": float(test_metrics["ROC-AUC"]),
        "BestParams": str(search.best_params_),
    }


def run_logreg_experiment(X_train_df, X_test_df, y_train_ser, y_test_ser, dataset_name):
    """Compare the from-scratch and scikit-learn logistic regressions on one dataset.

    Both models are tuned with ``GridSearchCV`` (accuracy scoring) on the
    training split using the same stratified 5-fold splitter, then the best
    estimator of each search is evaluated on the held-out test split.

    Parameters
    ----------
    X_train_df, X_test_df : pandas.DataFrame
        Feature tables; converted to float arrays.
    y_train_ser, y_test_ser : pandas.Series
        Target labels; converted to int arrays.
    dataset_name : str
        Name used in the printed banners and in the ``Dataset`` column.

    Returns
    -------
    pandas.DataFrame
        Two rows (``From-Scratch`` and ``Sklearn``) with CV accuracy, test
        metrics and the best hyper-parameters.
    """
    banner = "=" * 90
    print("\n" + banner)
    print(f"DATASET: {dataset_name}")
    print(f"Train size: {len(X_train_df)} | Test size: {len(X_test_df)}")
    print(banner)

    X_train = X_train_df.to_numpy(dtype=float)
    X_test = X_test_df.to_numpy(dtype=float)
    y_train = y_train_ser.to_numpy(dtype=int)
    y_test = y_test_ser.to_numpy(dtype=int)

    # One seeded, stratified splitter shared by both searches, so the two
    # implementations are compared on the same folds.
    cv_strat = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # 1) From-scratch model: standardization happens inside the estimator.
    custom_param_grid = {
        "learning_rate": [0.1, 0.01, 0.001],
        "n_iters": [1000, 2500, 5000],
        "lambda_param": [0.0, 0.1, 1.0],
    }
    custom_row = _grid_search_and_score(
        LogisticRegressionL2(standardize=True), custom_param_grid, cv_strat,
        X_train, y_train, X_test, y_test,
        dataset_name, impl="From-Scratch", log_label="Custom",
    )

    # 2) scikit-learn model: the scaler sits in the pipeline, so it is refit
    #    on the training part of every CV fold.
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(random_state=42, penalty="l2")),
    ])
    sk_param_grid = {
        "clf__C": [0.01, 0.1, 1, 10, 100],
        "clf__solver": ["liblinear", "lbfgs"],
        "clf__max_iter": [100, 200],
    }
    sklearn_row = _grid_search_and_score(
        pipe, sk_param_grid, cv_strat,
        X_train, y_train, X_test, y_test,
        dataset_name, impl="Sklearn", log_label="Sklearn",
    )

    # 3) Summary table with a fixed column order.
    summary_df = pd.DataFrame(
        [custom_row, sklearn_row],
        columns=[
            "Dataset", "Model", "Impl", "CV_Acc", "Test_Acc",
            "Precision", "Recall", "F1", "ROC-AUC", "BestParams",
        ],
    )

    print("\n" + banner)
    print("SUMMARY TABLE — " + dataset_name)
    print(banner)
    print(summary_df.to_string(index=False))

    return summary_df

In [13]:
def main():
    """Run the logistic-regression comparison on both heart-disease datasets.

    Each dataset is read from its CSV file, split 80/20 with stratification
    on the target column, and passed to ``run_logreg_experiment``. The
    per-dataset summaries are concatenated, printed, and returned.

    Returns
    -------
    pandas.DataFrame
        The merged summary table covering both datasets.
    """
    # (csv path, target column, display name) for every dataset to evaluate.
    datasets = [
        ("preprocessed_heart_disease_uci_unscaled.csv", "target", "UCI Heart Disease (unscaled)"),
        ("processed_heart_unscaled.csv", "HeartDisease", "Heart Failure Kaggle (unscaled)"),
    ]

    summaries = []
    for csv_path, target_col, dataset_name in datasets:
        frame = pd.read_csv(csv_path)
        features = frame.drop(target_col, axis=1)
        labels = frame[target_col]

        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=0.2, random_state=42, stratify=labels
        )

        summaries.append(
            run_logreg_experiment(X_train, X_test, y_train, y_test, dataset_name=dataset_name)
        )

    final_df = pd.concat(summaries, ignore_index=True)

    print("\n" + "="*90)
    print("FINAL SUMMARY TABLE — LOGISTIC REGRESSION (BOTH DATASETS)")
    print("="*90)
    print(final_df.to_string(index=False))

    return final_df

# Entry point: run both experiments only when this code executes as the main module.
if __name__ == "__main__":
    main()


DATASET: UCI Heart Disease (unscaled)
Train size: 736 | Test size: 184
Custom best CV Acc : 0.8207
Custom best params : {'lambda_param': 0.0, 'learning_rate': 0.1, 'n_iters': 2500}
Custom test  Acc   : 0.8370
Sklearn best CV Acc: 0.8207
Sklearn best params: {'clf__C': 10, 'clf__max_iter': 100, 'clf__solver': 'liblinear'}
Sklearn test  Acc  : 0.8370

SUMMARY TABLE — UCI Heart Disease (unscaled)
                     Dataset  Model         Impl   CV_Acc  Test_Acc  Precision   Recall   F1  ROC-AUC                                                       BestParams
UCI Heart Disease (unscaled) LogReg From-Scratch 0.820656  0.836957   0.867347 0.833333 0.85 0.917743     {'lambda_param': 0.0, 'learning_rate': 0.1, 'n_iters': 2500}
UCI Heart Disease (unscaled) LogReg      Sklearn 0.820656  0.836957   0.867347 0.833333 0.85 0.917982 {'clf__C': 10, 'clf__max_iter': 100, 'clf__solver': 'liblinear'}

DATASET: Heart Failure Kaggle (unscaled)
Train size: 734 | Test size: 184
Custom best CV Acc : 0.855