In [None]:
import pandas as pd
import numpy as np
import time

In [None]:
# Load the pre-processed German credit data from the Kaggle input directory.
csv_path = '/kaggle/input/german-credit-cleaned/processed_german_credit_data.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Show the column labels of the loaded frame.
df.columns

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [None]:
# Models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.svm import SVC


In [None]:

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier


In [None]:
# Separate the 'class' target column from the remaining feature columns.
y = df['class']
X = df.drop(columns='class')

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.metrics import confusion_matrix

def run_models(X_train, X_test, y_train, y_test, random_state=42):
    """
    Trains multiple classifiers on the given dataset and returns:
    - results_df: main performance metrics
    - confusion_df: TP, TN, FP, FN for each model

    Parameters
    ----------
    X_train, X_test
        Feature matrices used for fitting and for scoring respectively.
    y_train, y_test
        Matching target vectors. The confusion matrix is unpacked into four
        cells, so a binary target is expected.
    random_state : int or None, default 42
        Seed handed to every estimator that accepts one, so repeated runs
        produce the same comparison table. Pass None for unseeded behaviour.

    Returns
    -------
    results_df : pd.DataFrame
        One row per model with Accuracy, Precision, Recall, F1_Score,
        ROC_AUC, Gini (2 * ROC_AUC - 1) and Time_sec (seconds spent in fit).
    confusion_df : pd.DataFrame
        Indexed by model name, with True_Positive, True_Negative,
        False_Positive and False_Negative counts on the test split.
    """

    models = {
        "LogisticRegression": LogisticRegression(max_iter=1000, random_state=random_state),
        "NaiveBayes": GaussianNB(),
        "KNN": KNeighborsClassifier(),
        "DecisionTree": DecisionTreeClassifier(random_state=random_state),
        "RandomForest": RandomForestClassifier(n_estimators=100, random_state=random_state),
        "SVM_Linear": SVC(kernel='linear', probability=True, random_state=random_state),
        "SVM_RBF": SVC(kernel='rbf', probability=True, random_state=random_state),
        # Only the ensemble is seeded here; the inner SVC is left unseeded,
        # the same way the tuning baseline later in the notebook is built.
        "Bagging_SVM": BaggingClassifier(
            estimator=SVC(probability=True), n_estimators=10, random_state=random_state
        ),
        "ExtraTree": ExtraTreeClassifier(random_state=random_state),
        "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=random_state),
        "XGBoost": xgb.XGBClassifier(
            use_label_encoder=False, eval_metric='logloss', random_state=random_state
        ),
        "LightGBM": lgb.LGBMClassifier(random_state=random_state),
        "CatBoost": CatBoostClassifier(verbose=0, random_state=random_state)
    }

    results_list = []
    confusion_list = []

    for name, model in models.items():
        # perf_counter is monotonic, so the measured fit duration cannot be
        # skewed by wall-clock adjustments.
        start_time = time.perf_counter()
        model.fit(X_train, y_train)
        elapsed_time = time.perf_counter() - start_time

        y_pred = model.predict(X_test)
        # Rank by the second predict_proba column when available; otherwise
        # fall back to the hard labels.
        y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else y_pred

        # Confusion matrix
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

        # Computed once and reused for both the ROC_AUC and Gini columns.
        roc_auc = roc_auc_score(y_test, y_proba)

        results_list.append({
            "Model": name,
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred, zero_division=0),
            "Recall": recall_score(y_test, y_pred, zero_division=0),
            "F1_Score": f1_score(y_test, y_pred, zero_division=0),
            "ROC_AUC": roc_auc,
            "Gini": 2 * roc_auc - 1,
            "Time_sec": elapsed_time
        })

        confusion_list.append({
            "Model": name,
            "True_Positive": tp,
            "True_Negative": tn,
            "False_Positive": fp,
            "False_Negative": fn
        })

    results_df = pd.DataFrame(results_list)
    confusion_df = pd.DataFrame(confusion_list).set_index("Model")

    return results_df, confusion_df


In [None]:
# Keep a copy of the untouched feature names before they are rewritten.
original_columns = X_train.columns.tolist()

# Characters <, >, =, [ and ] in feature names are swapped for underscores,
# identically on the train and test frames so their columns stay aligned.
special_chars = r'[<>=\[\]]'
for frame in (X_train, X_test):
    frame.columns = frame.columns.str.replace(special_chars, '_', regex=True)

print(X_train.columns)


In [None]:
# Fit every candidate classifier on the training split and score it on the
# held-out split; returns the metrics table and the confusion-count table.
metrics_df, conf_matrix_df = run_models(X_train, X_test, y_train, y_test)

In [None]:
# Display the per-model metrics table.
metrics_df

In [None]:
# Persist the metrics table; the row index is omitted from the file.
metrics_df.to_csv('metrics.csv', index=False)

In [None]:
# Persist the confusion counts; the index (model name) is written as the
# first column of the file.
conf_matrix_df.to_csv('conf_matrices.csv')

In [None]:
# Display the TP / TN / FP / FN counts per model.
conf_matrix_df

Based on the results, the best candidates are `CatBoost` and `Bagging_SVM`. However, `Bagging_SVM` trains much faster, so I will choose it as the best model for classification.

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.utils.fixes import loguniform
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK

In [None]:
# Helper function to evaluate a model
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """
    Fit `model` on the training split and score it on the test split.

    Parameters
    ----------
    model
        Estimator exposing `fit` and `predict`; `predict_proba` is used for
        the ranking metrics when present. The object is fitted in place.
    X_train, y_train
        Data the model is fitted on.
    X_test, y_test
        Held-out data every metric is computed on.

    Returns
    -------
    dict
        Keys: accuracy, precision, recall, f1_score, roc_auc,
        gini (2 * roc_auc - 1) and time_sec (seconds spent in `fit`).
    """
    # perf_counter is monotonic, so the measured fit duration cannot be
    # skewed by wall-clock adjustments.
    start = time.perf_counter()
    model.fit(X_train, y_train)
    elapsed = time.perf_counter() - start

    y_pred = model.predict(X_test)
    # Rank by the second predict_proba column when available; otherwise fall
    # back to the hard labels.
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else y_pred

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc = roc_auc_score(y_test, y_proba)
    gini = 2 * roc - 1

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "roc_auc": roc,
        "gini": gini,
        "time_sec": elapsed
    }



In [None]:
# --- 2. Base model ---
# Untuned reference point: a seeded bag of ten probability-enabled SVCs.
base_svc = SVC(probability=True)
base_model = BaggingClassifier(estimator=base_svc, n_estimators=10, random_state=42)

# Score the baseline so the tuned variants have something to be compared to.
base_results = evaluate_model(base_model, X_train, X_test, y_train, y_test)

In [None]:

# --- Grid Search ---
# Exhaustive sweep over SVC settings (reached through the `estimator__`
# prefix) and the ensemble size.
param_grid = dict(
    estimator__C=[0.1, 1, 10],
    estimator__kernel=["linear", "rbf"],
    estimator__gamma=["scale", "auto"],
    n_estimators=[5, 10, 20],
)

# Candidates are ranked by 3-fold cross-validated ROC AUC on the training split.
grid_search = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Re-fit and score the winning configuration on the held-out split.
grid_results = evaluate_model(
    grid_search.best_estimator_, X_train, X_test, y_train, y_test
)


In [None]:
# --- Random Search ---
# C is drawn from a log-uniform distribution; the remaining settings are
# picked from discrete candidate lists.
param_dist = dict(
    estimator__C=loguniform(1e-2, 1e2),
    estimator__kernel=["linear", "rbf"],
    estimator__gamma=["scale", "auto"],
    n_estimators=np.arange(5, 31, 5),
)

# 20 sampled configurations, each ranked by 3-fold cross-validated ROC AUC.
random_search = RandomizedSearchCV(
    estimator=base_model,
    param_distributions=param_dist,
    n_iter=20,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Re-fit and score the winning configuration on the held-out split.
random_results = evaluate_model(
    random_search.best_estimator_, X_train, X_test, y_train, y_test
)


In [None]:
# --- Bayesian Optimization (skopt) ---
# The space covers the same hyper-parameters as the grid and random searches
# (C, kernel, gamma, ensemble size) so the tuning methods are compared on a
# like-for-like basis.
bayes_search = BayesSearchCV(
    base_model,
    {
        "estimator__C": Real(1e-2, 1e2, prior='log-uniform'),
        "estimator__kernel": Categorical(['linear', 'rbf']),
        "estimator__gamma": Categorical(['scale', 'auto']),
        "n_estimators": Integer(5, 30)
    },
    n_iter=20,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=42
)

bayes_search.fit(X_train, y_train)

# Re-fit and score the winning configuration on the held-out split.
bayes_results = evaluate_model(bayes_search.best_estimator_, X_train, X_test, y_train, y_test)


In [None]:
# --- Manual tuning loop over C and gamma (RBF kernel only) ---
# Despite the "hypergrad" label used for its results, this is a plain nested
# loop over a fixed candidate list, not a gradient-based method.
#
# Candidates are ranked by 3-fold cross-validated ROC AUC on the training
# split, the same criterion the other tuning methods use. The test split is
# only touched by the final evaluate_model call, so the reported metrics are
# not inflated by selecting the model on the data it is scored on.
best_score = -np.inf
best_model = None
for C in [0.01, 0.1, 1, 10, 100]:
    for gamma in ['scale', 'auto']:
        model = BaggingClassifier(
            estimator=SVC(C=C, gamma=gamma, kernel='rbf', probability=True),
            n_estimators=10,
            random_state=42
        )
        # cross_val_score fits clones, so `model` itself stays unfitted here;
        # evaluate_model below performs the real fit of the winner.
        score = cross_val_score(
            model, X_train, y_train, cv=3, scoring='roc_auc', n_jobs=-1
        ).mean()
        if score > best_score:
            best_score = score
            best_model = model

hypergrad_results = evaluate_model(best_model, X_train, X_test, y_train, y_test)


In [None]:
# --- Collect all results ---
# One metrics dict per tuning strategy; each dict key becomes a row label.
results_by_method = {
    "bagging_svm": base_results,
    "grid_search_tuned": grid_results,
    "random_search_tuned": random_results,
    "bayes_opt_tuned": bayes_results,
    "hypergrad_tuned": hypergrad_results,
}
tuning_methods_df = pd.DataFrame.from_dict(results_by_method, orient="index")

In [None]:
# Display the side-by-side comparison of the tuning strategies.
tuning_methods_df

In [None]:
# Save the tuning results DataFrame to CSV; the index (tuning-method label)
# is written as the first column.
tuning_methods_df.to_csv("tuning_methods.csv")

## Effect of learning rate on the performance

In [None]:
import time
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Ten learning rates to sweep, from 0.001 up to 1.0
learning_rates = [0.001, 0.01, 0.03, 0.05, 0.1, 0.2, 0.3, 0.5, 0.7, 1.0]

results = []

for lr in learning_rates:
    # Only the learning rate varies; every other CatBoost setting is fixed.
    model = CatBoostClassifier(
        learning_rate=lr,
        iterations=500,
        depth=6,
        eval_metric='AUC',
        random_state=42,
        verbose=0
    )

    # Reuse the notebook's evaluate_model helper (fit, timing, predictions
    # and metrics) instead of repeating that logic inline.
    scores = evaluate_model(model, X_train, X_test, y_train, y_test)

    results.append({
        "LearningRate": lr,
        "Accuracy": scores["accuracy"],
        "Precision": scores["precision"],
        "Recall": scores["recall"],
        "F1_Score": scores["f1_score"],
        "ROC_AUC": scores["roc_auc"],
        "Gini": scores["gini"],
        "Time_sec": scores["time_sec"]
    })

# Convert to DataFrame
perf_by_lr_df = pd.DataFrame(results)


In [None]:
# Display the metrics obtained for each learning rate.
perf_by_lr_df

In [None]:
# Save to CSV without the positional row index, which carries no information
# here and would otherwise appear as an unnamed extra column.
perf_by_lr_df.to_csv('perf_by_lr.csv', index=False)

## Effect of number of estimators on performance

In [None]:
# Different numbers of estimators to test
n_estimators_list = [50, 80, 100, 200, 300, 400, 500, 800, 1000]

results = []

for n in n_estimators_list:
    # Only the iteration count varies; every other CatBoost setting is fixed.
    model = CatBoostClassifier(
        iterations=n,
        learning_rate=0.1,  # keep learning rate fixed
        depth=6,
        eval_metric='AUC',
        random_state=42,
        verbose=0
    )

    # Reuse the notebook's evaluate_model helper (fit, timing, predictions
    # and metrics) instead of repeating that logic inline.
    scores = evaluate_model(model, X_train, X_test, y_train, y_test)

    results.append({
        "n_estimators": n,
        "Accuracy": scores["accuracy"],
        "Precision": scores["precision"],
        "Recall": scores["recall"],
        "F1_Score": scores["f1_score"],
        "ROC_AUC": scores["roc_auc"],
        "Gini": scores["gini"],
        "Time_sec": scores["time_sec"]
    })

# Convert to DataFrame
perf_by_nestimators_df = pd.DataFrame(results)


In [None]:
# Save to CSV without the positional row index, which carries no information
# here and would otherwise appear as an unnamed extra column.
perf_by_nestimators_df.to_csv("perf_by_nestimators.csv", index=False)