In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [29]:
from sklearn.model_selection import train_test_split

df = pd.read_csv("loan_preprocess_dataset.csv")

# Ensure target column is clean (remove extra spaces if any)
df['loan_status'] = df['loan_status'].str.strip()


# Split features and target
X = df.drop("loan_status", axis=1)
y = df["loan_status"]

# Convert categorical target to numeric (0/1)
y = y.map({"Rejected": 0, "Approved": 1})

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# Working on Selected Model's

In [30]:
# 1. Define Models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
}

# 2. Define k-Fold CV (Stratified to preserve class balance)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 3. Define Metrics
scoring = {
    "accuracy": make_scorer(accuracy_score),
    "precision": make_scorer(precision_score),
    "recall": make_scorer(recall_score),
    "f1": make_scorer(f1_score)
}

# 4. Run CV for Each Model
results = {}
for name, model in models.items():
    scores = cross_validate(model, X, y, cv=cv, scoring=scoring, return_train_score=False)
    results[name] = {metric: np.mean(scores[f'test_{metric}']) for metric in scoring.keys()}


results_df = pd.DataFrame(results).T
print("\n📊 Cross-Validation Results (5-Fold):\n")
print(results_df.round(4))



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.upd


📊 Cross-Validation Results (5-Fold):

                     accuracy  precision  recall      f1
Logistic Regression    0.9154     0.9325  0.9319  0.9321
Random Forest          0.9803     0.9809  0.9877  0.9843
XGBoost                0.9847     0.9847  0.9908  0.9877


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


# Hyper-parameter Tuning Optuna

# Tuning Logistic Regression

In [31]:
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

def objective(trial):
    # Suggest values for hyperparameters
    penalty = trial.suggest_categorical("penalty", ["l1", "l2", "elasticnet", None])
    max_iter = trial.suggest_int("max_iter", 100, 2000)
    C = trial.suggest_float("C", 1e-3, 10.0, log=True)

    # Solver depends on penalty
    if penalty == "l1":
        solver = "liblinear"
    elif penalty == "elasticnet":
        solver = "saga"
    elif penalty is None:
        solver = "lbfgs"
    else:
        solver = "lbfgs"

    # l1_ratio only if elasticnet
    l1_ratio = None
    if penalty == "elasticnet":
        l1_ratio = trial.suggest_float("l1_ratio", 0.0, 1.0)

    # Logistic Regression model (no scaler since data is already scaled)
    model = LogisticRegression(
        penalty=penalty,
        solver=solver,
        C=C,
        l1_ratio=l1_ratio,
        max_iter=max_iter,
        random_state=42
    )

    # Cross-validation
    score = cross_val_score(model, X_train, y_train, cv=3, scoring="accuracy").mean()
    return score

# Optuna study
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=30)

print("Best trial accuracy:", study.best_trial.value)
print("Best hyperparameters:", study.best_trial.params)


[I 2025-08-24 19:37:12,111] A new study created in memory with name: no-name-4263a6b9-be0c-493f-9d70-ead1e6c40843
[I 2025-08-24 19:37:12,734] Trial 0 finished with value: 0.6321164092984373 and parameters: {'penalty': 'elasticnet', 'max_iter': 341, 'C': 3.0760946264885773, 'l1_ratio': 0.5403839239107414}. Best is trial 0 with value: 0.6321164092984373.
[I 2025-08-24 19:37:14,350] Trial 1 finished with value: 0.6470966084275437 and parameters: {'penalty': 'elasticnet', 'max_iter': 873, 'C': 0.1687711317913671, 'l1_ratio': 0.416795615844022}. Best is trial 1 with value: 0.6470966084275437.
[I 2025-08-24 19:37:14,382] Trial 2 finished with value: 0.7840081034902536 and parameters: {'penalty': 'l1', 'max_iter': 1637, 'C': 0.0016331808516922292}. Best is trial 2 with value: 0.7840081034902536.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also ref

Best trial accuracy: 0.9179136475186428
Best hyperparameters: {'penalty': 'l2', 'max_iter': 1720, 'C': 0.07183378842320998}


# Random Forest 

In [32]:


# Define the Optuna objective function
def objective(trial):
    # Suggest hyperparameters
    n_estimators = trial.suggest_int("n_estimators", 50, 500)  # number of trees
    max_depth = trial.suggest_int("max_depth", 2, 30)  # tree depth

    # Define model
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=42,
        n_jobs=-1  # use all cores
    )

    # Perform CV accuracy
    score = cross_val_score(model, X_train, y_train, cv=3, scoring="accuracy").mean()
    return score

# Run Optuna
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=30)

# Print best result
print(f"Best trial accuracy: {study.best_trial.value}")
print(f"Best hyperparameters: {study.best_trial.params}")


[I 2025-08-24 19:37:40,750] A new study created in memory with name: no-name-3821ffd4-5b78-47ae-b6b1-f2476c6c2092
[I 2025-08-24 19:37:41,557] Trial 0 finished with value: 0.9718378935665485 and parameters: {'n_estimators': 101, 'max_depth': 7}. Best is trial 0 with value: 0.9718378935665485.
[I 2025-08-24 19:37:42,812] Trial 1 finished with value: 0.975135525003178 and parameters: {'n_estimators': 180, 'max_depth': 14}. Best is trial 1 with value: 0.975135525003178.
[I 2025-08-24 19:37:43,605] Trial 2 finished with value: 0.9754347465423887 and parameters: {'n_estimators': 108, 'max_depth': 22}. Best is trial 2 with value: 0.9754347465423887.
[I 2025-08-24 19:37:47,149] Trial 3 finished with value: 0.9763332191389745 and parameters: {'n_estimators': 483, 'max_depth': 17}. Best is trial 3 with value: 0.9763332191389745.
[I 2025-08-24 19:37:48,809] Trial 4 finished with value: 0.9763334884652926 and parameters: {'n_estimators': 228, 'max_depth': 13}. Best is trial 4 with value: 0.9763334

Best trial accuracy: 0.9766329793308212
Best hyperparameters: {'n_estimators': 217, 'max_depth': 30}


# XGboost

In [33]:


# Define objective function for Optuna
def objective(trial):
    # Suggest values
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    max_depth = trial.suggest_int("max_depth", 3, 15)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)

    # Define model
    model = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=42
    )

    # Cross-validation score (3-fold, accuracy)
    score = cross_val_score(model, X_train, y_train, cv=3, scoring="accuracy").mean()
    return score

# Run optimization
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=30)

# Print results
print("Best trial accuracy:", study.best_trial.value)
print("Best hyperparameters:", study.best_trial.params)


[I 2025-08-24 19:38:32,750] A new study created in memory with name: no-name-337aac6e-4ede-4d35-8db8-e810772b39bb
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-08-24 19:38:33,482] Trial 0 finished with value: 0.9769319315437137 and parameters: {'n_estimators': 275, 'max_depth': 3, 'learning_rate': 0.04646313565412953}. Best is trial 0 with value: 0.9769319315437137.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-08-24 19:38:34,149] Trial 1 finished with value: 0.9793275891416248 and parameters: {'n_est

Best trial accuracy: 0.981125611640068
Best hyperparameters: {'n_estimators': 454, 'max_depth': 5, 'learning_rate': 0.248815576263874}
