In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.feature_selection import RFE
import xgboost as xgb
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import numpy as np
print(xgb.__version__)

2.1.2


In [None]:
# Load data
data = pd.read_csv("Loan_default.csv")
data

# Preprocess the data
categorical_cols = data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    data[col] = pd.factorize(data[col])[0]

data.columns = data.columns.str.lower().str.replace(' ', '_')
data.columns = data.columns.str.replace('-', '_').str.replace('.', '_')
data.dropna(inplace=True)
data.drop_duplicates(inplace=True)

In [None]:
# Normalize numeric columns
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

In [None]:
# Split into features and target
X = data.drop('default', axis=1)
y = data['default']

In [None]:
# Train-Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Apply SMOTE to balance the data
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [None]:
#selector = RFE(LogisticRegression(), n_features_to_select=10, step=1)
#X_train_rfe = selector.fit_transform(X_train_rfe, y_train_res)
#X_test_rfe = selector.transform(X_test)

In [None]:
xgb_model = xgb.XGBClassifier(
    n_estimators=200,                # Set number of estimators
    learning_rate=0.2,               # Set learning rate
    verbosity=2,                      # Set verbosity level for logs
    random_state=42,                  # For reproducibility
    early_stopping_rounds=10,         # Early stopping to prevent overfitting
    eval_metric='logloss'             # Evaluation metric
)

#random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_grid,
                                   #n_iter=10, cv=3, scoring='roc_auc', n_jobs=-1, verbose=2, random_state=42)

#random_search.fit(X_train, y_train)

# Print the best hyperparameters found
#print("Best hyperparameters:", random_search.best_params_)

#best_xgb_model = random_search.best_estimator_

xgb_model.fit(
    X_train, y_train,                # Train with the training set
    eval_set=[(X_test, y_test)],         # Evaluation set for early stopping               # Evaluation metric
    verbose=True,                                   # Show training progress
)

[0]	validation_0-logloss:0.65903
[1]	validation_0-logloss:0.63504
[2]	validation_0-logloss:0.61593
[3]	validation_0-logloss:0.59821
[4]	validation_0-logloss:0.57941
[5]	validation_0-logloss:0.56434
[6]	validation_0-logloss:0.55161
[7]	validation_0-logloss:0.53581
[8]	validation_0-logloss:0.51529
[9]	validation_0-logloss:0.49582
[10]	validation_0-logloss:0.47928
[11]	validation_0-logloss:0.46708
[12]	validation_0-logloss:0.45349
[13]	validation_0-logloss:0.44599
[14]	validation_0-logloss:0.43876
[15]	validation_0-logloss:0.42814
[16]	validation_0-logloss:0.41940
[17]	validation_0-logloss:0.41391
[18]	validation_0-logloss:0.40464
[19]	validation_0-logloss:0.40086
[20]	validation_0-logloss:0.39841
[21]	validation_0-logloss:0.39435
[22]	validation_0-logloss:0.38887
[23]	validation_0-logloss:0.38265
[24]	validation_0-logloss:0.38014
[25]	validation_0-logloss:0.37551
[26]	validation_0-logloss:0.37294
[27]	validation_0-logloss:0.36867
[28]	validation_0-logloss:0.36584
[29]	validation_0-loglos

In [None]:
# After fitting the model, make predictions on the test set
y_pred = xgb_model.predict(X_test)

# Optionally, get the predicted probabilities (for metrics like AUC)
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]  # Probability for the positive class

# Evaluate model performance

# 1. Accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# 2. AUC-ROC (Area Under the Receiver Operating Characteristic Curve)
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(y_test, y_pred_proba)
print(f"AUC-ROC: {auc:.4f}")

# 3. F1 Score
from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.4f}")

# 4. Confusion Matrix
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# 5. Classification Report (includes precision, recall, F1-score)
from sklearn.metrics import classification_report
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)

Accuracy: 0.8855
AUC-ROC: 0.7485
F1 Score: 0.1436
Confusion Matrix:
[[44734   405]
 [ 5441   490]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.89      0.99      0.94     45139
         1.0       0.55      0.08      0.14      5931

    accuracy                           0.89     51070
   macro avg       0.72      0.54      0.54     51070
weighted avg       0.85      0.89      0.85     51070



NameError: name 'custom_oversample' is not defined