# XGBoost Model

In [2]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Read the two CSV splits used by the rest of the notebook.
# The training file name indicates it was already SMOTE-resampled upstream;
# the test file is loaded as-is.
TRAIN_CSV = "../data/processed/train_set_SMOTEd.csv"
TEST_CSV = "../data/processed/test_set.csv"
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

### Prepare Features and Train the Model

In [3]:
# Define feature columns (all except 'id' and 'credit_status')
feature_columns = [col for col in train_df.columns if col not in ["credit_status", "id"]]

# Select the same feature columns from both splits
X_train = train_df[feature_columns]
X_test = test_df[feature_columns]

# Encode the target with ONE encoder fitted on the training labels and reuse
# it for the test labels. Fitting a second, independent encoder on the test
# split would derive its mapping from whatever labels happen to be present
# there, so a missing or extra class would silently shift the integer codes
# relative to what the model was trained on. `transform` instead raises a
# ValueError if the test split contains a label unseen during training.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df["credit_status"])
y_test = label_encoder.transform(test_df["credit_status"])

# Initialize and train XGBoost classifier (log-loss as the evaluation metric)
xgb_model = XGBClassifier(eval_metric='logloss')
xgb_model.fit(X_train, y_train);

### Evaluation

In [4]:
# Score the held-out rows with the fitted classifier
y_pred = xgb_model.predict(X_test)

# Pair every customer id with its predicted (encoded) class
results_df = test_df[["id"]].assign(predicted_credit_status=y_pred)

# Preview the first few predictions
print(results_df.head())

# Compute the metrics first, then report them.
# Read the per-class rows of the report alongside accuracy: overall accuracy
# by itself does not show how each individual class is being predicted.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", report)

        id  predicted_credit_status
0  5052720                        0
1  5087861                        1
2  5068206                        1
3  5137255                        1
4  5023163                        1
Accuracy: 0.8764399341744378

Confusion Matrix:
 [[ 125  516]
 [ 385 6266]]

Classification Report:
               precision    recall  f1-score   support

           0       0.25      0.20      0.22       641
           1       0.92      0.94      0.93      6651

    accuracy                           0.88      7292
   macro avg       0.58      0.57      0.58      7292
weighted avg       0.86      0.88      0.87      7292

