In [3]:
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import torch
from torch.utils.data import DataLoader, TensorDataset

from get_dataset import X
from get_dataset import y


In [4]:

# Step 1: Hold-out split.
# 20% of the samples are set aside for evaluation; the fixed random_state makes
# the partition repeatable between runs.
# NOTE(review): the recorded classification report for this cell shows the
# positive class is a minority (1091 of 9043 test rows), so accuracy alone is an
# optimistic summary and a stratified split may be worth considering.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 2: Fit the baseline gradient-boosted classifier on the training split.
print("Training standard XGBoost model...")
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# Step 3: Score the held-out split and print accuracy, the per-class report and
# the confusion matrix. Each heading and its value are separated by a newline.
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"XGBoost Model Accuracy: {accuracy:.4f}")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

# Step 4: For PyTorch integration, we can use XGBoost's output as features for a PyTorch model
# or use PyTorch's DataLoader for handling the data going into XGBoost

# Convert data to PyTorch tensors.
# np.asarray(..., dtype=np.float32) accepts NumPy arrays, pandas objects and plain
# sequences alike and fixes the dtype up front; torch.tensor then copies the data,
# so the tensors never alias the arrays used to fit the baseline model. This
# replaces the legacy torch.FloatTensor(...) constructor.
X_train_tensor = torch.tensor(np.asarray(X_train, dtype=np.float32))
y_train_tensor = torch.tensor(np.asarray(y_train, dtype=np.float32))
X_test_tensor = torch.tensor(np.asarray(X_test, dtype=np.float32))
y_test_tensor = torch.tensor(np.asarray(y_test, dtype=np.float32))

# Create DataLoaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# The training loader shuffles; a dedicated seeded generator makes the batch
# order identical on every Restart & Run All instead of depending on global RNG
# state. The test loader keeps the original row order.
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Option 1: Use XGBoost with PyTorch DataLoader
print("\nTraining XGBoost with PyTorch DataLoader...")
def train_xgb_with_pytorch_loader(train_loader, **xgb_params):
    """Fit an XGBoost classifier on every sample yielded by a batch iterable.

    The classifier is fitted on one in-memory matrix, so all batches are
    gathered and stacked first; ``train_loader`` is iterated exactly once.

    Args:
        train_loader: Iterable of ``(features, labels)`` tensor pairs, for
            example a ``torch.utils.data.DataLoader`` over a ``TensorDataset``.
        **xgb_params: Optional keyword arguments forwarded to
            ``xgb.XGBClassifier``. They override the defaults below, so calling
            the function without them behaves as before.

    Returns:
        The fitted ``xgb.XGBClassifier``.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    feature_batches = []
    label_batches = []
    for X_batch, y_batch in train_loader:
        # detach()/cpu() keep the NumPy conversion working for tensors that
        # track gradients or live on an accelerator; they are no-ops otherwise.
        feature_batches.append(X_batch.detach().cpu().numpy())
        label_batches.append(y_batch.detach().cpu().numpy())

    if not feature_batches:
        # Fail with a message that names the cause instead of NumPy's generic
        # "need at least one array to concatenate".
        raise ValueError(
            "train_loader yielded no batches; cannot train XGBoost on empty data"
        )

    # Concatenate batches
    X_train_combined = np.vstack(feature_batches)
    y_train_combined = np.concatenate(label_batches)

    # NOTE(review): these defaults omit the subsample/colsample_bytree settings
    # that the standalone xgb_model in this notebook uses; pass them through
    # xgb_params if the two models are meant to be comparable.
    params = {
        'n_estimators': 100,
        'learning_rate': 0.1,
        'max_depth': 4,
        'objective': 'binary:logistic',
        'random_state': 42,
    }
    params.update(xgb_params)

    # Train XGBoost
    xgb_model_from_loader = xgb.XGBClassifier(**params)
    xgb_model_from_loader.fit(X_train_combined, y_train_combined)
    return xgb_model_from_loader

# Fit a second classifier from the batches served by train_loader. The result is
# bound to xgb_model_pytorch; it is not referenced again in the visible part of
# this cell.
xgb_model_pytorch = train_xgb_with_pytorch_loader(train_loader)

# Option 2: a small feed-forward PyTorch network for binary classification.
# NOTE(review): nothing in the visible code instantiates this class or feeds it
# XGBoost outputs; any coupling to XGBoost would have to happen at the call site.
class LoanDefaultClassifier(torch.nn.Module):
    """Three-layer fully connected network ending in a sigmoid.

    Layer widths are ``input_size -> 64 -> 32 -> 1`` with a ReLU after each of
    the first two linear layers. Because the last activation is a sigmoid,
    every output value lies in the closed interval [0, 1].
    """

    def __init__(self, input_size):
        """Create the layers.

        Args:
            input_size: Number of features in each input row.
        """
        super().__init__()
        self.fc1 = torch.nn.Linear(input_size, 64)
        self.fc2 = torch.nn.Linear(64, 32)
        self.fc3 = torch.nn.Linear(32, 1)
        self.relu = torch.nn.ReLU()
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        """Run the network on ``x`` and return the sigmoid of the final layer.

        Args:
            x: Tensor whose last dimension equals ``input_size``.

        Returns:
            Tensor shaped like ``x`` except that the last dimension is 1.
        """
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        return self.sigmoid(self.fc3(hidden))

# Extract feature importance from XGBoost
# The scores are read from the fitted xgb_model and printed one per line, keyed
# by positional feature index (no column names are used here) and formatted to
# four decimal places.
print("\nFeature Importance from XGBoost:")
feature_importance = xgb_model.feature_importances_
for i, importance in enumerate(feature_importance):
    print(f"Feature {i}: {importance:.4f}")

# Function to evaluate the XGBoost model on the data served by a test loader
def evaluate_models(xgb_model, test_loader):
    """Print and return the accuracy of ``xgb_model`` on ``test_loader``'s data.

    The evaluation data comes from the ``test_loader`` argument rather than from
    notebook-level variables, so the result always matches the loader that was
    passed in.

    Args:
        xgb_model: Fitted estimator exposing ``predict(features)``.
        test_loader: Iterable of ``(features, labels)`` tensor pairs, for
            example a ``torch.utils.data.DataLoader`` over a ``TensorDataset``.

    Returns:
        The accuracy computed by ``accuracy_score`` over all batches.

    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """
    feature_batches = []
    label_batches = []
    for X_batch, y_batch in test_loader:
        # detach()/cpu() keep the NumPy conversion working for tensors that
        # track gradients or live on an accelerator; they are no-ops otherwise.
        feature_batches.append(X_batch.detach().cpu().numpy())
        label_batches.append(y_batch.detach().cpu().numpy())

    if not feature_batches:
        raise ValueError(
            "test_loader yielded no batches; cannot evaluate on empty data"
        )

    # Evaluate XGBoost
    X_test_np = np.vstack(feature_batches)
    y_test_np = np.concatenate(label_batches)
    y_pred_xgb = xgb_model.predict(X_test_np)
    xgb_accuracy = accuracy_score(y_test_np, y_pred_xgb)
    print(f"\nXGBoost Final Test Accuracy: {xgb_accuracy:.4f}")
    return xgb_accuracy
    
# Report the final test accuracy of the baseline xgb_model.
evaluate_models(xgb_model, test_loader)

# Additional code for hyperparameter tuning (optional)
def hyperparameter_tuning(features=None, labels=None, param_grid=None, cv=3, scoring='accuracy'):
    """Grid-search XGBoost hyperparameters with cross-validation.

    Called without arguments it behaves as before: it searches the built-in
    grid on the notebook-level ``X_train``/``y_train`` with 3-fold CV scored by
    accuracy.

    Args:
        features: Training feature matrix. Defaults to the notebook-level
            ``X_train`` when both ``features`` and ``labels`` are omitted.
        labels: Training labels. Defaults to the notebook-level ``y_train``
            when both ``features`` and ``labels`` are omitted.
        param_grid: Mapping of parameter name to candidate values. Defaults to
            the five-parameter grid defined below.
        cv: Cross-validation setting forwarded to ``GridSearchCV``.
        scoring: Scoring setting forwarded to ``GridSearchCV``.

    Returns:
        ``best_estimator_`` of the fitted ``GridSearchCV``.

    Raises:
        ValueError: If only one of ``features`` and ``labels`` is given.
    """
    # Imported lazily because tuning is opt-in and not part of the main run.
    from sklearn.model_selection import GridSearchCV

    if (features is None) != (labels is None):
        # Mixing caller-supplied data with the notebook globals would silently
        # pair features and labels from different sources.
        raise ValueError("features and labels must be provided together")
    if features is None:
        features, labels = X_train, y_train

    if param_grid is None:
        # 3**5 = 243 candidate settings; with the default cv=3 that is 729 fits.
        param_grid = {
            'max_depth': [3, 4, 5],
            'learning_rate': [0.01, 0.1, 0.2],
            'n_estimators': [50, 100, 200],
            'subsample': [0.8, 0.9, 1.0],
            'colsample_bytree': [0.8, 0.9, 1.0]
        }

    print("\nPerforming hyperparameter tuning (this may take some time)...")
    grid_search = GridSearchCV(
        estimator=xgb.XGBClassifier(objective='binary:logistic', random_state=42),
        param_grid=param_grid,
        scoring=scoring,
        cv=cv,
        verbose=1
    )

    grid_search.fit(features, labels)

    # Keep the original wording for the default metric; use a neutral label
    # when the caller selected a different scorer.
    score_label = "accuracy" if scoring == 'accuracy' else "score"
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best {score_label}: {grid_search.best_score_:.4f}")

    return grid_search.best_estimator_

# Uncomment to run hyperparameter tuning
# best_xgb_model = hyperparameter_tuning()

Training standard XGBoost model...
XGBoost Model Accuracy: 0.9034

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95      7952
           1       0.66      0.42      0.51      1091

    accuracy                           0.90      9043
   macro avg       0.79      0.69      0.73      9043
weighted avg       0.89      0.90      0.89      9043


Confusion Matrix:
[[7715  237]
 [ 637  454]]

Training XGBoost with PyTorch DataLoader...

Feature Importance from XGBoost:
Feature 0: 0.0375
Feature 1: 0.0124
Feature 2: 0.0156
Feature 3: 0.0237
Feature 4: 0.0103
Feature 5: 0.0174
Feature 6: 0.1305
Feature 7: 0.0626
Feature 8: 0.0987
Feature 9: 0.0209
Feature 10: 0.0592
Feature 11: 0.1951
Feature 12: 0.0279
Feature 13: 0.0491
Feature 14: 0.0665
Feature 15: 0.1725

XGBoost Final Test Accuracy: 0.9034
