In [14]:
from data_loader import data_loader, preprocess
from PCA import fair_PCA
import xgboost as xgb
import sklearn.metrics as metrics
import pickle


# Names of the one-hot race indicator columns.
one_hot_cols = [
    'Race_American_Indian_Alaska_Native',
    'Race_Asian',
    'Race_Black_African_American',
    'Race_Native_Hawaiian_Pacific_Islander',
    'Race_White',
    'Race_White_Latino',
]

# Feature whitelist: the data is filtered down to the columns listed here.
# Currently disabled (not in the list): 'race_ethnicity', 'joint_sex',
# "minority_population", 'purchaser_type'.
features = [
    'loan_amount_000s',
    'loan_type',
    'property_type',
    'applicant_income_000s',
    'hud_median_family_income',
    'tract_to_msamd_income',
    'number_of_owner_occupied_units',
    'number_of_1_to_4_family_units',
    'state_code',
    'county_code',
    'lien_status',
]

In [15]:
# Forwarded to data_loader as `num` (presumably the number of rows to load -- confirm).
num_samples = 1_000_000
df = data_loader(one_hot_cols, num=num_samples)

# preprocess returns nine values: feature splits, label splits and group splits,
# each in train / val / test order.
(
    x_train, x_val, x_test,
    y_train, y_val, y_test,
    train_groups, val_groups, test_groups,
) = preprocess(df, features, one_hot_cols)

# Run fair PCA on the training split only, requesting as many components as
# there are input columns.
X_fair_PCA, U, explained_variance = fair_PCA(
    x_train,
    n_components=x_train.shape[1],
    groups=train_groups,
)

# Map the test split through the same matrix U returned for the training data.
# NOTE(review): assumes fair_PCA needs no extra centering/scaling of x_test
# before the projection -- confirm against PCA.fair_PCA.
x_test_pca = x_test @ U

Loading data...
processed_data.csv exists. Loading data from file.
x_train: 70.00%
x_val: 15.00%
x_test: 15.00%
Num features BEFORE filtering features 54
Num features AFTER filtering features 11
x_train shape:  (395460, 11)
y_train shape:  (84742, 11)


In [9]:
# Class imbalance: how many label-0 rows there are per label-1 row in training.
negative_count = len(y_train[y_train == 0])
positive_count = len(y_train[y_train == 1])
imbalance_ratio = negative_count / positive_count

# Hyper-parameters handed to xgb.train
params = dict(
    objective='binary:logistic',        # binary classification, probability output
    eval_metric='logloss',              # logarithmic loss
    eta=0.3,                            # learning rate
    max_depth=6,                        # depth limit per tree
    min_child_weight=1,                 # minimum summed instance weight (Hessian) in a child
    subsample=0.8,                      # fraction of training rows sampled per tree
    colsample_bytree=0.8,               # fraction of columns sampled per tree
    scale_pos_weight=imbalance_ratio,   # compensates for the class imbalance
    seed=42,                            # fixed seed for reproducibility
)

In [10]:
def train_xgb(X, y, x_t, y_t, params, rounds):
    """Train an XGBoost model and print evaluation metrics for a held-out split.

    Args:
        X: Training features, passed to ``xgb.DMatrix``.
        y: Training labels.
        x_t: Evaluation features, passed to ``xgb.DMatrix``.
        y_t: Evaluation labels; every metric below is computed against these.
        params: Parameter dict forwarded to ``xgb.train``.
        rounds: Number of boosting rounds forwarded to ``xgb.train``.

    Returns:
        The model object returned by ``xgb.train``.

    Side effects:
        Prints accuracy, weighted F1, ROC AUC and a classification report.
    """
    # Wrap both splits in XGBoost's DMatrix container
    dtrain = xgb.DMatrix(X, label=y)
    dtest = xgb.DMatrix(x_t, label=y_t)

    model = xgb.train(params, dtrain, rounds)

    # Raw prediction scores for the evaluation split
    y_pred = model.predict(dtest)

    # Threshold the scores at 0.5 to obtain hard class labels
    y_pred_labels = (y_pred > 0.5).astype(int)

    # All metrics are scored against the y_t argument. Reading the notebook-level
    # y_test here would silently report numbers for the wrong labels (or raise
    # NameError) whenever a different split, e.g. validation, is passed in.
    accuracy = metrics.accuracy_score(y_t, y_pred_labels)
    print("Accuracy: {:.2f}".format(accuracy))

    f1 = metrics.f1_score(y_t, y_pred_labels, average='weighted')
    print("F1 Score: {:.2f}".format(f1))

    # ROC AUC is computed from the continuous scores, not the thresholded labels
    roc_auc = metrics.roc_auc_score(y_t, y_pred)
    print("ROC AUC: {:.2f}".format(roc_auc))

    report = metrics.classification_report(y_t, y_pred_labels)
    print("Classification Report:")
    print(report)

    return model

In [12]:
# Baseline: train on the original feature columns, evaluate on the test split.
model = train_xgb(
    X=x_train,
    y=y_train,
    x_t=x_test,
    y_t=y_test,
    params=params,
    rounds=100,
)

# Same configuration, trained and evaluated on the fair-PCA projections.
model_fair = train_xgb(
    X=X_fair_PCA,
    y=y_train,
    x_t=x_test_pca,
    y_t=y_test,
    params=params,
    rounds=100,
)

Accuracy: 0.72
F1 Score: 0.73
ROC AUC: 0.70
Classification Report:
              precision    recall  f1-score   support

           0       0.37      0.52      0.43     17455
           1       0.86      0.77      0.81     67287

    accuracy                           0.72     84742
   macro avg       0.61      0.64      0.62     84742
weighted avg       0.76      0.72      0.73     84742

Accuracy: 0.69
F1 Score: 0.70
ROC AUC: 0.63
Classification Report:
              precision    recall  f1-score   support

           0       0.31      0.43      0.36     17455
           1       0.84      0.75      0.79     67287

    accuracy                           0.69     84742
   macro avg       0.57      0.59      0.58     84742
weighted avg       0.73      0.69      0.70     84742



In [13]:
# Save the trained model
# Write each trained model to its own pickle file (baseline first, then fair).
# Pickle files can execute code when loaded, so only load these from a trusted source.
for target_path, trained in (
    ('../models/xgboost_model.pkl', model),
    ('../models/xgboost_model_fair.pkl', model_fair),
):
    with open(target_path, 'wb') as handle:
        pickle.dump(trained, handle)