# Black Box Model

In [16]:
# Add parent directory to Python path to import src modules
import  sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('__file__'))))

## XGBoost

For a black box surrogate model, we use an XGBoost model:

In [17]:
from xgboost import XGBClassifier

First get the data:

In [18]:
from src.preprocessing import get_data

df_oh, prob, predictions, true_labels = get_data()

In [19]:
df_oh.head()

Unnamed: 0,Pct_afro_american,annual_inc,avg_cur_bal,bc_open_to_buy,bc_util,delinq_2yrs,dti,fico_range_high,funded_amnt,inq_last_6mths,...,emp_length_10+ years,emp_length_2 years,emp_length_3 years,emp_length_4 years,emp_length_5 years,emp_length_6 years,emp_length_7 years,emp_length_8 years,emp_length_9 years,emp_length_< 1 year
0,-0.4572,-0.740512,-0.745082,0.749479,-1.586997,-0.370384,-1.880644,2.10613,-1.180039,1.439967,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.261925,-0.470965,-0.250508,0.416937,-0.242629,-0.370384,0.530276,1.258044,1.180387,-0.69472,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.444418,4.254853,2.390279,0.248935,0.226833,-0.370384,0.019227,1.766896,1.281169,0.372623,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,-0.522693,0.841762,1.367198,-0.426078,1.147974,-0.370384,-0.633779,0.749193,-0.416216,0.372623,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.484517,-0.15591,0.630499,-0.342077,0.497129,0.729341,0.561034,-0.946979,1.238734,0.372623,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [20]:
rename_columns = {"emp_length_< 1 year": "emp_length_less_1_year"}
df_oh = df_oh.rename(columns=rename_columns)

Split the data

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_oh, true_labels, test_size=0.2, random_state=42)

Train the model

In [53]:
parameter_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'scale_pos_weight': [1, 2, 3.5, 5],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0]
}

In [50]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, f1_score

In [54]:
xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric="logloss",
    n_jobs=-1,
    random_state=42
)

In [56]:
# Stratified CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define scoring metric (F1 for imbalanced classes)
scoring = make_scorer(f1_score)


random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=parameter_grid,
    n_iter=50,                # number of random combinations to try
    scoring=scoring,
    cv=cv,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train, y_train)

print("Best params:", random_search.best_params_)
print("Best F1:", random_search.best_score_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


KeyboardInterrupt: 

In [None]:
model = XGBClassifier(
    n_estimators=100,
    max_depth=7,
    learning_rate=0.1,
    n_jobs=-1,
    eval_metric='logloss',
    use_label_encoder=False,
    scale_pos_weight=3.5
)
model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [46]:
predictions = model.predict(X_test)

In [47]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.87      0.68      0.77    171711
           1       0.34      0.63      0.45     45537

    accuracy                           0.67    217248
   macro avg       0.61      0.66      0.61    217248
weighted avg       0.76      0.67      0.70    217248

