In [43]:
import json
from collections.abc import *
import numpy as np
import pandas as pd
from sklearn.metrics import (
    average_precision_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from predictors import LightGBMClassifier, XGBoostClassifier

hparam_xgb_1 = "params/all_data/params_tuned_xgb_1.json"
hparam_lgb_1 = "params/all_data/params_tuned_lgbm_1.json"
hparam_xgb_2 = "params/dropped_data/params_tuned_xgb_1.json"
hparam_lgb_2 = "params/dropped_data/params_tuned_lgbm_1.json"


In [44]:
clean_data = pd.read_csv("clean_data.csv")
clean_data_targets = clean_data.pop("bankruptcy?")
X_train, X_test, y_train, y_test = train_test_split(
    clean_data,
    clean_data_targets,
    test_size=0.3,
    random_state=121,
    stratify=clean_data_targets,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test, test_size=0.67, random_state=121, stratify=y_test
)
clean_data = {}
clean_data["train"] = [X_train, y_train]
clean_data["val"] = [X_val, y_val]
clean_data["test"] = [X_test, y_test]


In [45]:
clean_data_dropped = pd.read_csv("clean_data_dropped.csv")
clean_data_dropped_targets = clean_data_dropped.pop("bankruptcy?")
X_train, X_test, y_train, y_test = train_test_split(
    clean_data_dropped,
    clean_data_dropped_targets,
    test_size=0.3,
    random_state=121,
    stratify=clean_data_dropped_targets,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test, test_size=0.67, random_state=121, stratify=y_test
)
clean_data_dropped = {}
clean_data_dropped["train"] = [X_train, y_train]
clean_data_dropped["val"] = [X_val, y_val]
clean_data_dropped["test"] = [X_test, y_test]


In [46]:
### Undroppped Cases
with open(hparam_xgb_1, "r") as f:
    hparams = json.load(f)
clf_1 = XGBoostClassifier(params=hparams, data=clean_data)
clf_1.fit(None, None)


[0]	train-aucpr:0.74005	validation-aucpr:0.38708
[9]	train-aucpr:0.94620	validation-aucpr:0.66106


In [47]:
print("+++++" * 10)
print(f" Training Metrics ".center(50))
print(clf_1.score(X=clean_data["train"][0], y=clean_data["train"][-1]))
print("+++++" * 10)
print(f" Validation Metrics ".center(50))
print(clf_1.score(X=clean_data["val"][0], y=clean_data["val"][-1]))
print("+++++" * 10)
print(f" Testdata Metrics ".center(50))
print(clf_1.score(X=clean_data["test"][0], y=clean_data["test"][-1]))
print("+++++" * 10)


++++++++++++++++++++++++++++++++++++++++++++++++++
                 Training Metrics                 
{'precision': 0.6236933797909407, 'recall': 0.9781420765027322, 'f1_score': 0.7617021276595745, 'auc_roc': 0.9966431196897423, 'accuracy': 0.9705095010312897, 'auc_pr': 0.9483308193441204, 'threshold': 0.50357145, 'tn': 21041, 'fp': 648, 'fn': 24, 'tp': 1074}
++++++++++++++++++++++++++++++++++++++++++++++++++
                Validation Metrics                
{'precision': 0.4476987447698745, 'recall': 0.6903225806451613, 'f1_score': 0.5431472081218274, 'auc_roc': 0.9365240804821355, 'accuracy': 0.9441340782122905, 'auc_pr': 0.6617283874246815, 'threshold': 0.50357145, 'tn': 2935, 'fp': 132, 'fn': 48, 'tp': 107}
++++++++++++++++++++++++++++++++++++++++++++++++++
                 Testdata Metrics                 
{'precision': 0.4161490683229814, 'recall': 0.638095238095238, 'f1_score': 0.5037593984962406, 'auc_roc': 0.9279639270488524, 'accuracy': 0.9394865525672371, 'auc_pr': 0.608598

In [48]:
### Undroppped Cases
with open(hparam_lgb_1, "r") as f:
    hparams = json.load(f)
clf_2 = LightGBMClassifier(params=hparams, data=clean_data)
clf_2.fit(None, None)


[LightGBM] [Info] Number of positive: 1098, number of negative: 21689
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16320
[LightGBM] [Info] Number of data points in the train set: 22787, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.048185 -> initscore=-2.983315
[LightGBM] [Info] Start training from score -2.983315


In [49]:
from pprint import pprint
print("+++++" * 10)
pprint(f" Training Metrics ".center(50))
print(clf_2.score(X=clean_data["train"][0], y=clean_data["train"][-1]))
print("+++++" * 10)
pprint(f" Validation Metrics ".center(50))
print(clf_2.score(X=clean_data["val"][0], y=clean_data["val"][-1]))
print("+++++" * 10)
pprint(f" Testdata Metrics ".center(50))
print(clf_2.score(X=clean_data["test"][0], y=clean_data["test"][-1]))
print("+++++" * 10)
# Using CLF_2 for reporting all the metrics in writeup


++++++++++++++++++++++++++++++++++++++++++++++++++
'                 Training Metrics                 '
{'precision': 0.9891794409377818, 'recall': 0.9990892531876139, 'f1_score': 0.9941096511101043, 'auc_roc': 0.9992679886667472, 'auc_pr': 0.9883224335861542, 'accuracy': 0.999429499275903, 'threshold': 0.7751202946958485, 'tn': 21677, 'fp': 12, 'fn': 1, 'tp': 1097}
++++++++++++++++++++++++++++++++++++++++++++++++++
'                Validation Metrics                '
{'precision': 0.8857142857142857, 'recall': 0.6, 'f1_score': 0.7153846153846153, 'auc_roc': 0.7980436909031627, 'auc_pr': 0.5506712778221158, 'accuracy': 0.9770328988206083, 'threshold': 0.7751202946958485, 'tn': 3055, 'fp': 12, 'fn': 62, 'tp': 93}
++++++++++++++++++++++++++++++++++++++++++++++++++
'                 Testdata Metrics                 '
{'precision': 0.8732394366197183, 'recall': 0.5904761904761905, 'f1_score': 0.7045454545454547, 'auc_roc': 0.7930708131703477, 'auc_pr': 0.5353398098452165, 'accuracy': 0.976

In [16]:
lookup_table = "lookup_table.json"
with open(lookup_table, "r") as f:
    lookup = json.load(f)
importances = clf_2.booster.feature_importance()
feature_names = clean_data["train"][0].columns.to_list()
feature_importances = pd.DataFrame(
    {"feature": feature_names, "importance": importances}
)
feature_importances = feature_importances.sort_values(by="importance", ascending=False)
feature_importances["feature_name"] = feature_importances["feature"].map(lookup)


In [17]:
(feature_importances[:10]).values
np.save("top_10_gain_split", (feature_importances[:10]).values)


In [25]:
import shap

explainer = shap.KernelExplainer(
    clf_2.predict_proba, clean_data["train"][0].iloc[1], link="logit"
)
shap_values = explainer.shap_values(clean_data["test"][0].iloc[1])


In [30]:
# # create the Explanation object
# expl = shap.Explanation(
#     explainer.expected_value,  # the expected value
#     shap_values[1],  # the SHAP values for the positive class
#     feature_names=clean_data["test"][0].columns.values.tolist(),
# )

# # create the force plot
# fig = shap.force_plot(
#     explainer.expected_value,
#     expl,
#     clean_data["test"][0].iloc[1],
#     link="logit",
# )

# # show the plot
# plt.show(fig)


In [31]:
### Dropped Cases
with open(hparam_xgb_2, "r") as f:
    hparams = json.load(f)
clf_3 = XGBoostClassifier(params=hparams, data=clean_data_dropped)
clf_3.fit(None, None)


[0]	train-aucpr:0.49261	validation-aucpr:0.28779
[10]	train-aucpr:0.72683	validation-aucpr:0.35008


In [32]:
print("+++++" * 10)
print(f" Training Metrics ".center(50))
print(clf_3.score(X=clean_data_dropped["train"][0], y=clean_data_dropped["train"][-1]))
print("+++++" * 10)
print(f" Validation Metrics ".center(50))
print(clf_3.score(X=clean_data_dropped["val"][0], y=clean_data_dropped["val"][-1]))
print("+++++" * 10)
print(f" Testdata Metrics ".center(50))
print(clf_3.score(X=clean_data_dropped["test"][0], y=clean_data_dropped["test"][-1]))
print("+++++" * 10)


++++++++++++++++++++++++++++++++++++++++++++++++++
                 Training Metrics                 
{'precision': 0.21111734980348118, 'recall': 0.9567430025445293, 'f1_score': 0.34590616375344985, 'auc_roc': 0.9848074797977235, 'accuracy': 0.9168129168129168, 'auc_pr': 0.7422041670946449, 'threshold': 0.51107675, 'tn': 15296, 'fp': 1405, 'fn': 17, 'tp': 376}
++++++++++++++++++++++++++++++++++++++++++++++++++
                Validation Metrics                
{'precision': 0.145748987854251, 'recall': 0.6545454545454545, 'f1_score': 0.23841059602649003, 'auc_roc': 0.9169232545608498, 'accuracy': 0.9048407116259827, 'auc_pr': 0.3630735267388531, 'threshold': 0.51107675, 'tn': 2151, 'fp': 211, 'fn': 19, 'tp': 36}
++++++++++++++++++++++++++++++++++++++++++++++++++
                 Testdata Metrics                 
{'precision': 0.1594488188976378, 'recall': 0.7168141592920354, 'f1_score': 0.2608695652173913, 'auc_roc': 0.9299792827744477, 'accuracy': 0.9065173116089613, 'auc_pr': 0.4455

In [33]:
### Dropped Cases
with open(hparam_lgb_2, "r") as f:
    hparams = json.load(f)
clf_4 = LightGBMClassifier(params=hparams, data=clean_data_dropped)
clf_4.fit(None, None)


[LightGBM] [Info] Number of positive: 393, number of negative: 16701
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16065
[LightGBM] [Info] Number of data points in the train set: 17094, number of used features: 63
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.022991 -> initscore=-3.749414
[LightGBM] [Info] Start training from score -3.749414


In [34]:
print("+++++" * 10)
print(f" Training Metrics ".center(50))
print(clf_4.score(X=clean_data_dropped["train"][0], y=clean_data_dropped["train"][-1]))
print("+++++" * 10)
print(f" Validation Metrics ".center(50))
print(clf_4.score(X=clean_data_dropped["val"][0], y=clean_data_dropped["val"][-1]))
print("+++++" * 10)
print(f" Testdata Metrics ".center(50))
print(clf_4.score(X=clean_data_dropped["test"][0], y=clean_data_dropped["test"][-1]))
print("+++++" * 10)


++++++++++++++++++++++++++++++++++++++++++++++++++
                 Training Metrics                 
{'precision': 0.9333333333333333, 'recall': 0.9974554707379135, 'f1_score': 0.9643296432964329, 'auc_roc': 0.9978894622116609, 'auc_pr': 0.931016939413886, 'accuracy': 0.9983034983034983, 'threshold': 0.7599967838292062, 'tn': 16673, 'fp': 28, 'fn': 1, 'tp': 392}
++++++++++++++++++++++++++++++++++++++++++++++++++
                Validation Metrics                
{'precision': 0.8235294117647058, 'recall': 0.509090909090909, 'f1_score': 0.6292134831460674, 'auc_roc': 0.7532753444692479, 'auc_pr': 0.4304222098814325, 'accuracy': 0.9863467107985106, 'threshold': 0.7599967838292062, 'tn': 2356, 'fp': 6, 'fn': 27, 'tp': 28}
++++++++++++++++++++++++++++++++++++++++++++++++++
                 Testdata Metrics                 
{'precision': 0.7352941176470589, 'recall': 0.4424778761061947, 'f1_score': 0.5524861878453039, 'auc_roc': 0.7193627654452174, 'auc_pr': 0.33818233671999165, 'accuracy'

### Other Models explored
all on Dropped data


In [35]:
from sklearn.metrics import (
    average_precision_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)


def score(X, y, clf):
    y_pred_proba = clf.predict_proba(X)[:, 1]
    fpr, tpr, thresholds = roc_curve(y, y_pred_proba)
    threshold = thresholds[np.argmax(tpr - fpr)]
    auc_roc = roc_auc_score(y, y_pred_proba)
    auc_pr = average_precision_score(y, y_pred_proba)
    # use threshold value to predict the class labels,
    y_pred = (y_pred_proba > threshold).astype(int)
    precision = precision_score(y, y_pred)
    recall = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred)

    metrics_dict = {
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "auc_roc": auc_roc,
        "auc_pr": auc_pr,
        "threshold": threshold,
    }
    return metrics_dict


In [36]:
# from sklearn.svm import SVC
# from pprint import pprint

# clf_5 = SVC(kernel="rbf", probability=True, class_weight="balanced", random_state=121)
# clf_5.fit(clean_data_dropped["train"][0], clean_data_dropped["train"][-1])
# pprint(
#     score(X=clean_data_dropped["val"][0], y=clean_data_dropped["val"][-1], clf=clf_5)
# )


### Smote

In [37]:
clean_data_dropped = pd.read_csv("clean_data_dropped.csv")
clean_data_dropped_targets = clean_data_dropped.pop("bankruptcy?")
X = clean_data_dropped.values
y = clean_data_dropped_targets.values
from imblearn.combine import SMOTETomek
from collections import Counter

print(Counter(y))
smt = SMOTETomek(random_state=121, n_jobs=-1)
X_res, y_res = smt.fit_resample(X, y)


Counter({0: 23860, 1: 561})


In [38]:
X_train, X_test, y_train, y_test = train_test_split(
    X_res,
    y_res,
    test_size=0.3,
    random_state=121,
    stratify=y_res,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test, test_size=0.67, random_state=121, stratify=y_test
)
clean_data_dropped_smt = {}
clean_data_dropped_smt["train"] = [X_train, y_train]
clean_data_dropped_smt["val"] = [X_val, y_val]
clean_data_dropped_smt["test"] = [X_test, y_test]


In [39]:
with open(hparam_xgb_1, "r") as f:
    hparams = json.load(f)
clf_6 = XGBoostClassifier(params=hparams, data=clean_data_dropped_smt)
clf_6.fit(None, None)


[0]	train-aucpr:0.96107	validation-aucpr:0.94154
[10]	train-aucpr:0.98542	validation-aucpr:0.97199


In [40]:
print("+++++" * 10)
print(f" Training Metrics ".center(50))
print(
    clf_6.score(
        X=clean_data_dropped_smt["train"][0], y=clean_data_dropped_smt["train"][-1]
    )
)
print("+++++" * 10)
print(f" Validation Metrics ".center(50))
print(
    clf_6.score(X=clean_data_dropped_smt["val"][0], y=clean_data_dropped_smt["val"][-1])
)
print("+++++" * 10)
print(f" Testdata Metrics ".center(50))
print(
    clf_6.score(
        X=clean_data_dropped_smt["test"][0], y=clean_data_dropped_smt["test"][-1]
    )
)
print("+++++" * 10)


++++++++++++++++++++++++++++++++++++++++++++++++++
                 Training Metrics                 
{'precision': 0.9449264883104362, 'recall': 0.9565694766377943, 'f1_score': 0.95071233707184, 'auc_roc': 0.9876072149987062, 'accuracy': 0.9504086861046724, 'auc_pr': 0.9853465880580294, 'threshold': 0.5240496, 'tn': 15480, 'fp': 914, 'fn': 712, 'tp': 15682}
++++++++++++++++++++++++++++++++++++++++++++++++++
                Validation Metrics                
{'precision': 0.9222972972972973, 'recall': 0.9417852522639069, 'f1_score': 0.9319394068700662, 'auc_roc': 0.9775591476942733, 'accuracy': 0.931205520810869, 'auc_pr': 0.9719291448615959, 'threshold': 0.5240496, 'tn': 2134, 'fp': 184, 'fn': 135, 'tp': 2184}
++++++++++++++++++++++++++++++++++++++++++++++++++
                 Testdata Metrics                 
{'precision': 0.9263752353064213, 'recall': 0.940939026981092, 'f1_score': 0.9336003372681282, 'auc_roc': 0.9790291362725736, 'accuracy': 0.9330855018587361, 'auc_pr': 0.9756088

In [41]:
clean_data = pd.read_csv("clean_data.csv")


In [None]:
import os
import seaborn as sns


def make_faceted_plots(df):
    if not os.path.exists("faceted_plots"):
        os.mkdir("faceted_plots")

    # loop through all columns and pick 8 columns
    num_cols = df.shape[1]
    for i in range(num_cols):
        if i + 8 < num_cols:
            cols = df.columns[i : i + 8]
            cols = cols.tolist() + ["bankruptcy?"]
            sns.pairplot(df[cols], hue="bankruptcy?", corner=True)
            plot = sns.pairplot(df[cols], corner=True)
            plot.savefig(f"faceted_plots/facet_{i}.png")


In [None]:
# make_faceted_plots(clean_data)
