In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn import metrics
import imblearn
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import fbeta_score, make_scorer
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler

In [2]:
train = pd.read_csv("FDIC_train_new")
val = pd.read_csv("FDIC_val_new")
test = pd.read_csv("FDIC_test_new")

In [3]:
train['Failure'].value_counts()

0    8512
1     326
Name: Failure, dtype: int64

In [4]:
val['Failure'].value_counts()

0    2091
1      66
Name: Failure, dtype: int64

In [5]:
test['Failure'].value_counts()

0    2708
1      70
Name: Failure, dtype: int64

In [6]:
print(val.shape, test.shape)

(2157, 90) (2778, 90)


In [7]:
train.drop(['Unnamed: 0','total_assets_1', 'total_assets_2', 'total_assets_3', 'total_assets_4', 
           'max_total_assets_2','max_total_assets_3','max_total_assets_4'], axis=1, inplace=True)
val.drop(['Unnamed: 0','total_assets_1', 'total_assets_2', 'total_assets_3', 'total_assets_4', 
           'max_total_assets_2','max_total_assets_3','max_total_assets_4'], axis=1, inplace=True)
test.drop(['Unnamed: 0','total_assets_1', 'total_assets_2', 'total_assets_3', 'total_assets_4', 
           'max_total_assets_2','max_total_assets_3','max_total_assets_4'], axis=1, inplace=True)

In [8]:
y_train = train['Failure']
X_train = train.drop('Failure', axis=1)
y_val = val['Failure']
X_val = val.drop('Failure', axis=1)
y_test = test['Failure']
X_test = test.drop('Failure', axis=1)

In [9]:
X_train['log_max_total_assets'] = np.log1p(X_train['max_total_assets_1'])
X_train.drop('max_total_assets_1',axis=1,inplace=True)
X_val['log_max_total_assets'] = np.log1p(X_val['max_total_assets_1'])
X_val.drop('max_total_assets_1',axis=1,inplace=True)
X_test['log_max_total_assets'] = np.log1p(X_test['max_total_assets_1'])
X_test.drop('max_total_assets_1',axis=1,inplace=True)

In [10]:
def feature_generate(df, col_name_list):
    for col_name in col_name_list:
        df[col_name+'_change_1'] = df[col_name+'_2'] - df[col_name+'_1']
        df[col_name+'_change_2'] = df[col_name+'_3'] - df[col_name+'_2']
        df[col_name+'_change_3'] = df[col_name+'_4'] - df[col_name+'_3']
    return df

In [11]:
col_name_list = ['log_TA', 'NI_to_TA', 'Equity_to_TA', 'NPL_to_TL',
       'REO_to_TA', 'ALLL_to_TL', 'core_deposits_to_TA',
       'brokered_deposits_to_TA', 'liquid_assets_to_TA',
       'loss_provision_to_TL', 'ROA', 'NIM', 'assets_growth',
       'term_spread', 'stock_mkt_growth', 'real_gdp_growth',
       'unemployment_rate_change', 'treasury_yield_3m', 'bbb_spread']

In [12]:
X_train = feature_generate(X_train, col_name_list)
X_val = feature_generate(X_val, col_name_list)
X_test = feature_generate(X_test, col_name_list)

In [13]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(8838, 135)
(2157, 135)
(2778, 135)


In [14]:
print(y_train.shape)

(8838,)


In [15]:
train_data = pd.concat([X_train, X_val])
train_label = pd.concat([y_train, y_val])

In [16]:
print(train_label.shape)

(10995,)


#### Logistic Regression

In [17]:
lr_search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid={'C': [0.0001,0.001,0.01, 0.1, 1]}, 
                   scoring='roc_auc')

lr_search.fit(X_train, y_train)

GridSearchCV(estimator=LogisticRegression(max_iter=1000),
             param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1]},
             scoring='roc_auc')

In [18]:
print('best roc_auc score: ', lr_search.best_score_)
print('best parameters: ', lr_search.best_params_)

best roc_auc score:  0.951530908624018
best parameters:  {'C': 0.1}


In [19]:
lr = LogisticRegression(C=0.1,max_iter=1000)
lr.fit(train_data,train_label)

LogisticRegression(C=0.1, max_iter=1000)

In [20]:
lr_pred = lr.predict(X_test)
lr_pred_proba = pd.DataFrame(lr.predict_proba(X_test))[1]

In [21]:
threshold_list = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
def print_report(Y_true, model_pred_proba):
    for i in threshold_list:
        print ('\n******** For i = {} ******'.format(i))
        Y_test_pred = model_pred_proba.apply(lambda x: 1 if x>i else 0)
        print('Precision: ',metrics.precision_score(Y_true,Y_test_pred))
        print('Recall: ',metrics.recall_score(Y_true, Y_test_pred))
        print('F2_score: ', metrics.fbeta_score(Y_true, Y_test_pred,beta=2))
        print('ROC_AUC_Score: ',metrics.roc_auc_score(Y_true, model_pred_proba))

In [22]:
print_report(y_test,lr_pred_proba)


******** For i = 0.1 ******
Precision:  0.10534351145038168
Recall:  0.9857142857142858
F2_score:  0.3689839572192513
ROC_AUC_Score:  0.9234754167545896

******** For i = 0.2 ******
Precision:  0.13008130081300814
Recall:  0.9142857142857143
F2_score:  0.4145077720207255
ROC_AUC_Score:  0.9234754167545896

******** For i = 0.3 ******
Precision:  0.15748031496062992
Recall:  0.8571428571428571
F2_score:  0.453857791225416
ROC_AUC_Score:  0.9234754167545896

******** For i = 0.4 ******
Precision:  0.16356877323420074
Recall:  0.6285714285714286
F2_score:  0.40072859744990896
ROC_AUC_Score:  0.9234754167545896

******** For i = 0.5 ******
Precision:  0.15577889447236182
Recall:  0.44285714285714284
F2_score:  0.3235908141962422
ROC_AUC_Score:  0.9234754167545896

******** For i = 0.6 ******
Precision:  0.16417910447761194
Recall:  0.3142857142857143
F2_score:  0.26570048309178745
ROC_AUC_Score:  0.9234754167545896

******** For i = 0.7 ******
Precision:  0.16470588235294117
Recall:  0.2


In [23]:
for coef, column in sorted(zip(lr.coef_.ravel(), train_data.columns),key=lambda x: abs(x[0]), reverse=True):
    print(coef, column)

-1.088481690999683 Equity_to_TA_4
-0.6068158957891905 Equity_to_TA_3
-0.508544040057473 log_max_total_assets
-0.48725153842209223 treasury_yield_3m_4
-0.4816657952104987 Equity_to_TA_change_3
-0.4536721915431829 Equity_to_TA_2
-0.37869132370308056 treasury_yield_3m_3
-0.36224384835495776 term_spread_1
-0.31249538531891374 treasury_yield_3m_2
-0.3091878875967473 Equity_to_TA_1
0.2934714855566843 unemployment_rate_change_change_2
0.2865946433833895 bbb_spread_1
0.236859724199774 unemployment_rate_change_3
0.21658538872773883 term_spread_change_1
0.1975519749073387 bbb_spread_change_4
-0.19296222983079828 bbb_spread_change_1
-0.1919195842813695 term_spread_4
0.18163449825801406 core_deposits_to_TA_change_1
0.17676886825085036 log_TA_2
0.17093148449315332 brokered_deposits_to_TA_2
0.17027194018368222 brokered_deposits_to_TA_change_1
0.16998104630201494 log_TA_3
-0.16959867591890465 liquid_assets_to_TA_change_1
0.1681809440420293 log_TA_4
0.16761650302419612 real_gdp_growth_1
-0.16544610120

#### RandomForest

In [24]:
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 5)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 100, num = 5)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]


random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

{'n_estimators': [100, 325, 550, 775, 1000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 32, 55, 77, 100, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [25]:
rf = RandomForestClassifier()

rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1, scoring='roc_auc')

rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 12.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 24.9min finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 32, 55, 77, 100,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 325, 550, 775,
                                                         1000]},
                   random_state=42, scoring='roc_auc', verbose=2)

In [26]:
rf_random.best_params_

{'n_estimators': 550,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 100,
 'bootstrap': False}

In [27]:
rf_final = RandomForestClassifier(n_estimators=775, min_samples_split=5, min_samples_leaf=4, 
                                  max_features='auto', max_depth=10, bootstrap=False)

In [28]:
rf_final.fit(train_data, train_label)

RandomForestClassifier(bootstrap=False, max_depth=10, min_samples_leaf=4,
                       min_samples_split=5, n_estimators=775)

In [29]:
rf_pred_test = rf_final.predict(X_test)
rf_pred_proba_test = pd.DataFrame(rf_final.predict_proba(X_test))[1]

In [30]:
print("Report for Random Forest Final Model on Test Data")
print_report(y_test,rf_pred_proba_test)

Report for Random Forest Final Model on Test Data

******** For i = 0.1 ******
Precision:  0.08518518518518518
Recall:  0.9857142857142858
F2_score:  0.3165137614678899
ROC_AUC_Score:  0.9327759020890483

******** For i = 0.2 ******
Precision:  0.12035398230088495
Recall:  0.9714285714285714
F2_score:  0.4023668639053255
ROC_AUC_Score:  0.9327759020890483

******** For i = 0.3 ******
Precision:  0.15346534653465346
Recall:  0.8857142857142857
F2_score:  0.4532163742690058
ROC_AUC_Score:  0.9327759020890483

******** For i = 0.4 ******
Precision:  0.18421052631578946
Recall:  0.7
F2_score:  0.44871794871794873
ROC_AUC_Score:  0.9327759020890483

******** For i = 0.5 ******
Precision:  0.19858156028368795
Recall:  0.4
F2_score:  0.332541567695962
ROC_AUC_Score:  0.9327759020890483

******** For i = 0.6 ******
Precision:  0.020833333333333332
Recall:  0.014285714285714285
F2_score:  0.01524390243902439
ROC_AUC_Score:  0.9327759020890483

******** For i = 0.7 ******
Precision:  0.0
Recall:

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
for coef, column in sorted(zip(rf_final.feature_importances_, X_train.columns),key=lambda x: abs(x[0]), reverse=True):
    print(coef, column)

0.10155945523430723 Equity_to_TA_4
0.05687504378996016 Equity_to_TA_3
0.030686310448525572 Equity_to_TA_2
0.030065531034050352 ROA_4
0.027341142229312308 NI_to_TA_4
0.027282755942275354 NPL_to_TL_4
0.023764522454107113 NPL_to_TL_3
0.0215219467044593 treasury_yield_3m_4
0.018699084088355692 Equity_to_TA_1
0.017689406344817818 ALLL_to_TL_4
0.017118001133351955 treasury_yield_3m_3
0.01409248945919774 treasury_yield_3m_2
0.013533585701933687 NPL_to_TL_2
0.013505467501027615 ALLL_to_TL_3
0.01252618423531105 ROA_2
0.012335141607271956 ALLL_to_TL_2
0.012325296259076402 loss_provision_to_TL_4
0.011892232700722191 NPL_to_TL_1
0.011850503001459857 NI_to_TA_3
0.01113384076964865 NI_to_TA_2
0.011097601479439117 NI_to_TA_change_3
0.01099141864324462 Equity_to_TA_change_3
0.010953785861873563 ROA_3
0.010932646059833672 treasury_yield_3m_1
0.010881914999764496 ROA_change_3
0.010293693619529585 NIM_4
0.008663387978803007 NIM_change_3
0.008659438989987334 core_deposits_to_TA_change_3
0.0083260450819820

#### Gradient Boost

In [32]:
gb_final = GradientBoostingClassifier(n_estimators=50,
                                    learning_rate=0.1)

gb_final.fit(train_data, train_label)

GradientBoostingClassifier(n_estimators=50)

In [33]:
gb_pred_test = gb_final.predict(X_test)
gb_pred_proba_test = pd.DataFrame(gb_final.predict_proba(X_test))[1]

print("Report for Gradient Boosting Final Model on Test Data")
print_report(y_test,gb_pred_proba_test)

Report for Gradient Boosting Final Model on Test Data

******** For i = 0.1 ******
Precision:  0.11837455830388692
Recall:  0.9571428571428572
F2_score:  0.3959810874704492
ROC_AUC_Score:  0.9251846381093058

******** For i = 0.2 ******
Precision:  0.14732142857142858
Recall:  0.9428571428571428
F2_score:  0.45329670329670335
ROC_AUC_Score:  0.9251846381093058

******** For i = 0.3 ******
Precision:  0.15466666666666667
Recall:  0.8285714285714286
F2_score:  0.4427480916030535
ROC_AUC_Score:  0.9251846381093058

******** For i = 0.4 ******
Precision:  0.1625
Recall:  0.7428571428571429
F2_score:  0.4333333333333334
ROC_AUC_Score:  0.9251846381093058

******** For i = 0.5 ******
Precision:  0.17054263565891473
Recall:  0.6285714285714286
F2_score:  0.40892193308550184
ROC_AUC_Score:  0.9251846381093058

******** For i = 0.6 ******
Precision:  0.1717171717171717
Recall:  0.4857142857142857
F2_score:  0.35564853556485354
ROC_AUC_Score:  0.9251846381093058

******** For i = 0.7 ******
Prec

#### Ensemble - Stacking

In [34]:
estimators = [
        ('rf_clf', RandomForestClassifier(n_estimators=200, random_state=42)),
        ('lr_clf', LogisticRegression(max_iter=200))
        ]

stacking_final = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression(max_iter=200)
)

stacking_final.fit(train_data, train_label)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

StackingClassifier(estimators=[('rf_clf',
                                RandomForestClassifier(n_estimators=200,
                                                       random_state=42)),
                               ('lr_clf', LogisticRegression(max_iter=200))],
                   final_estimator=LogisticRegression(max_iter=200))

In [35]:
stacking_test_proba = pd.DataFrame(stacking_final.predict_proba(X_test))[1]

print("Report for Final Stacking Model on Test Data")
print_report(y_test,stacking_test_proba)

Report for Final Stacking Model on Test Data

******** For i = 0.1 ******
Precision:  0.13883299798792756
Recall:  0.9857142857142858
F2_score:  0.44401544401544396
ROC_AUC_Score:  0.9406414855454737

******** For i = 0.2 ******
Precision:  0.16986301369863013
Recall:  0.8857142857142857
F2_score:  0.48062015503875966
ROC_AUC_Score:  0.9406414855454737

******** For i = 0.3 ******
Precision:  0.19215686274509805
Recall:  0.7
F2_score:  0.4579439252336448
ROC_AUC_Score:  0.9406414855454737

******** For i = 0.4 ******
Precision:  0.19672131147540983
Recall:  0.5142857142857142
F2_score:  0.3887688984881209
ROC_AUC_Score:  0.9406414855454737

******** For i = 0.5 ******
Precision:  0.22764227642276422
Recall:  0.4
F2_score:  0.3473945409429281
ROC_AUC_Score:  0.9406414855454737

******** For i = 0.6 ******
Precision:  0.1875
Recall:  0.17142857142857143
F2_score:  0.1744186046511628
ROC_AUC_Score:  0.9406414855454737

******** For i = 0.7 ******
Precision:  0.04
Recall:  0.01428571428571

  _warn_prf(average, modifier, msg_start, len(result))
