In [1]:
#import necessary libraries
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Read Bankruptcy data 

In [2]:
bk_df=pd.read_csv('Bankruptcy_data_Final.csv')

In [None]:
bk_df.shape

In [None]:
bk_df.head()

In [None]:
bk_df.tail()

In [None]:
bk_df.info()


In [None]:
bk_df.describe().transpose()

In [None]:
bk_df.dtypes

## Missing Values


In [None]:
bk_df.isna().sum()

## Visualisations

In [None]:
import sweetviz as sv
report = sv.analyze(bk_df, target_feat='BK')
report.show_html('employee.html')

In [None]:
from pandas_profiling import ProfileReport
profile = ProfileReport(bk_df)
profile

## Data for ML Models: Test Train Split

In [3]:
X = bk_df.drop(['BK'], axis=1)
y = bk_df['BK']

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.1, random_state=666)

In [5]:
X_train.shape

(83584, 12)

In [6]:
X_test.shape

(9288, 12)

## Data Processing Pipeline

## Resample: Adaptive Synthetic Sampling Approach for Imbalanced Learning

In [None]:
from imblearn.over_sampling import ADASYN

X_resampled_ad, y_resampled_ad = ADASYN(random_state=0).fit_resample(X_train_transformed, y_train)

X_resampled_ad.shape
y_resampled_ad.shape
np.bincount(y_resampled_ad)

X_train_transformed = X_resampled_ad
y_train = y_resampled_ad

## Resample: Under Sample

In [7]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=0)
X_resampled_us, y_resampled_us = rus.fit_resample(X_train, y_train)

X_resampled_us.shape
y_resampled_us.shape
np.bincount(y_resampled_us)

X_train = X_resampled_us
y_train = y_resampled_us

## Resample: Over Sample

In [None]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=0)

X_resampled_os, y_resampled_os = ros.fit_resample(X_train, y_train)
X_resampled_os.shape
y_resampled_os.shape
np.bincount(y_resampled_os)

## Resample: SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

X_resampled_sm, y_resampled_sm = SMOTE(random_state=0).fit_resample(X_train_transformed, y_train)

X_resampled_sm.shape
y_resampled_sm.shape
np.bincount(y_resampled_sm)
X_train_transformed = X_resampled_sm
y_train = y_resampled_sm

## Resample: No change in Data

In [None]:
X_train.shape
y_train.shape

# Pipeline setup

In [8]:
# numeric features
numeric_features = X_train.select_dtypes(include='number').columns.tolist()
print(numeric_features)

['EPS', 'Liquidity', 'Profitability', 'Productivity', 'Leverage Ratio', 'Asset Turnover', 'Operational Margin', 'Return on Equity', 'Market Book Ratio', 'Assets Growth', 'Sales Growth', 'Employee Growth']


In [9]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, FunctionTransformer, RobustScaler
from sklearn.pipeline import Pipeline

## Simple Imputer: Mean
## Scale: MinMaxScaler

In [13]:
# build pipeline for numeric features
numeric_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', RobustScaler())])

In [15]:
# fit numeric pipeline
numeric_pipeline.fit_transform(X_train.select_dtypes(include='number'))

array([[ 0.02630328, -0.46082414, -1.        , ..., -0.27770701,
        -0.07032349, -0.05604921],
       [ 0.09602282,  0.86244922, -1.5434418 , ...,  0.87922379,
         2.23107205,  0.84921729],
       [-0.09412138,  0.60708067,  0.32170413, ..., -0.07133758,
         0.03375527, -0.0861244 ],
       ...,
       [ 0.16003803,  0.22634939,  0.32103321, ...,  0.65987261,
         0.36005626, -1.0786056 ],
       [ 0.09665663, -7.9315148 , -5.74807112, ..., -1.86242038,
        -1.00703235,  0.84921729],
       [-0.47948027, -1.48926291, -1.58705133, ..., -1.3477707 ,
        -1.16455696,  0.84921729]])

In [16]:
# we can now use data_pipeline to transform X_train and X_test
X_train_transformed = numeric_pipeline.transform(X_train)
X_test_transformed = numeric_pipeline.transform(X_test)

In [17]:
X_train_transformed.shape, X_test_transformed.shape


((1004, 12), (9288, 12))

## Feature Selection

In [None]:
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(0.1));
sel = sel.fit(X_train_transformed);

X_train_transformed = sel.transform(X_train_transformed)
X_test_transformed = sel.transform(X_test_transformed)

X_test_transformed.shape

In [None]:
X_train_transformed.shape
X_train_transformed

## Logistic Regression

In [44]:
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(random_state=666,class_weight="balanced",max_iter=10000,C=0.01, penalty="l1", solver="liblinear")
lr_clf.fit(X_train_transformed, y_train)
#{'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}

LogisticRegression(C=0.01, class_weight='balanced', max_iter=10000,
                   penalty='l1', random_state=666, solver='liblinear')

In [45]:
#Predict on X_test
y_pred_lr = lr_clf.predict(X_test_transformed)
y_prob_lr = lr_clf.predict_proba(X_test_transformed)

In [46]:
lr_clf.coef_

array([[ 0.00152109, -0.00019106,  0.00060329,  0.00141035,  0.00107799,
         0.        ,  0.0006603 , -0.02204907, -0.00103465, -0.03421574,
         0.00352285,  0.        ]])

In [47]:
lr_clf.intercept_

array([0.])

In [42]:
from sklearn.model_selection import GridSearchCV
solvers = ["liblinear"] #,'newton-cg', 'lbfgs', "sag",'liblinear',
penalty = ['l1','l2','elasticnet'] #,'l1','elasticnet''l2','none'
c_values = [100, 10, 1.0, 0.1, 0.01]
grid = dict(solver=solvers,penalty=penalty,C=c_values)


In [43]:
grid_search = GridSearchCV(estimator=lr_clf, param_grid=grid, n_jobs=-1, cv=10, scoring='f1_weighted',error_score=0)
grid_result = grid_search.fit(X_train_transformed, y_train)


In [35]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.752465 using {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}
0.647693 (0.046520) with: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
0.596969 (0.082848) with: {'C': 100, 'penalty': 'l1', 'solver': 'saga'}
0.647693 (0.046520) with: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}
0.595973 (0.082100) with: {'C': 10, 'penalty': 'l1', 'solver': 'saga'}
0.649791 (0.044828) with: {'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear'}
0.595973 (0.082100) with: {'C': 1.0, 'penalty': 'l1', 'solver': 'saga'}
0.653035 (0.046468) with: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
0.598953 (0.083651) with: {'C': 0.1, 'penalty': 'l1', 'solver': 'saga'}
0.752465 (0.050079) with: {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}
0.587158 (0.083851) with: {'C': 0.01, 'penalty': 'l1', 'solver': 'saga'}


In [None]:
# Best: 0.671202 using {'C': 0.01, 'penalty': 'l2', 'solver': 'newton-cg'}
#Best: 0.752465 using {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}


## Random Forest

In [71]:
from sklearn.ensemble import RandomForestClassifier

clf_rf = RandomForestClassifier(
    n_estimators=100, max_depth=20, max_leaf_nodes=50, min_samples_split=2, random_state=666,class_weight="balanced",criterion='entropy',max_features="auto")
clf_rf.fit(X_train_transformed, y_train)

RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       max_depth=20, max_leaf_nodes=50, random_state=666)

In [72]:
clf_rf.feature_importances_

array([0.1120127 , 0.06893735, 0.06684539, 0.07143834, 0.07553918,
       0.05452884, 0.09378742, 0.20642315, 0.1032444 , 0.06775119,
       0.04578559, 0.03370646])

In [None]:
# values = sorted(zip(feature_names, clf_rf.feature_importances_), key=lambda x: x[1] * -1)
# values

In [74]:
y_pred_rf = clf_rf.predict(X_test_transformed)
y_prob_rf = clf_rf.predict_proba(X_test_transformed)

In [68]:
from sklearn.model_selection import GridSearchCV
n_estimatorss = [50,100,200,300,400,500,700,900,1000,1200]
criterions = ['gini','entropy'] #,'l1','elasticnet''l2','none'
max_depths = [1,2,5,10,20,None]
max_featuress = ["auto","sqrt","log2"]
max_leaf_nodess = [1,2,5,10,20,30,50,100,200]
grid = dict(n_estimators=n_estimatorss,criterion=criterions,max_depth=max_depths,max_features=max_featuress,max_leaf_nodes=max_leaf_nodess)


In [69]:
grid_search = GridSearchCV(estimator=clf_rf, param_grid=grid, n_jobs=-1, cv=10, scoring='f1_weighted',error_score=0)
grid_result = grid_search.fit(X_train_transformed, y_train)


In [70]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.866478 using {'criterion': 'entropy', 'max_depth': 20, 'max_features': 'auto', 'max_leaf_nodes': 50, 'n_estimators': 100}
0.000000 (0.000000) with: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'auto', 'max_leaf_nodes': 1, 'n_estimators': 50}
0.000000 (0.000000) with: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'auto', 'max_leaf_nodes': 1, 'n_estimators': 100}
0.000000 (0.000000) with: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'auto', 'max_leaf_nodes': 1, 'n_estimators': 200}
0.000000 (0.000000) with: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'auto', 'max_leaf_nodes': 1, 'n_estimators': 300}
0.000000 (0.000000) with: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'auto', 'max_leaf_nodes': 1, 'n_estimators': 400}
0.000000 (0.000000) with: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'auto', 'max_leaf_nodes': 1, 'n_estimators': 500}
0.000000 (0.000000) with: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'auto', 'ma

0.851372 (0.027741) with: {'criterion': 'gini', 'max_depth': 30, 'max_features': 'log2', 'max_leaf_nodes': 100, 'n_estimators': 900}
0.853360 (0.027609) with: {'criterion': 'gini', 'max_depth': 30, 'max_features': 'log2', 'max_leaf_nodes': 100, 'n_estimators': 1000}
0.854382 (0.025722) with: {'criterion': 'gini', 'max_depth': 30, 'max_features': 'log2', 'max_leaf_nodes': 100, 'n_estimators': 1200}
0.849529 (0.025995) with: {'criterion': 'gini', 'max_depth': 30, 'max_features': 'log2', 'max_leaf_nodes': 200, 'n_estimators': 50}
0.851506 (0.023678) with: {'criterion': 'gini', 'max_depth': 30, 'max_features': 'log2', 'max_leaf_nodes': 200, 'n_estimators': 100}
0.853395 (0.027128) with: {'criterion': 'gini', 'max_depth': 30, 'max_features': 'log2', 'max_leaf_nodes': 200, 'n_estimators': 200}
0.856396 (0.027921) with: {'criterion': 'gini', 'max_depth': 30, 'max_features': 'log2', 'max_leaf_nodes': 200, 'n_estimators': 300}
0.853386 (0.028984) with: {'criterion': 'gini', 'max_depth': 30, 'ma

0.000000 (0.000000) with: {'criterion': 'entropy', 'max_depth': 2, 'max_features': 'log2', 'max_leaf_nodes': 1, 'n_estimators': 900}
0.000000 (0.000000) with: {'criterion': 'entropy', 'max_depth': 2, 'max_features': 'log2', 'max_leaf_nodes': 1, 'n_estimators': 1000}
0.000000 (0.000000) with: {'criterion': 'entropy', 'max_depth': 2, 'max_features': 'log2', 'max_leaf_nodes': 1, 'n_estimators': 1200}
0.777895 (0.027995) with: {'criterion': 'entropy', 'max_depth': 2, 'max_features': 'log2', 'max_leaf_nodes': 2, 'n_estimators': 50}
0.779959 (0.022744) with: {'criterion': 'entropy', 'max_depth': 2, 'max_features': 'log2', 'max_leaf_nodes': 2, 'n_estimators': 100}
0.776977 (0.024175) with: {'criterion': 'entropy', 'max_depth': 2, 'max_features': 'log2', 'max_leaf_nodes': 2, 'n_estimators': 200}
0.780987 (0.024252) with: {'criterion': 'entropy', 'max_depth': 2, 'max_features': 'log2', 'max_leaf_nodes': 2, 'n_estimators': 300}
0.779989 (0.024066) with: {'criterion': 'entropy', 'max_depth': 2, '

0.779007 (0.023342) with: {'criterion': 'entropy', 'max_depth': 30, 'max_features': 'sqrt', 'max_leaf_nodes': 2, 'n_estimators': 700}
0.774809 (0.025338) with: {'criterion': 'entropy', 'max_depth': 30, 'max_features': 'sqrt', 'max_leaf_nodes': 2, 'n_estimators': 900}
0.773711 (0.023100) with: {'criterion': 'entropy', 'max_depth': 30, 'max_features': 'sqrt', 'max_leaf_nodes': 2, 'n_estimators': 1000}
0.774743 (0.023171) with: {'criterion': 'entropy', 'max_depth': 30, 'max_features': 'sqrt', 'max_leaf_nodes': 2, 'n_estimators': 1200}
0.811433 (0.023221) with: {'criterion': 'entropy', 'max_depth': 30, 'max_features': 'sqrt', 'max_leaf_nodes': 5, 'n_estimators': 50}
0.811458 (0.022202) with: {'criterion': 'entropy', 'max_depth': 30, 'max_features': 'sqrt', 'max_leaf_nodes': 5, 'n_estimators': 100}
0.810511 (0.026011) with: {'criterion': 'entropy', 'max_depth': 30, 'max_features': 'sqrt', 'max_leaf_nodes': 5, 'n_estimators': 200}
0.808491 (0.025119) with: {'criterion': 'entropy', 'max_depth

In [None]:
# Best: 0.851501 using {'criterion': 'entropy', 'max_depth': 10, 'max_features': 'auto', 'max_leaf_nodes': 25, 'n_estimators': 100}

# Best: 0.866478 using {'criterion': 'entropy', 'max_depth': 20, 'max_features': 'auto', 'max_leaf_nodes': 50, 'n_estimators': 100}

## Naive Bayes

In [75]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb = gnb.fit(X_train_transformed, y_train)


In [76]:
#Predict on X_Test
y_pred_nb = gnb.predict(X_test_transformed)


In [77]:
y_prob_nb = gnb.predict_proba(X_test_transformed)

In [78]:
from sklearn.model_selection import GridSearchCV
grid = dict( 'var_smoothing':np.logspace(0,-9, num=100))


SyntaxError: invalid syntax (Temp/ipykernel_14644/124123615.py, line 2)

In [None]:
grid_search = GridSearchCV(estimator=gnb, param_grid=grid, n_jobs=-1, cv=10, scoring='f1_weighted',error_score=0)
grid_result = grid_search.fit(X_train_transformed, y_train)


In [None]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

## Model Parameters

In [None]:
gnb.theta_ # Mean of each feature per class
gnb.sigma_ # Variance of each feature per class

## Model Performance

In [79]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred_lr)


array([[7101, 2131],
       [  20,   36]], dtype=int64)

In [80]:
confusion_matrix(y_test, y_pred_rf)


array([[7690, 1542],
       [   9,   47]], dtype=int64)

In [81]:
confusion_matrix(y_test, y_pred_nb)

array([[8880,  352],
       [  40,   16]], dtype=int64)

In [82]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_nb))

              precision    recall  f1-score   support

           0       1.00      0.77      0.87      9232
           1       0.02      0.64      0.03        56

    accuracy                           0.77      9288
   macro avg       0.51      0.71      0.45      9288
weighted avg       0.99      0.77      0.86      9288

              precision    recall  f1-score   support

           0       1.00      0.83      0.91      9232
           1       0.03      0.84      0.06        56

    accuracy                           0.83      9288
   macro avg       0.51      0.84      0.48      9288
weighted avg       0.99      0.83      0.90      9288

              precision    recall  f1-score   support

           0       1.00      0.96      0.98      9232
           1       0.04      0.29      0.08        56

    accuracy                           0.96      9288
   macro avg       0.52      0.62      0.53      9288
weighted avg       0.99      0.96      0.97      9288



In [83]:
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, log_loss

print("Accuracy = {:.2f}".format(accuracy_score(y_test, y_pred_lr)))
print("Kappa = {:.2f}".format(cohen_kappa_score(y_test, y_pred_lr)))
print("F1 Score = {:.2f}".format(f1_score(y_test, y_pred_lr)))
print("Log Loss = {:.2f}".format(log_loss(y_test, y_pred_lr)))


print("Accuracy = {:.2f}".format(accuracy_score(y_test, y_pred_rf)))
print("Kappa = {:.2f}".format(cohen_kappa_score(y_test, y_pred_rf)))
print("F1 Score = {:.2f}".format(f1_score(y_test, y_pred_rf)))
print("Log Loss = {:.2f}".format(log_loss(y_test, y_pred_rf)))


print("Accuracy = {:.2f}".format(accuracy_score(y_test, y_pred_nb)))
print("Kappa = {:.2f}".format(cohen_kappa_score(y_test, y_pred_nb)))
print("F1 Score = {:.2f}".format(f1_score(y_test, y_pred_nb)))
print("Log Loss = {:.2f}".format(log_loss(y_test, y_pred_nb)))

Accuracy = 0.77
Kappa = 0.02
F1 Score = 0.03
Log Loss = 8.00
Accuracy = 0.83
Kappa = 0.05
F1 Score = 0.06
Log Loss = 5.77
Accuracy = 0.96
Kappa = 0.07
F1 Score = 0.08
Log Loss = 1.46


In [84]:
# AUC plot
from sklearn.metrics import plot_roc_curve, roc_auc_score, f1_score
# plot_roc_curve(y_test,y_prob_lr[:,1])


In [85]:
roc_auc_score(y_test,y_prob_lr[:,1])


0.750907170710572

In [86]:
f1_score(y_test,y_pred_lr)

0.032388663967611336

In [87]:
# AUC plot
# plot_roc_curve(y_test,y_prob_rf[:,1])

In [88]:
roc_auc_score(y_test,y_prob_rf[:,1])

0.921337274077742

In [89]:
f1_score(y_test,y_pred_rf)

0.05714285714285715

In [None]:
# AUC plot
# plot_roc_curve(y_test,y_prob_nb[:,1])

In [90]:
roc_auc_score(y_test,y_prob_nb[:,1])

0.7849492835479078

In [91]:
f1_score(y_test,y_pred_nb)

0.07547169811320756