In [7]:

import os
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import norm
from collections import Counter
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action="ignore")

# Modeling part

import time

from sklearn.manifold import TSNE
import matplotlib.patches as mpatches
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split,RandomizedSearchCV

In [9]:
# Location of the company-bankruptcy CSV that the rest of the notebook works on.
DATA_URL = 'https://raw.githubusercontent.com/benvictoria17/MachineLearning/master/dataset/CompanyBankruptcy.csv'

# Load the raw table and preview its first rows.
bank_data = pd.read_csv(DATA_URL)
bank_data.head()

Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,1,0.370594,0.424389,0.40575,0.601457,0.601457,0.998969,0.796887,0.808809,0.302646,...,0.716845,0.009219,0.622879,0.601453,0.82789,0.290202,0.026601,0.56405,1,0.016469
1,1,0.464291,0.538214,0.51673,0.610235,0.610235,0.998946,0.79738,0.809301,0.303556,...,0.795297,0.008323,0.623652,0.610237,0.839969,0.283846,0.264577,0.570175,1,0.020794
2,1,0.426071,0.499019,0.472295,0.60145,0.601364,0.998857,0.796403,0.808388,0.302035,...,0.77467,0.040003,0.623841,0.601449,0.836774,0.290189,0.026555,0.563706,1,0.016474
3,1,0.399844,0.451265,0.457733,0.583541,0.583541,0.9987,0.796967,0.808966,0.30335,...,0.739555,0.003252,0.622929,0.583538,0.834697,0.281721,0.026697,0.564663,1,0.023982
4,1,0.465022,0.538432,0.522298,0.598783,0.598783,0.998973,0.797366,0.809304,0.303475,...,0.795016,0.003878,0.623521,0.598782,0.839973,0.278514,0.024752,0.575617,1,0.03549


In [10]:
# Summarise the frame: index range, column names, non-null counts and dtypes.
bank_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6819 entries, 0 to 6818
Data columns (total 96 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   Bankrupt?                                                 6819 non-null   int64  
 1    ROA(C) before interest and depreciation before interest  6819 non-null   float64
 2    ROA(A) before interest and % after tax                   6819 non-null   float64
 3    ROA(B) before interest and depreciation after tax        6819 non-null   float64
 4    Operating Gross Margin                                   6819 non-null   float64
 5    Realized Sales Gross Margin                              6819 non-null   float64
 6    Operating Profit Rate                                    6819 non-null   float64
 7    Pre-tax net Interest Rate                                6819 non-null   float64
 8    After-tax net Int

In [11]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
bank_data.describe()

Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
count,6819.0,6819.0,6819.0,6819.0,6819.0,6819.0,6819.0,6819.0,6819.0,6819.0,...,6819.0,6819.0,6819.0,6819.0,6819.0,6819.0,6819.0,6819.0,6819.0,6819.0
mean,0.032263,0.50518,0.558625,0.553589,0.607948,0.607929,0.998755,0.79719,0.809084,0.303623,...,0.80776,18629420.0,0.623915,0.607946,0.840402,0.280365,0.027541,0.565358,1.0,0.047578
std,0.17671,0.060686,0.06562,0.061595,0.016934,0.016916,0.01301,0.012869,0.013601,0.011163,...,0.040332,376450100.0,0.01229,0.016934,0.014523,0.014463,0.015668,0.013214,0.0,0.050014
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,0.0,0.476527,0.535543,0.527277,0.600445,0.600434,0.998969,0.797386,0.809312,0.303466,...,0.79675,0.0009036205,0.623636,0.600443,0.840115,0.276944,0.026791,0.565158,1.0,0.024477
50%,0.0,0.502706,0.559802,0.552278,0.605997,0.605976,0.999022,0.797464,0.809375,0.303525,...,0.810619,0.002085213,0.623879,0.605998,0.841179,0.278778,0.026808,0.565252,1.0,0.033798
75%,0.0,0.535563,0.589157,0.584105,0.613914,0.613842,0.999095,0.797579,0.809469,0.303585,...,0.826455,0.005269777,0.624168,0.613913,0.842357,0.281449,0.026913,0.565725,1.0,0.052838
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,9820000000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
# Names of the columns that contain at least one missing value.
# Returned as a plain list so the cell displays the result itself, instead of
# printing from inside a comprehension and echoing a list of None values.
bank_data.columns[bank_data.isna().any()].tolist()

[]

In [13]:
# Number of rows that repeat an earlier row exactly.
bank_data.duplicated().sum()

0

In [14]:
# Absolute class counts first, then each class's share of all rows in percent.
class_counts = bank_data['Bankrupt?'].value_counts()
n_rows = len(bank_data)

print(class_counts)
print('-'* 30)
for caption, label in (('Financially stable: ', 0), ('Financially unstable: ', 1)):
    share = round(class_counts[label] / n_rows * 100, 2)
    print(caption, share, '% of the dataset')

0    6599
1     220
Name: Bankrupt?, dtype: int64
------------------------------
Financially stable:  96.77 % of the dataset
Financially unstable:  3.23 % of the dataset


In [16]:
# Keep features and target apart. Leaving 'Bankrupt?' inside X would hand the
# label itself to any model trained on original_Xtrain (target leakage).
X = bank_data.drop(columns='Bankrupt?')
y = bank_data['Bankrupt?']

# Class shares of the full dataset, for comparison with the split below.
class_counts = y.value_counts()
for caption, label in (('Financially stable: ', 0), ('Financially unstable: ', 1)):
    print(caption, round(class_counts[label] / len(bank_data) * 100, 2), '% of the dataset')
print('-'*45)

# Unshuffled 5-fold stratified splitter; every fold keeps the class ratio of y.
sss = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

# Only a single train/test partition is used afterwards: the last of the five
# folds (the same one the previous per-fold reassignment ended up with).
train_index, test_index = list(sss.split(X, y))[-1]
original_Xtrain, original_Xtest = X.iloc[train_index], X.iloc[test_index]
original_ytrain, original_ytest = y.iloc[train_index], y.iloc[test_index]

# Turn into an array
original_Xtrain = original_Xtrain.values
original_Xtest = original_Xtest.values
original_ytrain = original_ytrain.values
original_ytest = original_ytest.values

# See if both the train and test label distribution are similarly distributed
train_unique_label, train_counts_label = np.unique(original_ytrain, return_counts=True)
test_unique_label, test_counts_label = np.unique(original_ytest, return_counts=True)
print('-' * 45)

print('Label Distributions: \n')
print(train_counts_label/ len(original_ytrain))
print(test_counts_label/ len(original_ytest))

Financially stable:  96.77 % of the dataset
Financially unstable:  3.23 % of the dataset
---------------------------------------------
---------------------------------------------
Label Distributions: 

[0.96774194 0.03225806]
[0.96771827 0.03228173]


In [17]:
# Balance the classes by random undersampling: keep every bankrupt company and
# pair it with an equally sized random subset of the stable ones.

# Shuffle with a fixed seed so the same stable rows are picked on every run.
df = bank_data.sample(frac=1, random_state=42)

unstable_df = df.loc[df['Bankrupt?'] == 1]
# Match the minority-class size rather than hard-coding its row count.
stable_df = df.loc[df['Bankrupt?'] == 0].iloc[:len(unstable_df)]

normal_distributed_df = pd.concat([unstable_df, stable_df])

# Shuffle dataframe rows
new_df = normal_distributed_df.sample(frac=1, random_state=42)

new_df.head()

Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
74,0,0.525033,0.566398,0.576369,0.61933,0.61933,0.999066,0.797491,0.809392,0.303501,...,0.814383,0.001745,0.623694,0.619326,0.841555,0.279908,0.026945,0.565798,1,0.028754
1358,1,0.370497,0.410488,0.41635,0.595158,0.595353,0.998879,0.796695,0.808675,0.302498,...,0.712005,0.009892,0.623,0.595153,0.83136,0.282969,0.026709,0.564736,1,0.021912
3664,0,0.572564,0.622383,0.619359,0.620764,0.619582,0.999201,0.797637,0.809521,0.303471,...,0.844344,0.000724,0.624278,0.620762,0.84296,0.276761,0.026794,0.565171,1,0.056783
6631,0,0.486764,0.549989,0.537341,0.60697,0.60697,0.998969,0.797412,0.809334,0.303563,...,0.805756,0.000742,0.623835,0.606972,0.840899,0.280682,0.026961,0.565856,1,0.026363
2092,0,0.555989,0.591529,0.597088,0.648438,0.648438,0.999118,0.797568,0.809438,0.303525,...,0.825225,0.000286,0.623813,0.648433,0.841629,0.275886,0.026813,0.565262,1,0.090564


In [None]:

# Bar chart of the class counts in the undersampled frame.
sns.set_context("paper")

plt.figure(figsize = (10,5))
# Name the column through x=/data= keywords: recent seaborn releases take
# `data` as the first positional argument, so a bare Series passed positionally
# is no longer read as the x variable.
sns.countplot(x='Bankrupt?', data=new_df)
plt.title('Class Distributions \n (0: Fin. Stable || 1: Fin. Unstable)', fontsize=14)
plt.show()