## Neural Network study of the Bankruptcy Modeling

Kudryavtsev O., Yazici M.

In [1]:
import warnings
warnings.filterwarnings('ignore')


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# load data
data = pd.read_csv('data.csv')
data

Unnamed: 0,X1,X2,X3,X4,X5,inactive_active
0,0.505452,0.037576,0.056000,1.424141,0.750397,0
1,0.523732,0.038025,0.072910,1.379251,0.849812,0
2,0.449485,0.023832,0.021829,1.696345,0.981098,0
3,0.370960,0.046000,0.070648,1.444869,0.984495,0
4,0.391547,-0.158851,0.029688,0.722070,1.179211,0
...,...,...,...,...,...,...
56410,0.582957,-0.019048,0.046115,0.215173,2.361905,0
56411,0.923624,-0.144440,-0.141252,11.325387,0.327095,1
56412,0.778861,-0.101432,-0.081823,5.433620,0.631230,1
56413,0.597173,0.020475,0.054032,2.228618,0.774742,1


In [3]:
count_1 = data['inactive_active'].sum()
count_0 = len(data['inactive_active']) - count_1
print('The count and ratio of 1 (active firms) in tha data')
print('count: ',count_1,'rate: ', count_1/(count_1+count_0), 'total data :', count_1+count_0)

The count and ratio of 1 (active firms) in tha data
count:  44205 rate:  0.7835681999468227 total data : 56415


In [4]:
print('The count and ratio of 0 (inactive firms) in tha data')
print('count: ',count_0,'rate: ', count_0/(count_1+count_0), 'total data :', count_1+count_0)

The count and ratio of 0 (inactive firms) in tha data
count:  12210 rate:  0.21643180005317736 total data : 56415


## PART 1: The Statistics

In [5]:
import scipy.stats as stats

# Convert data
X = data.drop(['inactive_active'],axis=1)
y = data['inactive_active']

# The one-way ANOVA tests: 
# The null hypothesis that two or more groups have the same population mean.
fvalue, pvalue = stats.f_oneway(X['X1'], X['X2'], X['X3'], X['X4'], X['X5'])
print(fvalue, pvalue)

if pvalue<0.05:
    print(
    "p-value: {}, The null hypothesis is rejected. There is a difference between at least two variables.".format(
        pvalue))
else:
    print(
    "p-value: {}, The null hypothesis is accepted. There are not any differences among the means of variables.".format(
        pvalue))

11.886063269889062 1.1766834907332443e-09
p-value: 1.1766834907332443e-09, The null hypothesis is rejected. There is a difference between at least two variables.


In [6]:
# // The Test of Normality //
# The creating a function called normality()
# The null hypothesis that the input data is not from a normal distribution.
def normality(x):
    k2, pvalue = stats.normaltest(x)
    alpha = 1e-3
    
    if pvalue < alpha: # null hypothesis: x comes from a normal distribution
        return('the input data is from a normal distribution')
    else:
        return('the input data is not from a normal distribution') 
    
print(normality(X['X1']))
print(normality(X['X2']))
print(normality(X['X3']))
print(normality(X['X4']))
print(normality(X['X5']))

the input data is from a normal distribution
the input data is from a normal distribution
the input data is from a normal distribution
the input data is from a normal distribution
the input data is from a normal distribution


In [7]:
from scipy.stats import levene

# // Test of Homogeneity of Variances // 
# The leneve's test is used instead of Bartlett’s test
# because our data is from a normal distribution.
# The null hypothesis that all input samples are from populations with equal variances.
stat, p = levene(X['X1'], X['X2'], X['X3'], X['X4'], X['X5'])

print(stat, p)

if pvalue<0.05:
    print(
    "p-value: {}, The null hypothesis is rejected. Not all input samples are from populations with equal variances.".format(
        pvalue))
else:
    print(
    "p-value: {}, The null hypothesis is accepted. All input samples are from populations with equal variances.".format(
        pvalue))

5.261200887484931 0.0003103579423549121
p-value: 1.1766834907332443e-09, The null hypothesis is rejected. Not all input samples are from populations with equal variances.


In [8]:
# Tamhane’s T2 all-pairs comparison test for normally distributed data with unequal variances. 
# Tamhane’s T2 test can be performed for all-pairs comparisons in an one-factorial layout with 
# normally distributed residuals but unequal groups variances. 
# A total of m = k(k-1)/2 hypotheses can be tested. 
# The null hypothesis is tested in the two-tailed test against the alternative hypothesis 

import scikit_posthocs as sp

x = pd.DataFrame({"X1": X['X1'], "X2": X['X2'], "X3": X['X3'], "X4": X['X4'],"X5": X['X5'], })
x = x.melt(var_name='groups', value_name='values')
sp.posthoc_tamhane(x, val_col='values', group_col='groups')

Unnamed: 0,X1,X2,X3,X4,X5
X1,1.0,0.101612,0.0,8.160139e-14,0.0
X2,0.1016125,1.0,0.565293,0.0001215299,0.05478299
X3,0.0,0.565293,1.0,0.0,0.0
X4,8.160139e-14,0.000122,0.0,1.0,4.746425e-11
X5,0.0,0.054783,0.0,4.746425e-11,1.0


## PART 2: The Neural Net training

In [9]:
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense

#initial values for the index of splitted data
# n = X.shape[0] = 56415
# The train and test data parts of the data are as follows
# Train: X.loc[a:b] and Test: X.loc[c:d] 
# The lenght of the train data is 4 times of the test data.
n=56415
a=0; b=(4*n/5)-1; c=4*n/5 ; d=n-1

fold_no = 1
loss_per_fold, fn_per_fold, fp_per_fold = [], [], []
tn_per_fold, tp_per_fold, pr_per_fold, roc_per_fold = [], [], [], []

model= [0,1,2,3,4]; history= [0,1,2,3,4]
for i in range(5):
    for j in range(1,6):
        X_train, X_test, y_train, y_test = [], [], [], []
        X_train, X_test = X.loc[a:b], X.loc[c:d]
        y_train, y_test = y.loc[a:b], y.loc[c:d]
        
        count_tn_test, count_tp_test = 0, 0
        for i in y_test:
            if i==0:
                count_tn_test+=1
                
            if i==1:
                count_tp_test+=1
        
        print('The count of inactive firms in the test data:', count_tn_test)
        print('The count of active firms in the test data:', count_tp_test)

        # Analyze class imbalance in the targets
        # 0 and 1 mean inactive, active firms respectively.
        counts_1 = y_train.sum()
        counts_0 = len(y_train) - counts_1

        # The weighting for the imlanabce
        weight_for_0 = 1.0 / counts_0
        weight_for_1 = 1.0 / counts_1

        # Normalize the data using training set statistics
        mean = np.mean(X_train, axis=0)
        X_train -= mean
        X_test -= mean
        std = np.std(X_train, axis=0)
        X_train /= std
        X_test /= std

        # Build a binary classification model
        model[i] = Sequential()
        model[i].add(keras.Input(shape=(5,)))
        model[i].add(Dense(1, use_bias=True, activation="relu")) 
        #model[i].add(Dense(3, use_bias=True, activation="relu")) 
        model[i].add(Dense(1, activation="sigmoid")) 
        

        # define the keras model
        metrics = [
            keras.metrics.TrueNegatives(name="tn"),
            keras.metrics.TruePositives(name="tp"),
            #keras.metrics.FalseNegatives(name="fn"),
            #keras.metrics.FalsePositives(name="fp"),
            #keras.metrics.Precision(name="pr"),
            #keras.metrics.Recall(name="rc"),
            #keras.metrics.AUC(name="auc")
        ]

        # compile the keras model
        model[i].compile(
            optimizer=keras.optimizers.SGD(), 
            loss=keras.losses.BinaryCrossentropy(from_logits=True),
            metrics=metrics, run_eagerly=True
        )

        callbacks = [keras.callbacks.EarlyStopping(monitor='val_tn', mode='max', patience=50, restore_best_weights=True)]
        class_weight = {0: weight_for_0, 1: weight_for_1}

        # Generate a print
        print('------------------------------------------------------------------------')
        print(f'Training for fold {fold_no} ...')

        # fit the keras model on the dataset
        model[i].fit(X_train, y_train, epochs=10000, 
                   callbacks=[callbacks],validation_data=(X_test, y_test),
                      class_weight=class_weight)
        
        # Generate generalization metrics
        scores_test = model[i].evaluate(X_test, y_test, verbose=0)
        scores_train = model[i].evaluate(X_train, y_train, verbose=0)
        print(f'Score for fold {fold_no}: {model[i].metrics_names[0]} of {scores_test[0]}; {model[i].metrics_names[1]} of {scores_test[1]}; {model[i].metrics_names[2]} of {scores_test[2]}%')
        print(f'Score for fold {fold_no}: {model[i].metrics_names[0]} of {scores_train[0]}; {model[i].metrics_names[1]} of {scores_train[1]}; {model[i].metrics_names[2]} of {scores_train[2]}%')

        
        loss_per_fold.append(scores_test[0])
        tn_per_fold.append(scores_test[1])
        tp_per_fold.append(scores_test[2])
        loss_per_fold.append(scores_train[0])
        tn_per_fold.append(scores_train[1])
        tp_per_fold.append(scores_train[2])
        #auc_per_fold.append(scores[3])
        #fn_per_fold.append(scores[3])
        #fp_per_fold.append(scores[4])
        #pr_per_fold.append(scores[5])
        #roc_per_fold.append(scores[6])

        # Updating on border of parts of the data

        a+= n/5; b+= n/5; c+= n/5; d+= n/5 

        if a>=n:
            a=a-n
        if b>=n:
            b=b-n
        if c>=n:
            c=c-n
        if d>=n:
            d=d-n
        print('a=', a, 'b=',b,'c=',c,'d=',d)    
        
        # Increase fold number
        fold_no = fold_no + 1


The count of inactive firms in the test data: 2401
The count of active firms in the test data: 8882
------------------------------------------------------------------------
Training for fold 1 ...
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Epoch 39/10000
Epoch 40/10000
Epoch 41/10000
Epoch 42/10000
Epoch 43/10000
Epoch 44/10000
Epoch 45/10000
Epoch 46/10000
Epoch 47/10000
Epoch 48/10000
Epoch 49/10000
Epoch 50/10000
Epoch 51/10000
Score for fold 1: loss of 0.6956763863563538; t

Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Epoch 39/10000
Epoch 40/10000
Epoch 41/10000
Epoch 42/10000
Epoch 43/10000
Epoch 44/10000
Epoch 45/10000
Epoch 46/10000
Epoch 47/10000
Epoch 48/10000
Epoch 49/10000
Epoch 50/10000
Epoch 51/10000
Score for fold 2: loss of 0.7210500836372375; tn of 1137.0; tp of 4873.0%
Score for fold 2: loss of 0.7266331315040588; tn of 6031.0; tp of 16346.0%
a= 22566.0 b= 11282.0 c= 11283.0 d= 22565.0
The count of inactive firms in the test data: 2172
The count of active firms in the test data: 9111
------------------------------------------------------------------------
Training for fold 3 ...
Epoch 1/10000


ValueError: Unexpected result of `train_function` (Empty logs). Please use `Model.compile(..., run_eagerly=True)`, or `tf.config.run_functions_eagerly(True)` for more information of where went wrong, or file a issue/bug to `tf.keras`.

In [11]:
weights = []
for layer in model[0].layers:
    weights.append(layer.get_weights())
    
weights

[[array([[-0.49870253, -0.34028733, -0.27374184],
         [ 0.79215086, -0.431607  , -0.09532931],
         [ 1.4825865 ,  0.5765368 ,  0.24960464],
         [-0.4809401 ,  0.07675144, -2.1606798 ],
         [-0.46204314,  0.68306607,  0.5899975 ]], dtype=float32),
  array([-0.05728413, -1.0670689 , -0.27597493], dtype=float32)],
 [array([[ 0.5370146],
         [ 0.3890563],
         [-0.2800261]], dtype=float32),
  array([-0.16184498], dtype=float32)]]

In [12]:
weights = []
for layer in model[1].layers:
    weights.append(layer.get_weights())
    
weights


[[array([[-0.50863326,  0.05512453, -0.07973937],
         [-0.60945797, -0.63594097, -0.5271471 ],
         [ 1.5030128 ,  0.45033693,  0.29434103],
         [-0.08043813, -0.40192685, -0.32424206],
         [-0.46885785, -0.05854674,  0.15191257]], dtype=float32),
  array([-0.02235349, -0.12361064, -0.7375796 ], dtype=float32)],
 [array([[ 0.46709156],
         [-0.00100073],
         [ 0.04784242]], dtype=float32),
  array([-0.16520314], dtype=float32)]]

In [13]:
weights = []
for layer in model[2].layers:
    weights.append(layer.get_weights())
    
weights

[[array([[-3.6298841e-02, -3.1758866e-01, -7.2755534e-03],
         [-1.8320817e-01, -1.9358918e-01,  1.5903609e-03],
         [ 2.5006363e-01,  1.6751556e+00,  6.7165536e-01],
         [-4.0036520e-01, -3.1600782e-01, -1.1875194e-02],
         [-5.2829665e-01, -3.9048678e-01, -7.6732837e-02]], dtype=float32),
  array([-0.45049477, -0.00852251, -0.15321407], dtype=float32)],
 [array([[ 0.17760336],
         [ 0.5252656 ],
         [-0.67364   ]], dtype=float32),
  array([-0.13926245], dtype=float32)]]

In [15]:
weights = []
for layer in model[3].layers:
    weights.append(layer.get_weights())
    
weights

[[array([[ 0.1922297 , -0.65864605, -0.03421036],
         [-0.12225977,  1.1038945 ,  0.53465796],
         [ 0.16285181, -0.3169435 , -0.1571946 ],
         [-0.22447477, -0.24603991, -0.7175949 ],
         [-0.9024451 , -0.88144696, -0.6796323 ]], dtype=float32),
  array([-0.5493661 ,  0.12501346, -0.37942746], dtype=float32)],
 [array([[ 0.003272  ],
         [ 0.3544913 ],
         [-0.25608635]], dtype=float32),
  array([-0.22150855], dtype=float32)]]

In [16]:
weights = []
for layer in model[4].layers:
    weights.append(layer.get_weights())
    
weights

[[array([[ 0.4809681 , -0.02593469, -0.07389922],
         [ 0.08352622,  1.2002393 ,  0.23962002],
         [ 0.15392269,  0.00218992,  0.42190522],
         [-0.04256022,  0.48634708,  0.49245352],
         [-0.00445289, -0.12019757,  0.03835237]], dtype=float32),
  array([ 0.67079276, -0.26240855, -0.8451711 ], dtype=float32)],
 [array([[-0.29074916],
         [-0.38122767],
         [ 0.78662133]], dtype=float32),
  array([0.18905252], dtype=float32)]]