In [1]:
# Import libraries
import pandas as pd
import numpy as np
from pathlib import Path
import tensorflow as tf
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

# Prepare Dataset

In [2]:
# Import and clean dataset

df = pd.read_csv(Path("../../Resources/relabeled_data.csv"))
df.dropna(inplace=True)
df

Unnamed: 0,current_assets,cost_of_goods_sold,depreciation_and_amortization,EBITDA,inventory,net_income,total_receivables,market_value,net_sales,total_assets,total_long_term_debt,EBIT,gross_profit,total_current_liabilities,retained_earnings,total_revenue,total_liabilities,total_operationg_expenses,status
0,511.267,833.107,18.373,89.031,336.018,35.163,128.348,372.7519,1024.333,740.998,180.447,70.658,191.226,163.816,201.026,1024.333,401.483,935.302,0
1,485.856,713.811,18.577,64.367,320.590,18.531,115.187,377.1180,874.255,701.854,179.987,45.790,160.444,125.392,204.065,874.255,361.642,809.888,0
2,436.656,526.477,22.496,27.207,286.588,-58.939,77.528,364.5928,638.721,710.199,217.699,4.711,112.244,150.464,139.603,638.721,399.964,611.514,0
3,396.412,496.747,27.172,30.745,259.954,-12.410,66.322,143.3295,606.337,686.621,164.658,3.573,109.590,203.575,124.106,606.337,391.633,575.592,0
4,432.204,523.302,26.680,47.491,247.245,3.504,104.661,308.9071,651.958,709.292,248.666,20.811,128.656,131.261,131.884,651.958,407.608,604.467,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78677,233.211,43.338,14.094,45.615,3.376,25.261,22.846,756.4827,104.223,1099.101,184.666,31.521,60.885,28.197,28.095,104.223,225.887,58.608,0
78678,105.559,59.184,42.592,202.133,2.288,129.688,54.611,527.5750,291.153,1865.926,770.103,159.541,231.969,88.128,157.783,291.153,880.327,89.020,0
78679,63.971,69.074,65.057,79.051,2.581,-1.442,42.467,578.8868,169.858,1746.235,683.985,13.994,100.784,85.765,156.341,169.858,770.233,90.807,0
78680,135.207,66.527,65.330,69.171,2.013,-20.401,27.217,412.6241,161.884,1736.110,694.035,3.841,95.357,82.010,135.941,161.884,776.697,92.713,0


In [3]:
# Set X and y

X = df.drop(columns='status')
y = df['status']
display(X.tail(3))
display(y[-3:])

Unnamed: 0,current_assets,cost_of_goods_sold,depreciation_and_amortization,EBITDA,inventory,net_income,total_receivables,market_value,net_sales,total_assets,total_long_term_debt,EBIT,gross_profit,total_current_liabilities,retained_earnings,total_revenue,total_liabilities,total_operationg_expenses
78679,63.971,69.074,65.057,79.051,2.581,-1.442,42.467,578.8868,169.858,1746.235,683.985,13.994,100.784,85.765,156.341,169.858,770.233,90.807
78680,135.207,66.527,65.33,69.171,2.013,-20.401,27.217,412.6241,161.884,1736.11,694.035,3.841,95.357,82.01,135.941,161.884,776.697,92.713
78681,82.589,68.817,65.201,67.262,2.112,-50.946,45.839,354.1786,160.513,1625.37,632.122,2.061,91.696,79.365,84.995,160.513,712.687,93.251


78679    0
78680    0
78681    0
Name: status, dtype: int64

In [4]:
# Scale X

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [5]:
# Resampling
sampler = SMOTE(random_state=1)
X_resample, y_resample = sampler.fit_resample(X_scaled, y)
display(X_resample.shape)
display(y_resample.shape)

(146924, 18)

(146924,)

# Set instances

In [6]:
### SKLearn MLP Classifier ###
from sklearn.neural_network import MLPClassifier
mlp_model = MLPClassifier()
# first hidden layer = 100
# activation = 'relu'
# optimizer = 'adam'
# epochs = 200

### SKLearn SVC: kernel = rbf ###
from sklearn.svm import SVC
svc_model = SVC(kernel='rbf')


### Tensorflow Keras Sequential ###
nn_model_1 = tf.keras.models.load_model(Path("../../Resources/neural_network_models/four_layers.h5"))
# first hidden layer = 16
# second hidden layer = 8
# third hidden layer = 4
# fourth hidden layer = 2
# hidden layer activation = 'selu'
# output layer activation = 'softplus'
# optimizer = 'nadam'
# epochs = 100

nn_model_2 = tf.keras.models.load_model(Path("../../Resources/neural_network_models/96nodes_4layers_32epochs.h5"))
# first hidden layer = 96
# second hidden layer = 48
# third hidden layer = 24
# fourth hidden layer = 12
# hidden layer activation = 'selu'
# output layer activation = 'softplus'
# optimizer = 'nadam'
# epochs = 32

### ImbLearn Easy Ensemble Classifier ###
from imblearn.ensemble import EasyEnsembleClassifier
eec_model = EasyEnsembleClassifier()

### ImbLearn RUS Boost Classifier ###
from imblearn.ensemble import RUSBoostClassifier
rbc_model = RUSBoostClassifier()

### ImbLearn Balanced Baggig Classifier ###
from imblearn.ensemble import BalancedBaggingClassifier
bbc_model = BalancedBaggingClassifier()

# kFold cross validation

In [7]:
# Create ordered dictionary that stores model instances
from collections import OrderedDict

# X_resample, y_resample
sklearn_dictionary = OrderedDict()
sklearn_dictionary['MLPClassifier'] = mlp_model
sklearn_dictionary['SVC kernel rbf'] = svc_model

# X_resample, y_resample
nn_dictionary = OrderedDict()
nn_dictionary['Neural Network 1'] = nn_model_1
nn_dictionary['Neural Network 2'] = nn_model_2

# X_scaled, y
imblearn_dictionary = OrderedDict()
imblearn_dictionary['Easy Ensemble Classifier'] = eec_model
imblearn_dictionary['RUS Boost Classifier'] = rbc_model
imblearn_dictionary['Balanced Bagging Classifier'] = bbc_model

### Define function to perform KFolds

In [61]:
def apply_prediction(model, X_train, X_test, y_train):

    model.fit(X_train, y_train)
    prediction_array = model.predict(X_test)
    
    return prediction_array

In [89]:
def perform_kfold_split(model, X, y):

    kfold = KFold(n_splits=5)

    for train_index, test_index in kfold.split(X):

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        prediction =  apply_prediction(model, X_train, X_test, y_train)
        return y_test, prediction


In [90]:
def combine_kfold_predictions(models_dict, X, y):

    counter = 0
    prediction_dict = {}

    for key, value in models_dict.items():

        kfold = KFold(n_splits=5)

        for train_index, test_index in kfold.split(X):
            
            counter += 1

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            prediction =  apply_prediction(value, X_train, X_test, y_train)
            
            df = pd.concat([pd.Series(y_test), pd.Series(prediction)], axis=1)
            df.columns = ['test', 'pred']

            prediction_dict[f"{key}_iter{counter}"] = df

        return prediction_dict

In [91]:
# Fast performing machine learning dict
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

lr = LogisticRegression()
rf_classifier = RandomForestClassifier(n_estimators = 5, criterion = 'entropy')

fast_performing_ml_dict = OrderedDict()
fast_performing_ml_dict['Logistic Regression'] = lr
fast_performing_ml_dict['Random Forest Classifier'] = rf_classifier

In [92]:
a = combine_kfold_predictions(fast_performing_ml_dict, X_resample, y_resample)

In [93]:
a

{'Logistic Regression_iter1':        test  pred
 0         0     1
 1         0     1
 2         0     1
 3         0     1
 4         0     1
 ...     ...   ...
 29380     0     1
 29381     0     1
 29382     0     1
 29383     0     1
 29384     0     1
 
 [29385 rows x 2 columns],
 'Logistic Regression_iter2':        test  pred
 29385   0.0   NaN
 29386   0.0   NaN
 29387   0.0   NaN
 29388   0.0   NaN
 29389   0.0   NaN
 ...     ...   ...
 29380   NaN   1.0
 29381   NaN   1.0
 29382   NaN   1.0
 29383   NaN   0.0
 29384   NaN   0.0
 
 [58770 rows x 2 columns],
 'Logistic Regression_iter3':        test  pred
 58770   0.0   NaN
 58771   0.0   NaN
 58772   0.0   NaN
 58773   0.0   NaN
 58774   0.0   NaN
 ...     ...   ...
 29380   NaN   1.0
 29381   NaN   1.0
 29382   NaN   0.0
 29383   NaN   1.0
 29384   NaN   1.0
 
 [58770 rows x 2 columns],
 'Logistic Regression_iter4':        test  pred
 88155   1.0   NaN
 88156   1.0   NaN
 88157   1.0   NaN
 88158   1.0   NaN
 88159   1.0   NaN