In [None]:
# Import Standard Libraries
import numpy as np
from numpy import linalg as LA
import os
import sys
import pandas as pd
import sqlite3
import math

# Import 3rd Party Libraries
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from scipy.io import arff


In [None]:
# Build paths with os.path.join so the notebook runs on every operating
# system; the hard-coded '\\' separators only resolve correctly on Windows.
DatasetPath = os.path.join(os.getcwd(), 'data', '1year.arff')
conn = sqlite3.connect(os.path.join(os.getcwd(), 'ResultsDB.db'))
db = conn.cursor()
# One row per experiment run: model type, hyper-parameters, metric scores and
# the raw confusion-matrix counts. IF NOT EXISTS makes the cell safe to re-run.
db.execute('''  CREATE TABLE IF NOT EXISTS Results (Id INTEGER PRIMARY KEY,
                Type TEXT,
                TrainingSetPercentage REAL,
                Epochs INTEGER,
                LearningRate REAL,
                Precision REAL,
                Recall REAL,
                Accuracy REAL,
                F1 REAL,
                TruePositive INTEGER,
                TrueNegative INTEGER,
                FalsePositive INTEGER,
                FalseNegative INTEGER
                )''')


def InsertResult(Type, TrainingSetPercentage, Epochs, LearningRate, Precision, Recall, Accuracy, F1, TruePositive, TrueNegative, FalsePositive, FalseNegative):
    """Store one experiment run as a new row of the Results table and commit.

    Parameters
    ----------
    Type : str
        Name of the model that produced the metrics.
    TrainingSetPercentage, LearningRate, Precision, Recall, Accuracy, F1 : float
        Hyper-parameters and metric scores, stored as REAL.
    Epochs, TruePositive, TrueNegative, FalsePositive, FalseNegative : int
        Iteration count and confusion-matrix counts, stored as INTEGER.

    Uses the module-level ``db`` cursor and ``conn`` connection.
    """
    # Values are bound with '?' placeholders instead of being formatted into
    # the SQL text: a quote inside Type or a NaN metric no longer produces an
    # invalid (or injectable) statement.
    # NumPy scalars are cast to plain Python numbers first, because sqlite3
    # only adapts the native int / float / str / bytes / None types.
    row = (
        str(Type),
        float(TrainingSetPercentage),
        int(Epochs),
        float(LearningRate),
        float(Precision),
        float(Recall),
        float(Accuracy),
        float(F1),
        int(TruePositive),
        int(TrueNegative),
        int(FalsePositive),
        int(FalseNegative),
    )
    db.execute(
        "INSERT INTO Results (Type, TrainingSetPercentage, Epochs, LearningRate, "
        "Precision, Recall, Accuracy, F1, TruePositive, TrueNegative, "
        "FalsePositive, FalseNegative) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
        row,
    )

    conn.commit()


In [None]:
"""
#Initialize 
pca = custom_PCA(dataframe, k)
#fit on the dataframe
pca.fit()

"""

# Custom PCA
class custom_PCA:
    """Principal component analysis via eigen-decomposition of the feature covariance.

    Usage::

        pca = custom_PCA(dataframe, k)
        reduced_df = pca.fit()

    Parameters
    ----------
    A : pandas.DataFrame
        One row per sample. Every column except the last is a feature; the
        last column is the class label.
    k : float
        Fraction of the total eigenvalue sum to retain. Components are added
        (largest eigenvalue first) while the cumulative fraction is <= k.
    is_normalized : bool
        True when the features in ``A`` are already normalised. When False the
        features are mean-normalised internally before use.
    """

    def __init__(self, A, k,  is_normalized = True):
        # Feature matrix, shape (rows, features); the label column is split off.
        self.A = A.iloc[:, :-1].to_numpy(dtype=float)
        self.k = k
        # No of rows
        self.N_rows = A.shape[0]
        # No of columns of the input frame (this count includes the label column)
        self.N_columns = A.shape[1]
        # Target class as an integer column vector of shape (rows, 1)
        self.target_class = A.iloc[:, -1].to_numpy().reshape(self.N_rows,1).astype(int)
        self.is_normalized = is_normalized

    @staticmethod
    def _mean_normalize(M):
        """Column-wise (x - mean) / (max - min); a constant column maps to 0."""
        spread = M.max(axis=0) - M.min(axis=0)
        # A zero range would divide 0 by 0 and fill the column with NaN.
        spread = np.where(spread == 0, 1, spread)
        return (M - M.mean(axis=0)) / spread

    def norm(self, M= ''):
        """Return normalised data.

        With no argument (or the legacy ``''`` sentinel) the stored features
        are returned transposed, shape (features, rows); they are
        mean-normalised per feature first when ``is_normalized`` is False.
        With a matrix ``M``, its columns are mean-normalised and returned in
        the original orientation.
        """
        # Test the sentinel by type: comparing an ndarray with '' is
        # element-wise and has no single truth value.
        if M is None or isinstance(M, str):
            if self.is_normalized:
                return self.A.T
            # Normalise along the sample axis (per feature) and only then
            # transpose; normalising the transposed matrix along axis 0 would
            # centre every sample across its features instead.
            return self._mean_normalize(self.A).T
        return self._mean_normalize(M)

    # Calculate the covariance matrix
    def cov_(self):
        """Return the (features x features) covariance-style matrix A.A^T / n_samples."""
        A = self.norm()  # shape (features, rows)
        # Average over the samples (second axis of A), not over the features.
        return A.dot(A.T) / A.shape[1]

    def generate_reduced_eigenvectors(self):
        ''' Select the leading eigenvectors according to the k-threshold.

            Returns the projection matrix W of shape (features, n_selected),
            one eigenvector per column, ordered by decreasing eigenvalue.
        '''
        # Sort the (eigenvalue, eigenvector) pairs by the highest eigenvalue
        sort_eigen = sorted(self.list_eigenvalues(), key=lambda pair: pair[0], reverse=True)

        sum_eig = sum(pair[0] for pair in sort_eigen)
        if sum_eig == 0:
            raise ValueError('All eigenvalues are zero; the data has no variance to explain.')

        # Keep adding components while the explained fraction is still <= k
        partial_sum = 0
        idx = 0
        for eig_value, _ in sort_eigen:
            if (partial_sum / sum_eig) > self.k:
                break
            partial_sum += eig_value
            idx += 1

        print('Final selection is the first {} PCs'.format(idx))

        # Stack the selected eigenvectors as the columns of W
        return np.column_stack([vector for _, vector in sort_eigen[:idx]])

    def list_eigenvalues(self):
        """Return a list of (|eigenvalue|, eigenvector) tuples of the covariance matrix."""
        # The matrix is symmetric by construction (A.A^T), so eigh applies and
        # always returns real eigenvalues and eigenvectors.
        e_values, e_vector = LA.eigh(self.cov_())
        return [(np.abs(e_values[i]), e_vector[:, i]) for i in range(len(e_values))]

    def fit(self):
        """Project the samples on the selected components and return a labelled DataFrame.

        Columns are ``PC_1 .. PC_n`` (each mean-normalised) followed by
        ``Target_Class``.
        """
        # Should follow: (n,m) = (n,p) x (p, m)
        # norm().T equals self.A when is_normalized is True; otherwise it is
        # the same normalised data the eigenvectors were computed from.
        samples = self.norm().T
        data_PC_projected = samples.dot(self.generate_reduced_eigenvectors())
        # Normalize every projected component
        data_PC_projected = self._mean_normalize(data_PC_projected)

        # Label the new dataset
        labeled_PC_data = np.concatenate((data_PC_projected, self.target_class), axis = 1)

        print('Dimensionaly reduced matrix shape', labeled_PC_data.shape )

        # Name new columns
        cols = ['PC_'+str(x+1) for x in range(data_PC_projected.shape[1])]
        cols.append('Target_Class')

        return pd.DataFrame(data = labeled_PC_data, columns = cols)

In [None]:
def entropy(x,y,string):
    """Print and return the Shannon entropy (in bits) of a two-outcome distribution.

    Parameters
    ----------
    x, y : float
        Probabilities of the two outcomes.
    string : str
        Label inserted into the printed message.

    Returns
    -------
    float
        -(x*log2(x) + y*log2(y)), with a zero probability contributing 0.
    """
    probabilities = np.array([x, y], dtype=float)
    # 0 * log2(0) is taken as 0 (its limit); evaluating it directly gives NaN.
    positive = probabilities[probabilities > 0]
    # "0.0 - ..." rather than unary minus so an empty/zero sum is +0.0, not -0.0.
    val = 0.0 - np.sum(positive * np.log2(positive))
    print('Entropy {0}: {1:9.3f} bits'.format(string, val))
    return val

def data_balancing(df):
    """Return a frame with the same number of class-0 and class-1 rows.

    The class masks are derived from ``df['class']`` itself, so the function
    does not depend on variables computed in another cell. From each class the
    first ``n`` rows (in their original order) are kept, where ``n`` is the
    size of the smaller class. Class-0 rows come first, followed by class-1
    rows, and the index is renumbered from 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'class'`` column holding 0 / 1.
    """
    is_bankrupt = df['class'] == 1
    is_healthy = df['class'] == 0
    n_per_class = min(int(is_bankrupt.sum()), int(is_healthy.sum()))
    # Boolean indexing drops the non-matching rows outright. DataFrame.where
    # would keep them as all-NaN rows, which a positional slice then picks up
    # whenever the data is not already sorted by class.
    healthy_rows = df[is_healthy].head(n_per_class)
    bankrupt_rows = df[is_bankrupt].head(n_per_class)
    return pd.concat([healthy_rows, bankrupt_rows]).reset_index(drop=True)

# Load the ARFF file: `data` holds the records, `meta` the attribute metadata.
data,meta = arff.loadarff(DatasetPath)
df = pd.DataFrame(data)
# Missing values are replaced with 0 rather than dropped.
df = df.fillna(0)
# b0 / b1 are True wherever a cell is NOT the byte string b'0' / b'1'.
# DataFrame.where keeps those cells and overwrites the remaining ones, so the
# byte strings end up as the integers 0 and 1.
# (Presumably only the 'class' column holds such byte strings - TODO confirm.)
b0 = df != b'0'
b1 = df != b'1'
df = df.where(b0, int(0))
df = df.where(b1, int(1))
# Row masks and row counts per class (1 = bankrupt, 0 = not bankrupt).
bcountq = df['class'] == 1
nbcountq = df['class'] == 0
bcount = df.where(bcountq).count()['class']
nbcount = df.where(nbcountq).count()['class']
# Balance of data (entropy) computed from the two class proportions
entropy(bcount / df.count()['class'], nbcount / df.count()['class'], 'before data balancing')
# From here on `df` refers to the balanced frame.
df = data_balancing(df)
df

In [None]:
def meanNormalization(x):
    """Mean-normalise ``x``: (x - mean) / (max - min).

    For a pandas DataFrame the statistics are taken per column, stated
    explicitly with ``axis=0``. (Calling ``np.mean`` / ``np.max`` on a
    DataFrame forwards ``axis=None``, which newer pandas versions reduce over
    the whole frame instead of per column.) Any other input (ndarray, Series,
    list) is normalised with its global mean and range.

    A zero range (constant data) is replaced by 1, so the result is 0 instead
    of NaN.
    """
    if isinstance(x, pd.DataFrame):
        centre = x.mean(axis=0)
        spread = (x.max(axis=0) - x.min(axis=0)).replace(0, 1)
    else:
        centre = np.mean(x)
        spread = np.max(x) - np.min(x)
        if spread == 0:
            spread = 1
    return (x - centre) / spread
# Scale every feature column; the class column (position 64) is left out.
dfAdjusted = meanNormalization(df.drop(columns=df.columns[64]))
# Re-attach the untouched class values at position 64 under the name "Label".
dfAdjusted.insert(loc=64, column="Label", value=df.iloc[:, 64], allow_duplicates=True)
# `df` and `dfAdjusted` are the same object from here on.
df = dfAdjusted
df.head()

In [None]:
def returnCorrelationCoeffs(X, labelIndex, label_threshold=0.2, pair_threshold=0.45):
    """Report feature/label and feature/feature Pearson correlations.

    Prints ``"<column position>: <correlation with the label, 2 decimals>"``
    for every feature column.

    Parameters
    ----------
    X : pandas.DataFrame
        Features plus one label column.
    labelIndex : int
        Position of the label column; every other column is a feature.
    label_threshold : float
        A feature is reported when its rounded correlation with the label is
        above ``label_threshold`` or below ``-label_threshold``.
    pair_threshold : float
        A feature pair is reported when its rounded correlation is above
        ``pair_threshold`` (positive correlations only).

    Returns
    -------
    (corrList, corrResult) : tuple of 1-D object ndarrays
        ``corrList`` holds ``(feature_a, feature_b, rounded_corr)`` tuples and
        ``corrResult`` holds ``(feature, exact_corr_with_label)`` tuples.
        Tuples keep the field order; a set would lose it and would merge equal
        values (``{0, 1, 1.0}`` collapses to ``{0, 1}``).
    """
    def _as_object_array(items):
        # Build a 1-D object array whose elements are the tuples themselves;
        # np.array / np.append would unpack them into extra dimensions.
        out = np.empty(len(items), dtype=object)
        for position, item in enumerate(items):
            out[position] = item
        return out

    n_columns = X.shape[1]
    if n_columns < 2:
        return _as_object_array([]), _as_object_array([])

    label_pos = labelIndex % n_columns  # accept negative positions as well
    feature_positions = [c for c in range(n_columns) if c != label_pos]

    labels = np.array(X.iloc[:, label_pos], dtype=np.int32)
    features = X.iloc[:, feature_positions].to_numpy(dtype=float)

    # One correlation matrix for all columns (label last) instead of calling
    # np.corrcoef repeatedly for every feature and every pair.
    corr = np.corrcoef(np.column_stack([features, labels]), rowvar=False)
    label_corr = corr[:-1, -1]

    corrList = []
    corrResult = []
    for i, b in enumerate(feature_positions):
        rounded = round(label_corr[i], 2)
        print("{b}: {res}".format(b = b, res=rounded))
        if rounded > label_threshold or rounded < -label_threshold:
            corrResult.append((b, label_corr[i]))
        # Start at i + 1: every unordered pair once, never a column with itself.
        for j in range(i + 1, len(feature_positions)):
            pair_corr = round(corr[i, j], 2)
            if pair_corr > pair_threshold:
                corrList.append((b, feature_positions[j], pair_corr))
    return _as_object_array(corrList), _as_object_array(corrResult)
# Correlate every feature with every other feature and with the label
# (the label is the column at position 64).
# resultC collects the strongly correlated feature pairs, resultR the features
# that are strongly correlated with the label (with their exact correlation).
resultC,resultR = returnCorrelationCoeffs(df, 64)

Correlation of every feature with the label (bankruptcy within 5-x years), 1st-year data, before data balancing (unequal numbers of bankrupt and non-bankrupt companies):
0: -0.01
1: 0.07
2: -0.07
3: 0.01
4: -0.0
5: -0.05
6: -0.01
7: -0.0
8: -0.01
9: -0.02
10: -0.01
11: -0.02
12: -0.0
13: -0.01
14: 0.01
15: -0.01
16: -0.0
17: -0.01
18: -0.0
19: -0.0
20: -0.0
21: -0.01
22: -0.0
23: -0.01
24: -0.02
25: -0.01
26: -0.01
27: -0.01
28: -0.04
29: -0.0
30: -0.0
31: 0.05
32: 0.01
33: 0.03
34: -0.01
35: -0.01
36: -0.0
37: -0.02
38: -0.0
39: 0.01
40: -0.01
41: -0.0
42: -0.0
43: -0.0
44: 0.0
45: 0.01
46: -0.01
47: -0.0
48: 0.0
49: 0.02
50: 0.06
51: 0.03
52: -0.01
53: -0.01
54: -0.02
55: 0.0
56: -0.06
57: -0.0
58: 0.0
59: -0.0
60: -0.0
61: -0.0
62: 0.01
63: -0.01

Correlation of every feature with the label (bankruptcy within 5-x years), 1st-year data, after data balancing (equal numbers of bankrupt and non-bankrupt companies):
0: -0.03
1: 0.05
2: -0.05
3: 0.02
4: -0.05
5: -0.05
6: -0.08
7: 0.01
8: -0.04
9: -0.06
10: -0.14
11: -0.1
12: -0.07
13: -0.08
14: 0.05
15: -0.09
16: 0.01
17: -0.08
18: -0.07
19: -0.03
20: -0.21
21: -0.14
22: -0.07
23: -0.05
24: -0.06
25: -0.09
26: -0.04
27: -0.01
28: -0.16
29: 0.05
30: -0.07
31: 0.04
32: 0.04
33: 0.07
34: -0.15
35: -0.03
36: 0.02
37: -0.06
38: -0.11
39: 0.06
40: 0.02
41: -0.1
42: -0.04
43: -0.04
44: -0.01
45: 0.03
46: -0.09
47: 0.02
48: -0.07
49: 0.02
50: 0.05
51: 0.04
52: -0.05
53: -0.05
54: -0.14
55: -0.21
56: -0.04
57: 0.09
58: 0.03
59: -0.03
60: -0.04
61: 0.05
62: 0.02
63: -0.05

Features above .1 or below -.1 corr (year 1):
[{-0.1413018851459517, 10} {-0.20841261923141277, 20}
 {-0.1372111718368633, 21} {-0.15799640144955474, 28}
 {-0.15073406076182475, 34} {-0.10546165292651624, 38}
 {-0.13528288570222521, 54} {-0.21264496377398492, 55}]
 
Features above .1 or below -.1 corr (year 2):
[{0.10991412668598526, 14} {-0.12725072730586873, 21}
 {-0.12248897923901782, 28} {-0.1234228806033094, 34}
 {-0.19772762240757855, 38} {-0.17927408959921343, 41}
 {48, -0.13396938644807177} {-0.14547197542579082, 55}]

Features above .1 or below -.1 corr (year 3):
[{-0.1364854143630141, 21} {-0.1672251455335219, 28}
 {-0.22657462686080676, 34} {-0.19712648429452306, 38}
 {-0.10521445303302412, 47} {-0.1341474257923606, 55}
 {0.16542488035357647, 60}]
 
Features above .1 or below -.1 corr (year 4):
[{0, -0.14524070566749694} {0.14385170130264474, 1}
 {-0.2265714657775324, 2} {-0.1140296783075827, 5}
 {-0.1498606180797063, 6} {-0.14369125535185265, 9}
 {-0.12065561697059304, 10} {-0.1498606180797063, 13}
 {-0.1498606180797063, 17} {-0.11020892718297153, 21}
 {24, -0.15866649046031725} {-0.24862248191239458, 28}
 {-0.24684802136371506, 34} {0.10606657444051945, 35}
 {-0.13293591947714103, 37} {0.20849638235992823, 50}
 {0.10830222840664715, 51} {-0.12485768761118687, 54}]

Features above .1 or below -.1 corr (year 5):
[{0.12513067718459123, 1} {-0.12372472784830899, 2}
 {8, 0.1352011982500525} {-0.47574340253390146, 20}
 {-0.35757288140112825, 28} {-0.18541429475760515, 38}
 {0.11527405099147635, 50} {-0.1859186784182609, 54}
 {-0.17713206634969345, 55} {0.14115647965635486, 57}]

In [None]:
# The second argument is the fraction of the eigenvalue sum to retain.
# Original note: ".89 results in 2 features which is needed to graph it".
# NOTE(review): the note mentions .89 but 0.90 is passed - confirm which is intended.
pca = custom_PCA(df, 0.90)
pca_df = pca.fit()
# Custom built PCA implementation fitted to dataset:
# principal-component columns followed by Target_Class
pca_df.head()

In [None]:
# Correlation of the custom-PCA component columns to the label
# (the label is the last column, hence shape[1]-1)
returnCorrelationCoeffs(pca_df, pca_df.shape[1]-1)

In [None]:
# DataFrame.where keeps the frame's shape and turns the rows of the other
# class into NaN; NaN points are simply not drawn by matplotlib.
# Bankrupt companies
pca_df_br = pca_df.where(pca_df['Target_Class'] == 1)

# Non-bankrupt companies
pca_df_non = pca_df.where(pca_df['Target_Class'] == 0)

def plotAllFeatures(in_df_a, in_df_b, label_a='Non-bankrupt', label_b='Bankrupt'):
    """Scatter-plot every pair of feature columns for two groups of rows.

    One figure is produced per unordered pair of feature columns (the last
    column is treated as the label and skipped). Rows of ``in_df_a`` are drawn
    in green and rows of ``in_df_b`` in red.

    Parameters
    ----------
    in_df_a, in_df_b : pandas.DataFrame
        Frames with identical column layout.
    label_a, label_b : str
        Legend entries for the two groups.
    """
    n_features = in_df_a.shape[1] - 1
    for col in range(n_features):
        # Start at col + 1 so a column is never plotted against itself.
        for col2 in range(col + 1, n_features):
            fig, ax = plt.subplots()
            ax.plot(in_df_a.iloc[:, col], in_df_a.iloc[:, col2], 'go', label=label_a)
            ax.plot(in_df_b.iloc[:, col], in_df_b.iloc[:, col2], 'ro', label=label_b)
            # The axes carry the feature names; the legend identifies the two
            # groups (previously a single entry described the axes and was
            # attached to the green series only).
            ax.set_xlabel('Feature {F}'.format(F = col))
            ax.set_ylabel('Feature {S}'.format(S = col2))
            ax.set_title('(X)Feature {F} - (Y) Feature {S}'.format(F = col, S = col2))
            ax.legend()
            plt.show()
            # Release the figure so many feature pairs do not pile up in memory.
            plt.close(fig)

plotAllFeatures(pca_df_non, pca_df_br)

In [None]:
# NOTE(review): `pca` is rebound here; it previously named the custom_PCA instance.
pca = PCA(n_components=13)
# Fit on every column except the one at position 64 (the label column).
pca.fit(df.drop(df.columns[64], axis=1))
# Explained variance ratio of each of the 13 principal components
print(pca.explained_variance_ratio_)
pca_ds = pd.DataFrame(pca.transform(df.drop(df.columns[64], axis=1)))
# Append the class labels, taken from the custom-PCA frame, as the last column.
pca_ds.insert(pca_ds.shape[1],"Target_Class", pca_df['Target_Class'], True)
# Dataset after applying PCA from Scikit-Learn
# NOTE(review): this displays the whole frame; .head() would be enough.
pca_ds

In [None]:
# Correlation of the Scikit-learn PCA component columns to the label
# (the label is the last column, hence shape[1]-1)
returnCorrelationCoeffs(pca_ds, pca_ds.shape[1]-1)

In [None]:
# Clear the current pyplot figure before drawing the next series of plots.
plt.clf()

# Bankrupt companies (rows of the other class become NaN under .where)
pca_ds_br = pca_ds.where(pca_ds['Target_Class'] == 1)

# Non-bankrupt companies
pca_ds_non = pca_ds.where(pca_ds['Target_Class'] == 0)

# First argument is drawn in green, second in red.
plotAllFeatures(pca_ds_non, pca_ds_br)

Neural Network


In [None]:
# Import the Standard Copy Library
import copy
import random
# Seed both random number generators used below: `random` drives the
# train/test split, NumPy's global generator drives the initial weights of the
# neural network and of the logistic regression. Without the second seed the
# results change on every run.
random.seed(20)
np.random.seed(20)

# Work on the underlying array of the custom-PCA frame
# (feature columns followed by the Target_Class column)
Dataset =  pca_df.values


Setup Training and Testing Sets

In [None]:
# Share of the rows that goes into the training set
TrainingSetPercentage = 0.73
RowCount = len(Dataset)
TrainingRows = round(RowCount * TrainingSetPercentage)

# Work on a private copy so that Dataset itself is never modified
datasetCopy = copy.deepcopy(Dataset)
TrainingSet, TestingSet = [], []

# Draw the training rows one at a time without replacement: every picked row
# is removed from the pool so it cannot be selected again.
for _ in range(TrainingRows):
    picked = random.randrange(len(datasetCopy))
    TrainingSet.append(datasetCopy[picked])
    datasetCopy = np.delete(datasetCopy, picked, axis=0)

# Whatever is left in the pool becomes the testing set
TestingSet.extend(datasetCopy[position] for position in range(len(datasetCopy)))

Neural Network Class

In [None]:
# Define the Neural Network Class
class NeuralNetwork:
    """Feed-forward network: inputs -> one sigmoid hidden layer -> one sigmoid output.

    Attributes
    ----------
    InputNeurons : array-like
        Current input vector (set with ``SetInputs``).
    HiddenNeurons, OutputNeurons : ndarray
        Activations of the last forward pass, shapes (hidden, 1) and (1, 1).
    HiddenBias, OutputBias : ndarray
        Bias terms, stored separately from the activations.
    WeightsInputToHidden, WeightsHiddenToOutput : ndarray
        Weight matrices, shapes (hidden, inputs) and (1, hidden).
    """

    # Constructor
    def __init__(self, InputNeurons, HiddenLayerNeurons):
        # Setup Neuron Arrays (random draws happen in the original order)
        self.InputNeurons = np.zeros(shape=(InputNeurons, 1))
        self.HiddenNeurons = np.random.rand(HiddenLayerNeurons, 1)
        self.OutputNeurons = np.random.rand(1, 1)
        # Setup Weight Arrays
        self.WeightsInputToHidden = np.random.rand(HiddenLayerNeurons, InputNeurons)
        self.WeightsHiddenToOutput = np.random.rand(1, HiddenLayerNeurons)
        # Biases get their own storage, initialised from the random values
        # above. Keeping them inside the activation arrays meant every forward
        # pass overwrote the bias with the previous activation, so the output
        # depended on how often the network had been evaluated before.
        self.HiddenBias = self.HiddenNeurons.copy()
        self.OutputBias = self.OutputNeurons.copy()

    # Set the Input Neurons
    def SetInputs(self, InputArray):
        """Store the input vector used by the next Train / Predict call."""
        self.InputNeurons = InputArray

    # Sigmoid Function
    def SigmoidFunction(self, Val):
        """Logistic function 1 / (1 + e^-Val), element-wise."""
        # Clip so np.exp cannot overflow; the sigmoid is saturated far before +-500.
        return (1/(1 + np.exp(-np.clip(Val, -500, 500))))

    def SquaredErrorFunction(self, Predicted, Expected):
        """Return 0.5 * (Expected - Predicted)^2."""
        # The exponent belongs inside math.pow(); with the closing parenthesis
        # before ", 2" the call received a single argument and raised TypeError.
        return 0.5 * math.pow(Expected - Predicted, 2)

    def _forward(self):
        """Run a forward pass, refresh both activation arrays, return the input column."""
        x = np.asarray(self.InputNeurons, dtype=float).reshape(-1, 1)
        # Neuron = Bias + (w1 * i1) + (w2 * i2) + ..
        self.HiddenNeurons = self.SigmoidFunction(self.WeightsInputToHidden.dot(x) + self.HiddenBias)
        self.OutputNeurons = self.SigmoidFunction(self.WeightsHiddenToOutput.dot(self.HiddenNeurons) + self.OutputBias)
        return x

    # Train the Neural Network Model
    def Train(self, Epochs, LearningRate, ActualOutput, Verbose = False):
        """Run ``Epochs`` gradient-descent steps on the currently set input.

        Parameters
        ----------
        Epochs : int
            Number of forward/backward passes.
        LearningRate : float
            Step size of every weight and bias update.
        ActualOutput : float
            Target value (0 or 1) for the current input.
        Verbose : bool
            Print the prediction and error of every epoch.
        """
        if(Verbose):
            print(f"Expected output is: {ActualOutput}")
        for epoch in range(0, Epochs):
            x = self._forward()

            # Output error term (P - A). For a sigmoid output this is the
            # gradient of the log-loss with respect to the output pre-activation.
            Error = self.OutputNeurons[0] - ActualOutput

            if(Verbose):
                print (f"Predicted output: {self.OutputNeurons[0][0]}, Expected: {ActualOutput}..  Epoch {epoch}, Error {Error}")

            output_delta = float(Error[0])
            # Back-propagate through the hidden sigmoid: its derivative
            # h * (1 - h) is part of the chain rule for the hidden layer.
            # The pre-update hidden->output weights are used here.
            hidden_delta = (output_delta * self.WeightsHiddenToOutput.T
                            * self.HiddenNeurons * (1 - self.HiddenNeurons))

            # Update the weights and the biases
            self.WeightsHiddenToOutput = self.WeightsHiddenToOutput - LearningRate * output_delta * self.HiddenNeurons.T
            self.WeightsInputToHidden = self.WeightsInputToHidden - LearningRate * hidden_delta.dot(x.T)
            self.OutputBias = self.OutputBias - LearningRate * output_delta
            self.HiddenBias = self.HiddenBias - LearningRate * hidden_delta

    # Function to Predict on a trained NN
    def Predict(self, ExpectedOutput):
        """Forward-propagate the current input and return the rounded prediction.

        ``ExpectedOutput`` is only used in the printed message.
        """
        self._forward()
        Prediction = int(round(float(self.OutputNeurons[0][0])))
        print (f"Predicted: {Prediction}, Expected: {ExpectedOutput}")
        return Prediction
        

Do Processing and Print Metric Scores

In [None]:

# Dataset columns: num_cols - 1 features followed by the class label
num_rows, num_cols = Dataset.shape
# Hyper-parameters shared by the scikit-learn network and the custom network
Epochs = 15000
LearningRate = 0.01

# Confusion-matrix counters filled in while testing the custom network
TruePositive = 0
FalsePositive = 0
FalseNegative = 0
TrueNegative = 0

In [None]:
# Scikit Learn Neural Network
# Split features (all but the last column) from the class label (last column).
SciKitTrainingSet = np.asarray(TrainingSet)[:, :-1]
SciKitTrainingSetClassifications = np.asarray(TrainingSet)[:, -1]

SciKitTestingSet = np.asarray(TestingSet)[:, :-1]
SciKitTestingSetClassifications = np.asarray(TestingSet)[:, -1]

# hidden_layer_sizes holds one entry per hidden layer. A single hidden layer of
# ceil((num_cols+1)/2) neurons mirrors the custom NeuralNetwork; the former
# value (1, n) built two hidden layers, the first with only one neuron.
SciKitNN = MLPClassifier(solver="sgd", activation="logistic", hidden_layer_sizes=(math.ceil((num_cols+1)/2),), learning_rate="constant", learning_rate_init=LearningRate, max_iter=Epochs, random_state=1)
SciKitNN.fit(SciKitTrainingSet, SciKitTrainingSetClassifications)
SciKitNNPrediction = SciKitNN.predict(SciKitTestingSet)
SciKitNNScore = accuracy_score(SciKitTestingSetClassifications, SciKitNNPrediction)
print(f"SciKit Accuracy: {SciKitNNScore}")

In [None]:
# Reset the counters here so that re-running this cell does not add to the
# counts of a previous run.
TruePositive = 0
FalsePositive = 0
FalseNegative = 0
TrueNegative = 0

# Neural Network Constructor Parameters: InputLayer Neurons, HiddenLayer Neurons.. Output here is always 1 Neuron 1 or 0
NN = NeuralNetwork(num_cols-1, math.ceil((num_cols+1)/2))
print(f"Starting Training Epochs: {Epochs}, LearningRate: {LearningRate}")
# NOTE(review): every training row is trained for all Epochs before the next
# row is shown to the network - confirm this schedule is intended.
for Index, TrSetRow in enumerate(TrainingSet):
    print(f"Training: {Index+1} of {len(TrainingSet)}")
    NN.SetInputs(TrSetRow[0:num_cols-1])
    # Parameters for NN Train: Epochs (iterations for training), Learning Rate, Actual Output
    NN.Train(Epochs, LearningRate, TrSetRow[num_cols-1], Verbose=False)

print("Starting Testing")

for TestSetRow in TestingSet:
    NN.SetInputs(TestSetRow[0:num_cols-1])
    Expected = TestSetRow[num_cols-1]
    # Predict's argument is only used for its printed message
    Prediction = NN.Predict(Expected)

    # Update Metrics
    if Expected == 1 and Prediction == 1:
        TruePositive += 1
    elif Expected == 1 and Prediction == 0:
        FalseNegative += 1
    elif Expected == 0 and Prediction == 1:
        FalsePositive += 1
    elif Expected == 0 and Prediction == 0:
        TrueNegative += 1

# Each ratio falls back to 0.0 when its denominator is 0 (e.g. the network never
# predicts the positive class) instead of raising ZeroDivisionError.
TotalPredictions = TruePositive + TrueNegative + FalsePositive + FalseNegative
# Precision: TP/ (TP+FP)
Precision = TruePositive / (TruePositive + FalsePositive) if (TruePositive + FalsePositive) else 0.0
# Recall: TP / (TP+FN)
Recall = TruePositive / (TruePositive + FalseNegative) if (TruePositive + FalseNegative) else 0.0
# Accuracy: (TP + TN)/(TP + TN + FP + FN)
Accuracy = (TruePositive + TrueNegative) / TotalPredictions if TotalPredictions else 0.0
# F1 Score: (2 * Precision * Recall) / (Precision + Recall)
F1 = (2 * Precision * Recall)/(Precision + Recall) if (Precision + Recall) else 0.0
print(f"True Positives: {TruePositive}")
print(f"True Negatives: {TrueNegative}")
print(f"False Positives: {FalsePositive}")
print(f"False Negatives: {FalseNegative}")

print(f"Precision {Precision}")
print(f"Recall {Recall}")
print(f"Accuracy {Accuracy}")
print(f"F1 Score {F1}")

# Insert Results to DB for Plotting
InsertResult("Neural Network", TrainingSetPercentage, Epochs, LearningRate, Precision, Recall, Accuracy, F1, TruePositive, TrueNegative, FalsePositive, FalseNegative)


In [None]:
# Visualize Confusion Matrix
def confusion_matrix_visualization(confusion_matrix):
    """Draw a confusion matrix as a blue heat map with the count written in each cell.

    Rows are labelled with the ground-truth tick labels (y axis) and columns
    with the predicted tick labels (x axis).

    Parameters
    ----------
    confusion_matrix : numpy.ndarray
        2-D array of counts.
    """
    row_count = confusion_matrix.shape[0]
    col_count = confusion_matrix.shape[1]

    fig, ax = plt.subplots()
    heatmap = ax.imshow(confusion_matrix, cmap=plt.cm.Blues)
    ax.figure.colorbar(heatmap, ax=ax)
    ax.set(
        xticks=np.arange(col_count),
        yticks=np.arange(row_count),
        xticklabels=['Bankrupted', 'Saved'],
        yticklabels=['GT_Bankrupted', 'GT_Saved'],
        title='Confusion Matrix',
    )

    # imshow puts the column index on x and the row index on y, so the text
    # for cell [row, col] goes to position (col, row).
    text_style = dict(size=18, horizontalalignment='center', verticalalignment='center')
    for row in range(row_count):
        for col in range(col_count):
            ax.text(col, row, confusion_matrix[row, col], **text_style)


# Confusion Matrix
# The plot labels rows with the ground truth (GT_Bankrupted, GT_Saved) and
# columns with the prediction (Bankrupted, Saved). The row for bankrupt ground
# truth is therefore [TP, FN] and the row for saved ground truth is [FP, TN].
confusion_matrix = np.asarray([[TruePositive, FalseNegative], [FalsePositive, TrueNegative]])
confusion_matrix_visualization(confusion_matrix = confusion_matrix)


## Train Logistic Regression Model

In [None]:
# Separate the labels (last column) from the feature variables
X_var = np.asarray(TrainingSet)[:, :-1]
Y_var = np.asarray(TrainingSet)[:, -1]

# Put the Test set in a DataFrame that reuses the column names of pca_df
TestData_df = pd.DataFrame(data = TestingSet, columns = pca_df.columns)
TestData_df.head()

In [None]:
# Initialize parameters
# a is the learning rate (step size) of the gradient-descent weight update
a = 0.1
# Iterations number
no_of_iter = 3000000

# Initialize Weights: one weight per feature, drawn uniformly from [-0.1, 0.1)
W = np.random.uniform(low=-0.1, high=0.1, size=(X_var.shape[1]))

# Number of training samples
N = X_var.shape[0]

# NOTE(review): this module-level x_w is not read anywhere else in the visible
# notebook (Sigmoid_F computes its own local x_w) - looks like a leftover.
x_w=X_var.dot(W)

In [None]:
# Sigmoid Function
def Sigmoid_F(x_var, w):
    """Return the logistic function of the linear score ``x_var . w``.

    ``x_var`` may be a single feature vector or a matrix with one sample per
    row; ``w`` is the weight vector.
    """
    linear_score = np.dot(x_var, w)
    denominator = 1 + np.exp(-linear_score)
    return 1 / denominator

def Loss(Y_var, sig_Y):
    """Mean binary cross-entropy between labels ``Y_var`` and probabilities ``sig_Y``.

    Probabilities are clipped to the open interval (0, 1) first: a saturated
    sigmoid output of exactly 0 or 1 would otherwise evaluate log(0) and turn
    the mean into inf or NaN.
    """
    eps = np.finfo(float).eps
    probabilities = np.clip(sig_Y, eps, 1 - eps)
    return (-(Y_var*np.log(probabilities)) -  ((1 - Y_var) * np.log(1 - probabilities))).mean()

def Loss_derivative(sig_y, truth_y, x_matrix, N=None):
    """Gradient of the mean cross-entropy loss with respect to the weights.

    Parameters
    ----------
    sig_y : array-like
        Predicted probabilities, one per sample.
    truth_y : array-like
        True labels, one per sample.
    x_matrix : array-like
        Feature matrix with one sample per row.
    N : int, optional
        Divisor of the summed gradient. Defaults to the number of rows of
        ``x_matrix``. (A default of ``N=N`` is evaluated once, when the
        function is defined, and would keep a stale sample count.)
    """
    if N is None:
        N = np.shape(x_matrix)[0]
    dY =  sig_y - truth_y
    return np.dot(dY, x_matrix)/N


In [None]:
# Gradient Descent
for ii in range(no_of_iter):
    # Sigmoid function output
    sig_Y = Sigmoid_F(X_var, W)
    # Gradient computation
    dw = Loss_derivative(sig_y = sig_Y, truth_y = Y_var, x_matrix = X_var)
    # Update weights
    W = W - a * dw

# The loss is only reported, never used by the update, so it is evaluated once
# on the final weights instead of in every iteration. This also keeps L defined
# when no_of_iter is 0.
L = Loss(Y_var, Sigmoid_F(X_var, W))
print('Final Loss: ', L)

## Custom LR model prediction

In [None]:
# Score the whole test set in one call instead of appending one row at a time:
# DataFrame.append no longer exists in pandas 2.x, and growing a frame row by
# row is quadratic.
test_features = TestData_df.iloc[:, :-1].to_numpy(dtype=float)
probabilities = np.asarray(Sigmoid_F(test_features, W), dtype=float)
# A NaN probability produces no prediction row (as in the row-by-row version).
scored = ~np.isnan(probabilities)

predictions = pd.DataFrame({
    'Index': TestData_df.index[scored],
    # Probability above 0.5 -> predicted bankrupt (1), otherwise 0
    'bankruptcy': (probabilities[scored] > 0.5).astype(int),
    'Ground_Truth': TestData_df.iloc[:, -1].to_numpy()[scored],
})

# Metrics
TP = predictions.loc[(predictions.bankruptcy == 1) & (predictions.Ground_Truth == 1)].shape[0]
TN = predictions.loc[(predictions.bankruptcy == 0) & (predictions.Ground_Truth == 0)].shape[0]
FP = predictions.loc[(predictions.bankruptcy == 1) & (predictions.Ground_Truth == 0)].shape[0]
FN = predictions.loc[(predictions.bankruptcy == 0) & (predictions.Ground_Truth == 1)].shape[0]

# Confusion Matrix
# The plot labels rows with the ground truth and columns with the prediction,
# so the bankrupt ground-truth row is [TP, FN] and the saved row is [FP, TN].
confusion_matrix = np.asarray([[TP, FN], [FP, TN]])

metrics_string = 'True Positives {}  True Negatives {}  False Positives {}  False Negatives {}'
print(metrics_string.format(TP, TN, FP, FN))
 

In [None]:
# Visualize Confusion Matrix
def confusion_matrix_visualization(confusion_matrix):
    """Draw a confusion matrix as a blue heat map with the count written in each cell.

    NOTE(review): this is a second, identical definition of a function that is
    already defined earlier in the notebook; one of the two can be removed.

    Parameters
    ----------
    confusion_matrix : numpy.ndarray
        2-D array of counts. Rows get the ground-truth tick labels (y axis),
        columns the predicted tick labels (x axis).
    """
    fig, ax = plt.subplots()
    im = ax.imshow(confusion_matrix, cmap=plt.cm.Blues)
    ax.figure.colorbar(im, ax=ax)
    # Tick labels are fixed to the two classes of this notebook.
    ax.set(xticks=np.arange(confusion_matrix.shape[1]),
           yticks=np.arange(confusion_matrix.shape[0]),
           xticklabels=['Bankrupted', 'Saved'],  
           yticklabels=['GT_Bankrupted', 'GT_Saved'],
           title = 'Confusion Matrix' )
    # Write every count into its cell: column index j is the x position,
    # row index i the y position.
    for i in range(confusion_matrix.shape[0]):
        for j in range(confusion_matrix.shape[1]):
            ax.text(j, i, confusion_matrix[i, j], size = 18, horizontalalignment='center', verticalalignment='center')



In [None]:
# Visualize confusion matrix
confusion_matrix_visualization(confusion_matrix = confusion_matrix)

# Each ratio falls back to 0.0 when its denominator is 0 (e.g. no positive
# predictions) instead of raising ZeroDivisionError.
# Precision
Precision = TP/(TP + FP) if (TP + FP) else 0.0
# Recall
Recall = TP/(TP + FN) if (TP + FN) else 0.0
# Accuracy
Accuracy = (TP+TN)/(TP+TN+FP+FN) if (TP+TN+FP+FN) else 0.0
#F1 Score
F_1 = 2*(Precision*Recall)/(Precision+Recall) if (Precision+Recall) else 0.0

print('Precision {} \nRecall {} \nAccuracy {} \nF1 score {}'.format(Precision, Recall, Accuracy,  F_1 ))
# The iteration count is stored in the Epochs column and `a` as the learning rate.
InsertResult("Logistic Regression", TrainingSetPercentage, no_of_iter, a, Precision, Recall, Accuracy, F_1, TP, TN, FP, FN)

# Sklearn LR model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

# NOTE(review): the `confusion_matrix` import in this cell rebinds the name that
# earlier cells use for the NumPy confusion-matrix array; re-running those
# cells afterwards would pass the sklearn function instead of an array.
# Iteration cap handed to scikit-learn (rebinds the value used by the custom model)
no_of_iter = 1000000
# `a` is only passed on as the LearningRate value of the stored result row;
# it is not given to LogisticRegression.
a = 0
logisticRegr = LogisticRegression(max_iter = no_of_iter)
logisticRegr.fit(X_var, Y_var)

In [None]:
# Split the target class (last column) from the features of the testing set
Y_test = np.array(TestingSet)[:, -1]
X_test = np.array(TestingSet)[:, :-1]

In [None]:
# Predicted labels for the testing set.
# NOTE(review): this rebinds `predictions`, which earlier named the DataFrame
# of the custom logistic-regression results.
predictions = logisticRegr.predict(X_test)

metrics:

In [None]:
# Metric scores as plain Python floats
skl_precision_score = float(precision_score(Y_test, predictions))
skl_recall_score = float(recall_score(Y_test, predictions))
skl_accuracy_score = float(accuracy_score(Y_test, predictions))
skl_f1_score = float(f1_score(Y_test, predictions))

# For binary labels the flattened sklearn confusion matrix is ordered
# tn, fp, fn, tp. The NumPy integers are converted to Python ints before they
# are handed to the database layer.
TN, FP, FN, TP = (int(count) for count in confusion_matrix(Y_test.tolist(), predictions.tolist()).ravel())

print("Precision {} \nRecall {} \nAccuracy {} \nF1 score {}".format(skl_precision_score, skl_recall_score, skl_accuracy_score, skl_f1_score))
InsertResult("SkLearn_Logistic_Regression", TrainingSetPercentage, no_of_iter, a, skl_precision_score, skl_recall_score, skl_accuracy_score, skl_f1_score, TP, TN, FP, FN)

## Visualize results

In [None]:
# read the results csv, obtained from db
# (os.path.join keeps the path valid on every operating system; a literal '\\'
# separator only works on Windows)
results = pd.read_csv(os.path.join(os.getcwd(), 'Results.csv'))

In [None]:
cols = ['Epochs', 'Precision', 'Recall', 'Accuracy', 'F1', ]

# (column in `results`, y-axis label) for the four panels, in reading order
metric_panels = [
    ('Precision', 'Precision %'),
    ('Recall', 'Recall %'),
    ('Accuracy', 'Accuracy %'),
    ('F1', 'F1 score %'),
]
# (learning rate, line/marker style, legend entry)
rate_styles = [
    (0.1, 'o--', 'a1=0.1'),
    (0.01, 'x--', 'a2=0.01'),
    (0.001, '^--', 'a3=0.001'),
]

# One 2x2 figure per model type, one panel per metric, one line per learning rate
for model_type in ('Logistic Regression', 'Neural Network'):
    model_rows = results.loc[results.Type == model_type]

    fig, axes = plt.subplots(2, 2, figsize=(10, 10))
    for ax, (metric, ylabel) in zip(axes.ravel(), metric_panels):
        for rate, style, _ in rate_styles:
            # np.isclose rather than ==: the rates are floats read from a CSV
            runs = model_rows.loc[np.isclose(model_rows.LearningRate, rate), cols]
            ax.plot(runs.Epochs, runs[metric], style)
        ax.set_xlabel('Training Iterations')
        ax.set_ylabel(ylabel)
        ax.legend([entry for _, _, entry in rate_styles], loc='upper right')

    # Name the model so the two figures can be told apart
    fig.suptitle(model_type)
    plt.tight_layout()
    plt.show()


Sklearn Logistic Regression

In [None]:
# Sklearn results: rows stored under the 'SkLearn_Logistic_Regression' type,
# restricted to the columns listed in `cols`
Pr_skl = results[cols].loc[results.Type == 'SkLearn_Logistic_Regression']
Pr_skl.head()

In [None]:
# One panel per metric, drawn in a loop instead of four copy-pasted sections
fig, axes = plt.subplots(2, 2, figsize=(10, 10))

# (column in Pr_skl, y-axis label) in reading order
sklearn_panels = [
    ('Precision', 'Precision %'),
    ('Recall', 'Recall %'),
    ('Accuracy', 'Accuracy %'),
    ('F1', 'F1 score %'),
]
for ax, (metric, ylabel) in zip(axes.ravel(), sklearn_panels):
    ax.plot(Pr_skl.Epochs, Pr_skl[metric], 'o--')
    ax.set_xlabel('Training Iterations')
    ax.set_ylabel(ylabel)

# A single layout pass once all panels are labelled
plt.tight_layout()

plt.show()