In [1]:
#Imports for artificial neural network
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree


np.random.seed(42)  #Seed is necessary to ensure consistent replication of results

df = pd.read_csv('data.csv')

In [2]:
#Now build decision tree classifiers
def data_info():
    df.info()
    null_counts = df.isnull().sum()
    null_columns = null_counts[null_counts > 0]
    return df.describe().transpose()

df.isnull().sum().sum()

Y=df["Bankrupt?"]
X=df.drop(["Bankrupt?"],axis=1)

from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
rf.fit(X,Y)

importance_features = rf.feature_importances_
important_features_df = pd.DataFrame({'Feature': X.columns, 'Importance': importance_features})
important_features_df_desc =important_features_df.sort_values(by='Importance', ascending=False)
important_features_df_asc =important_features_df.sort_values(by='Importance', ascending=True)
important_features_df_desc.head(40)
required = important_features_df_desc[important_features_df_desc.Importance>0.0105]
required_features = df[required.Feature]  
correlation_matrix = df.corr()
target_corr = correlation_matrix["Bankrupt?"].abs().sort_values(ascending=False)
target_corr = target_corr.drop("Bankrupt?")
selected_features = target_corr[target_corr>=0.1]
selected_columns = list(selected_features.index)
df_selected = df[selected_columns]
actual_corr = correlation_matrix["Bankrupt?"][selected_features.index]
positive_corr = actual_corr[actual_corr >= 0].sort_values(ascending=False)
negative_corr = actual_corr[actual_corr < 0].sort_values(ascending=True)
sorted_correlations = pd.concat([positive_corr, negative_corr])

import seaborn as sns

selexted_X = df_selected
selected_y = df["Bankrupt?"]
X_train,X_test,y_train,y_test = train_test_split(selexted_X,selected_y,test_size = 0.2,random_state=42)

from sklearn.model_selection import GridSearchCV
params = {
    'max_depth': [2, 3, 5, 10, 20],
    'min_samples_leaf': [10, 15, 20, 50, 100],
    'criterion': ["gini", "entropy"]
}

from sklearn.tree import DecisionTreeClassifier

dt_classifier = DecisionTreeClassifier(max_depth =5,min_samples_split = 20,random_state=42)
grid_search_dt = GridSearchCV(estimator=dt_classifier, 
                           param_grid=params, 
                           cv=4, n_jobs=-1, verbose=1, scoring = "accuracy")

grid_search_dt.fit(X_train,y_train)

grid_search_dt.best_estimator_

best_dt = grid_search_dt.best_estimator_

dt_train_pred = best_dt.predict(X_train)
dt_test_pred = best_dt.predict(X_test)

tree_info = best_dt.tree_
num_nodes = tree_info.node_count
num_leaves = tree_info.n_leaves
num_decision = num_nodes - num_leaves

rf_classifier = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(estimator=rf_classifier, 
                           param_grid=params, 
                           cv=4, n_jobs=-1, verbose=1, scoring = "accuracy")

grid_search_rf.fit(X_train,y_train)


grid_search_rf.best_estimator_

best_rf = grid_search_rf.best_estimator_

y_train_pred = best_rf.predict(X_train)
y_test_pred = best_rf.predict(X_test)

#Classifier 1
classifier1 = pd.DataFrame(y_train_pred, index=X_train.index, columns=['Random Forest'])

#classifier 1a
classifier1a = pd.DataFrame(y_test_pred, index=X_test.index, columns=['Random Forest'])

Fitting 4 folds for each of 50 candidates, totalling 200 fits
Fitting 4 folds for each of 50 candidates, totalling 200 fits


In [3]:
#Additional imports for linear regression model
import statsmodels.api as sm
from tqdm import tqdm
from sklearn.metrics import mean_squared_error

target = 'Bankrupt?'
predictors = [col for col in df.columns if col != target]

#forward selection
def forward_selection (df, target, predictors):
    features = []
    best_features = []
    while True: 
        features_left = list(set(predictors) - set(features))
        new_val = pd.Series(index = features_left)
        for new_col in tqdm(features_left, desc="Processing Columns"):
            model = sm.OLS(df[target], sm.add_constant(df[features + [new_col]])).fit()
            new_val[new_col] = model.pvalues[new_col]
        min_p_value = new_val.min()
        if min_p_value < 0.05:
            best_feature = new_val.idxmin()
            features.append(best_feature)
            best_features.append(best_feature)
        else:
            break
        return best_features

best_predictors = forward_selection(df, target, list(X_train.columns))

final_model = sm.OLS(y_train, sm.add_constant(X_train[best_predictors])).fit()

#find accuracy 
linRegPred = final_model.predict(sm.add_constant(X_train[best_predictors]))

#Classifier 2
classifier2 = pd.DataFrame(linRegPred, columns=['linear Regression'])

classifier2a = pd.DataFrame(final_model.predict(sm.add_constant(X_test[best_predictors])), columns=['linear Regression'])

assert classifier1.index.equals(classifier2.index), "Indices do not match!"

assert classifier1a.index.equals(classifier2a.index), "Indices do not match!"



  new_val = pd.Series(index = features_left)
Processing Columns: 100%|█████████████████████████████████████████████████████████████| 32/32 [00:00<00:00, 340.42it/s]


In [4]:
X_train.columns = X_train.columns.str.strip()
X_test.columns = X_test.columns.str.strip()

feature = 'Net Income to Total Assets'

x = X_train[[feature]]
xtest = X_test[[feature]]
y = y_train
ytest = y_test

x = sm.add_constant(x)
xtest = sm.add_constant(xtest)
final_model = sm.OLS(y, x).fit()

predictions = final_model.predict(x)

#Classifier 3
classifier3 = pd.DataFrame(predictions, columns=['linear Regression 2'])

classifier3a = pd.DataFrame(final_model.predict(xtest), columns=['linear Regression 2'])

assert classifier2.index.equals(classifier3.index), "Indices do not match!"

assert classifier2a.index.equals(classifier3a.index), "Indices do not match!"

In [5]:
#Now we get the classifier for logistic regression

In [6]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

model = LogisticRegression(max_iter=1000, class_weight='balanced')
# Train the model
model.fit(X_train, y_train)

# Predict the target on the test set
y_pred = model.predict(X_test)

classifier4 = pd.DataFrame(model.predict(X_train), index=X_train.index, columns=['Logistic Regression'])

classifier4a = pd.DataFrame(model.predict(X_test), index=X_test.index, columns=['Logistic Regression'])

In [7]:
#Imports for support vector machine
from sklearn import svm
from sklearn import datasets
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from mpl_toolkits.mplot3d import Axes3D

scaler = StandardScaler()

df['Bankrupt?'] = pd.Categorical(df['Bankrupt?'])

dependentVariable = ['Bankrupt?']

featureGroupA = [
    'Tax rate (A)',
    'Net Value Per Share (B)',
    'Net Value Per Share (A)',
    'Net Value Per Share (C)',
    'Persistent EPS in the Last Four Seasons',
    'Operating Profit Per Share (Yuan ¥)',
    'Per Share Net profit before tax (Yuan ¥)',
    'Debt ratio %',
    'Operating profit/Paid-in capital',
    'Net profit before tax/Paid-in capital',
    'Quick Assets/Total Assets',
    'Cash/Total Assets',
    'Current Liability to Assets',
    'Total expense/Assets',
    'Equity to Long-term Liability',
    'Liability-Assets Flag',
    'Equity to Liability'
    ]

df.columns = df.columns.str.strip().str.lower()
independentVariablesA = [col.lower() for col in featureGroupA]

df['independentVariableA'] = df[independentVariablesA].mean(axis=1)

featureGroupB = [
    'Operating Gross Margin',
    'Realized Sales Gross Margin',
    'Net worth/Assets',
    'Working Capital to Total Assets',
    'Working Capital/Equity',
    'Retained Earnings to Total Assets',
    'Current Liability to Current Assets',
    'Net Income to Total Assets',
    'Gross Profit to Sales',
    'Net Income to Stockholder\'s Equity'
]

independentVariablesB = [col.lower() for col in featureGroupB]
df['independentVariableB'] = df[independentVariablesB].mean(axis=1)

featureGroupC = [
    'CFO to Assets',
    'Borrowing dependency',
    'Operating profit per person',
    'ROA(A) before interest and % after tax',
    'ROA(B) before interest and depreciation after tax',
    'ROA(C) before interest and depreciation before interest',
    'Current Liabilities/Equity',
    'Current Liability to Equity',
    'Liability to Equity'
]

independentVariablesC = [col.lower() for col in featureGroupC]
df['independentVariableC'] = df[independentVariablesC].mean(axis=1)

dependentVariable = [col.lower() for col in dependentVariable]

X = df[['independentVariableA', 'independentVariableB', 'independentVariableC']]

Y = df[dependentVariable]

Y = Y.values.ravel()

#In order to match instances with the currently in use X_train data, we grab the indices
trainIndices = X_train.index
#Same with the test data
testIndices = X_test.index
trainX = X.loc[trainIndices]
trainY = Y[trainIndices]
testX = X.drop(trainIndices)
testY = np.delete(Y, trainIndices)

polynomial_svm_clf = Pipeline([
    ("poly_features", PolynomialFeatures(degree=3)),
    ("scaler", StandardScaler()),
    ("svm_clf", svm.LinearSVC(C=100000, loss="squared_hinge", max_iter=10000))
])

polynomial_svm_clf.fit(trainX, trainY)

polynomial_svm_clf2 = Pipeline([
    ("poly_features", PolynomialFeatures(degree=15)),
    ("scaler", StandardScaler()),
    ("svm_clf", svm.LinearSVC(C=1000, loss="hinge", max_iter=1000))
])

polynomial_svm_clf2.fit(trainX, trainY)

print('Evaluation: ')
trainPredictions = polynomial_svm_clf.predict(trainX)
#Classifier 5
classifier5 = pd.DataFrame(trainPredictions, index=X_train.index, columns=['Support Vector Machine 1'])


testPredictions = polynomial_svm_clf.predict(testX)
classifier5a = pd.DataFrame(testPredictions, index=X_test.index, columns=['Support Vector Machine 1'])

trainPredictions2 = polynomial_svm_clf2.predict(trainX)
#Classifier 6
classifier6 = pd.DataFrame(trainPredictions2, index=X_train.index, columns=['Support Vector Machine 2'])

testPredictions2 = polynomial_svm_clf2.predict(testX)
classifier6a = pd.DataFrame(testPredictions2, index=X_test.index, columns=['Support Vector Machine 2'])




Evaluation: 




In [8]:
print('classifier1: ', classifier1)
print('classifier2: ', classifier2)
print('classifier3: ', classifier3)
print('classifier4: ', classifier4)
print('classifier5: ', classifier5)
print('classifier6: ', classifier6)

assert classifier3.index.equals(classifier2.index), "Indices do not match!"
assert classifier4.index.equals(classifier3.index), "Indices do not match!"
assert classifier5.index.equals(classifier4.index), "Indices do not match!"
assert classifier6.index.equals(classifier5.index), "Indices do not match!"

print('classifier1a: ', classifier1a)
print('classifier2a: ', classifier2a)
print('classifier3a: ', classifier3a)
print('classifier4a: ', classifier4a)
print('classifier5a: ', classifier5a)
print('classifier6a: ', classifier6a)

assert classifier3a.index.equals(classifier2a.index), "Indices do not match!"
assert classifier4a.index.equals(classifier3a.index), "Indices do not match!"
assert classifier5a.index.equals(classifier4a.index), "Indices do not match!"
assert classifier6a.index.equals(classifier5a.index), "Indices do not match!"

train_df = pd.concat([classifier1, classifier2, classifier3, classifier4, classifier5, classifier6], axis=1)
test_df = pd.concat([classifier1a, classifier2a, classifier3a, classifier4a, classifier5a, classifier6a], axis=1)

print(train_df.head())
print(test_df.head())

classifier1:        Random Forest
3759              0
1782              0
5013              0
5412              0
3066              0
...             ...
3772              0
5191              0
5226              0
5390              0
860               0

[5455 rows x 1 columns]
classifier2:        linear Regression
3759           0.039881
1782           0.027167
5013           0.028720
5412           0.028267
3066           0.035477
...                 ...
3772          -0.006723
5191          -0.016226
5226           0.024276
5390          -0.032160
860           -0.024261

[5455 rows x 1 columns]
classifier3:        linear Regression 2
3759             0.039881
1782             0.027167
5013             0.028720
5412             0.028267
3066             0.035477
...                   ...
3772            -0.006723
5191            -0.016226
5226             0.024276
5390            -0.032160
860             -0.024261

[5455 rows x 1 columns]
classifier4:        Logistic Regression
375