In [61]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

In [62]:
df = pd.read_csv("sorted_correlation_with_values.csv").copy()

In [63]:
target = ['Bankrupt?']

In [64]:
features = list(df.columns)
features.remove('Bankrupt?')
features.remove('Unnamed: 0')
features

[' Net Income to Total Assets',
 ' ROA(A) before interest and % after tax',
 ' ROA(B) before interest and depreciation after tax',
 ' ROA(C) before interest and depreciation before interest',
 ' Net worth/Assets',
 ' Debt ratio %',
 ' Persistent EPS in the Last Four Seasons',
 ' Retained Earnings to Total Assets',
 ' Net profit before tax/Paid-in capital',
 ' Per Share Net profit before tax (Yuan ¥)',
 ' Current Liability to Assets',
 ' Working Capital to Total Assets',
 " Net Income to Stockholder's Equity",
 ' Borrowing dependency',
 ' Current Liability to Current Assets',
 ' Liability to Equity',
 ' Net Value Per Share (A)',
 ' Net Value Per Share (B)',
 ' Net Value Per Share (C)',
 ' Current Liability to Equity',
 ' Current Liabilities/Equity',
 ' Working Capital/Equity',
 ' Operating Profit Per Share (Yuan ¥)',
 ' Operating profit/Paid-in capital',
 ' Liability-Assets Flag',
 ' Total expense/Assets',
 ' Equity to Long-term Liability',
 ' CFO to Assets',
 ' Tax rate (A)',
 ' Cash/T

In [65]:
# Get the data
X = df[features]
y = df[target]

In [66]:
# Split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state = 42)

In [67]:
# make an array of depths to choose from
depths = np.arange(1, 11)

# define the number to try out for max leaf nodes
num_leafs = list(range(5, 16))

# define the number to try out for min sample leaf
samples_leaf = list(range(1, 6))

try_grid = [{'max_depth':depths,
             'max_leaf_nodes':num_leafs,
             'min_samples_leaf':samples_leaf}]

In [68]:
# define your Model using GridSearchCV
DTM = GridSearchCV(DecisionTreeClassifier(), param_grid=try_grid, cv=10)
DTM.fit(X_train, y_train)

In [69]:
my_model = DecisionTreeClassifier(max_leaf_nodes = DTM.best_params_['max_leaf_nodes'] , max_depth = DTM.best_params_['max_depth'], min_samples_leaf = DTM.best_params_['min_samples_leaf'])

In [70]:
my_model.fit(X_train, y_train)

In [71]:
final_test = my_model.predict(X_test)

In [72]:
# evaluate final test result
print("Accuracy:",accuracy_score(y_test, final_test, normalize=True, sample_weight=None))
print("Confusion Matrix:", '\n', confusion_matrix(y_test, final_test))

Accuracy: 0.967375366568915
Confusion Matrix: 
 [[2624   13]
 [  76   15]]
