# Model 1: Stacking Method

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

clustered_data_path = "data/clustered_data.csv"
clustered_data = pd.read_csv(clustered_data_path)

X_cluster = clustered_data.drop(['Bankrupt?', 'Cluster_ID'], axis=1)
y_cluster = clustered_data['Cluster_ID']

X_train, X_test, y_train, y_test = train_test_split(X_cluster, y_cluster, test_size=0.2, random_state=42)

# train
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=99)
rf_classifier.fit(X_train, y_train)

# predict
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Classifier acuracy: {accuracy}")

feature_importances = rf_classifier.feature_importances_
important_features = X_cluster.columns[np.argsort(feature_importances)[::-1]]
print("Important Features:")
print(important_features)

# estimated
estimated_bankrupt_count = np.sum(y_pred == 1)
estimated_non_bankrupt_count = np.sum(y_pred == 0)
# actual
actual_bankrupt_count = np.sum(y_test == 1)
actual_non_bankrupt_count = np.sum(y_test == 0)
print(f"num estimated bankrupt: {estimated_bankrupt_count}")
print(f"num estimated non-bankrupt: {estimated_non_bankrupt_count}")
print(f"num actual bankrupt: {actual_bankrupt_count}")
print(f"num actual non-bankrupt: {actual_non_bankrupt_count}")

accuracy = accuracy_score(y_test, y_pred)
print(f"accuracy: {accuracy}")
TT = np.sum((y_test == 1) & (y_pred == 1))
FT = np.sum((y_test == 0) & (y_pred == 1))
TF = np.sum((y_test == 1) & (y_pred == 0))
FF = np.sum((y_test == 0) & (y_pred == 0))
print(f"TT: {TT}")
print(f"FT: {FT}")
print(f"TF: {TF}")
print(f"FF: {FF}")

Random Forest Classifier acuracy: 0.9070567986230637
Important Features:
Index([' Current Liability to Assets', ' Equity to Liability',
       ' Persistent EPS in the Last Four Seasons',
       ' Net profit before tax/Paid-in capital',
       ' Per Share Net profit before tax (Yuan ¥)', ' Debt ratio %',
       ' Net Income to Total Assets', ' Net worth/Assets',
       ' Operating Profit Per Share (Yuan ¥)',
       ' Operating profit/Paid-in capital',
       ' ROA(A) before interest and % after tax', ' Cash/Total Assets',
       ' Quick Assets/Total Assets', ' Cash/Current Liability',
       ' Total Asset Turnover',
       ' ROA(B) before interest and depreciation after tax',
       ' ROA(C) before interest and depreciation before interest',
       ' Fixed Assets Turnover Frequency', ' Working Capital to Total Assets',
       ' Cash Flow to Liability', ' Net Value Per Share (B)',
       ' Net Value Per Share (A)', ' Cash Turnover Rate',
       ' Borrowing dependency', ' Net Value Per Sh

| data set | FF(FT) | TT(TF) | accuracy | accuracy score |
| -------- | -------| ------ | -------- | -------------- |
| train    | 0(0)   | 0(0)   | 0.00     | 0.00           |
| test     | 0(0)   | 0(0)   | 0.00     | 0.00           |

In [2]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

data_path = 'data/filtered_train_data.csv'
data = pd.read_csv(data_path)
print(f"original num features: {data.shape[1] - 1}")
selected_features = [' Current Liability to Assets',
       ' Persistent EPS in the Last Four Seasons',
       ' Per Share Net profit before tax (Yuan ¥)', ' Debt ratio %',
       ' Net Income to Total Assets', ' Net worth/Assets',
       ' Operating Profit Per Share (Yuan ¥)',
       ' Operating profit/Paid-in capital',
       ' Cash/Total Assets',
       ' Quick Assets/Total Assets', ' Cash/Current Liability',
       ' Total Asset Turnover',
       ' ROA(C) before interest and depreciation before interest',
       ' Fixed Assets Turnover Frequency', ' Working Capital to Total Assets',
       ' Cash Flow to Liability', ' Net Value Per Share (B)',
       ' Net Value Per Share (A)',
       ' Borrowing dependency', ' Net Value Per Share (C)',
       ' Cash Flow to Total Assets', ' Operating profit per person',
       ' Tax rate (A)',' Total expense/Assets',
       ' CFO to Assets', ' Long-term fund suitability ratio (A)',
       ' Revenue per person',
       ' Accounts Receivable Turnover']

"""
selected_features = [' Current Liability to Assets', ' Equity to Liability',
       ' Persistent EPS in the Last Four Seasons',
       ' Net profit before tax/Paid-in capital',
       ' Per Share Net profit before tax (Yuan ¥)', ' Debt ratio %',
       ' Net Income to Total Assets', ' Net worth/Assets',
       ' Operating Profit Per Share (Yuan ¥)',
       ' Operating profit/Paid-in capital',
       ' ROA(A) before interest and % after tax', ' Cash/Total Assets',
       ' Quick Assets/Total Assets', ' Cash/Current Liability',
       ' Total Asset Turnover',
       ' ROA(B) before interest and depreciation after tax',
       ' ROA(C) before interest and depreciation before interest',
       ' Fixed Assets Turnover Frequency', ' Working Capital to Total Assets',
       ' Cash Flow to Liability', ' Net Value Per Share (B)',
       ' Net Value Per Share (A)', ' Cash Turnover Rate',
       ' Borrowing dependency', ' Net Value Per Share (C)',
       ' Cash Flow to Total Assets', ' Operating profit per person',
       ' Tax rate (A)', ' Fixed Assets to Assets', ' Total expense/Assets',
       ' CFO to Assets', ' Long-term fund suitability ratio (A)',
       ' Revenue per person', ' Equity to Long-term Liability',
       ' Accounts Receivable Turnover']
"""

X = data.drop('Bankrupt?', axis=1)#[selected_features]
y = data['Bankrupt?']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# base models
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
svm_model = SVC(kernel='rbf', probability=True, random_state=42)

rf_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)
svm_model.fit(X_train, y_train)


from sklearn.linear_model import LogisticRegression

# predictions from base models
rf_pred = rf_model.predict_proba(X_train)[:, 1]
gb_pred = gb_model.predict_proba(X_train)[:, 1]
svm_pred = svm_model.predict_proba(X_train)[:, 1]

# stack predictions as meta-features
meta_features = np.column_stack((rf_pred, gb_pred, svm_pred))

# train meta-model
meta_model = LogisticRegression()
meta_model.fit(meta_features, y_train)

# predictions from base models (on test set)
rf_pred_test = rf_model.predict_proba(X_test)[:, 1]
gb_pred_test = gb_model.predict_proba(X_test)[:, 1]
svm_pred_test = svm_model.predict_proba(X_test)[:, 1]

meta_features_test = np.column_stack((rf_pred_test, gb_pred_test, svm_pred_test))

stacking_pred = meta_model.predict(meta_features_test)

accuracy = accuracy_score(y_test, stacking_pred)
print(f"Stacking model accuracy: {accuracy}")

# estimated
estimated_bankrupt_count = np.sum(stacking_pred == 1)
estimated_non_bankrupt_count = np.sum(stacking_pred == 0)
# actual
actual_bankrupt_count = np.sum(y_test == 1)
actual_non_bankrupt_count = np.sum(y_test == 0)
print(f"num estimated bankrupt: {estimated_bankrupt_count}")
print(f"num estimated non-bankrupt: {estimated_non_bankrupt_count}")
print(f"num actual bankrupt: {actual_bankrupt_count}")
print(f"num actual non-bankrupt: {actual_non_bankrupt_count}")
print(f"num features: {len(selected_features)}")

TT = np.sum((y_test == 1) & (stacking_pred == 1))
FT = np.sum((y_test == 0) & (stacking_pred == 1))
TF = np.sum((y_test == 1) & (stacking_pred == 0))
FF = np.sum((y_test == 0) & (stacking_pred == 0))
print(f"TT: {TT}")
print(f"FT: {FT}")
print(f"TF: {TF}")
print(f"FF: {FF}")


original num features: 35
Stacking model accuracy: 0.963855421686747
num estimated bankrupt: 20
num estimated non-bankrupt: 1142
num actual bankrupt: 38
num actual non-bankrupt: 1124
num features: 28
TT: 8
FT: 12
TF: 30
FF: 1112


# Model 2: k-fold Cross Validation

In [3]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np

data_path = 'data/filtered_train_data.csv'
data = pd.read_csv(data_path)

X = data.drop('Bankrupt?', axis=1)
y = data['Bankrupt?']

k_folds = 5
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

accuracy_scores = []

model = RandomForestClassifier(n_estimators=100, random_state=42)

for train_index, test_index in skf.split(X, y):
    # split into train/test for each fold
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    fold_accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(fold_accuracy)
    
mean_accuracy = np.mean(accuracy_scores)
print("Avg accuracy:", mean_accuracy)

TT = np.sum((y_test == 1) & (y_pred == 1))
FT = np.sum((y_test == 0) & (y_pred == 1))
TF = np.sum((y_test == 1) & (y_pred == 0))
FF = np.sum((y_test == 0) & (y_pred == 0))
print(f"TT: {TT}")
print(f"FT: {FT}")
print(f"TF: {TF}")
print(f"FF: {FF}")


Avg accuracy: 0.9677981027098428
TT: 7
FT: 6
TF: 33
FF: 1115
