In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep].astype(np.float64)

In [3]:
class bankrupt_prediction:
    """
    bankrupt_data - training set (with companies that bankrupted)
    non_bankrupt_data - training set (with companies that survived)
    inf_data - inferencing dataset
    inferencing - True means predict with inf_data, False means predict with test_data
    -----------------------------------------------------------------------------------
    
    Variables are:
        X1 = working capital / total assets; 
        X2 = retained earnings / total asssets
        X3 = EBIT / total assets
        X4 = total equity (book) / total assets
        X5 = net income / total assets
        X6 = total liabilities / total assets  
        
    """
    def __init__(self, bankrupt_data, non_bankrupt_data, inf_data, inferencing=False):
        self.bankrupt_data =  bankrupt_data
        self.non_bankrupt_data = non_bankrupt_data
        self.inf_data = inf_data
        self.inferencing = inferencing

        # Concatenate data
        data_full = pd.concat([bankrupt_data, non_bankrupt_data], ignore_index=True)

        # Add and scale variables
        data_full["X1"] = preprocessing.scale(data_full["WoCap"] / data_full["ToAsset"])
        data_full["X2"] = preprocessing.scale(data_full["CFOper"] / data_full["ToLia"])
        data_full["X3"] = preprocessing.scale(data_full["EBIT"] / data_full["ToAsset"])
        data_full["X4"] = preprocessing.scale(data_full["ToEqui"] / data_full["ToAsset"])
        data_full["X5"] = preprocessing.scale(data_full["NetInc"] / data_full["ToAsset"])
        data_full["X6"] = preprocessing.scale(data_full["ToLia"] / data_full["ToAsset"])

        inf_data["X1"] = preprocessing.scale(inf_data["WoCap"] / inf_data["ToAsset"])
        inf_data["X2"] = preprocessing.scale(inf_data["CFOper"] / inf_data["ToLia"])
        inf_data["X3"] = preprocessing.scale(inf_data["EBIT"] / inf_data["ToAsset"])
        inf_data["X4"] = preprocessing.scale(inf_data["ToEqui"] / inf_data["ToAsset"])
        inf_data["X5"] = preprocessing.scale(inf_data["NetInc"] / inf_data["ToAsset"])
        inf_data["X6"] = preprocessing.scale(inf_data["ToLia"] / inf_data["ToAsset"])

        # Split data for training and testing
        X = data_full[["X1", "X2", "X3", "X4","X5","X6"]]
        y = data_full['Status'] 
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.3, random_state=101)
        
        self.tmp_inf_data = inf_data[["Ticker", "X1", "X2", "X3", "X4","X5","X6"]]
        self.tmp_inf_data = clean_dataset(self.tmp_inf_data)
        self.tmp_inf_data.dropna(inplace=True)
        self.cleaned_inf_data = self.tmp_inf_data[["X1", "X2", "X3", "X4","X5","X6"]] # filter out Ticker column
    
    # Supported Vector Machine
    def svm(self):
        model = SVC()
        model.fit(self.X_train,self.y_train)
        predictions = model.predict(self.X_test)
        param_grid = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf']} 
        grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)
        grid.fit(self.X_train,self.y_train)
        grid.best_params_
        grid.best_estimator_
    
        if (self.inferencing):
            svm_pred = svn.predict(self.cleaned_inf_data)
            output_data = {'Ticker': self.tmp_inf_data['Ticker'], 'svm': svm_pred}
            results_df = pd.DataFrame(output_data)
            results_df.Ticker = results_df.Ticker.astype(int)
            
            print("Inferencing done")
            return results_df
            
        else:
            svm_pred = grid.predict(self.X_test)
            print("Confusion Matrix using Supported Vector Machine: \n", confusion_matrix(self.y_test,svm_pred))
            print("Classification Report using Supported Vector Machine: \n", classification_report(self.y_test,svm_pred))

        print("svm_pred")
        print(svm_pred)
    
    
    # Decision Tree
    def decision_tree(self):
        dtree = DecisionTreeClassifier()
        dtree.fit(self.X_train,self.y_train)
        
        if (self.inferencing):
            dtree_pred = dtree.predict(self.cleaned_inf_data)
            output_data = {'Ticker': self.tmp_inf_data['Ticker'], 'dtree': dtree_pred}
            results_df = pd.DataFrame(output_data)
            results_df.Ticker = results_df.Ticker.astype(int)
            
            print("Inferencing done")
            return results_df
        
        else:
            dtree_pred = dtree.predict(self.X_test)
            print("Confusion Matrix using Decision Tree: \n", confusion_matrix(self.y_test,dtree_pred))
            print("Classification Report using Decision Tree: \n", classification_report(self.y_test,dtree_pred))
            
        print("dtree_pred")
        print(dtree_pred)
    
    
    # Random Forest
    def random_forest(self):
        rf = RandomForestClassifier(n_estimators=100)
        rf.fit(self.X_train,self.y_train)
        
        if (self.inferencing):
            rf_pred = rf.predict(self.cleaned_inf_data)
            output_data = {'Ticker': self.tmp_inf_data['Ticker'], 'rf': rf_pred}
            results_df = pd.DataFrame(output_data)
            results_df.Ticker = results_df.Ticker.astype(int)
            
            print("Inferencing done")
            return results_df
        
        else:
            rf_pred = rf.predict(self.X_test)
            print("Confusion Matrix using Decision Tree: \n", confusion_matrix(self.y_test,rf_pred))
            print("Classification Report using Decision Tree: \n", classification_report(self.y_test,rf_pred))
            
        print("rf_pred")
        print(rf_pred)
    
    
    # K-nearest neighbor
    def knn(self):
        knn = KNeighborsClassifier(n_neighbors=1)
        knn.fit(self.X_train,self.y_train)
        
        if (self.inferencing):
            knn_pred = knn.predict(self.cleaned_inf_data)
            output_data = {'Ticker': self.tmp_inf_data['Ticker'], 'knn': knn_pred}
            results_df = pd.DataFrame(output_data)
            results_df.Ticker = results_df.Ticker.astype(int)
            
            print("Inferencing done")
            return results_df
        
        else:
            knn_pred = knn.predict(self.X_test)
            print("Confusion Matrix using Decision Tree: \n", confusion_matrix(self.y_test,knn_pred))
            print("Classification Report using Decision Tree: \n", classification_report(self.y_test,knn_pred))
            
        print("knn_pred")
        print(knn_pred)

### Bankruptcy prediction at t-1
Predict whether the company will go bankrupt in 1 year1. (0 = bankrupt, 1 = survive)

In [4]:
# load dataset and input file
bankrupt_t1 = pd.read_csv("./ucla-dataset/bankrupt_t1.csv").fillna(value = 1)
non_bankrupt_t1 = pd.read_csv("./ucla-dataset/non_bankrupt_t1.csv").fillna(value = 1)

input_df = pd.read_csv("input-data-ML.csv", dtype={'Ticker': 'int'})

# data pre-processing
input_df.dropna() # drop rows with empty entries
input_df.fillna(value = 1)
input_df = input_df[input_df['ToLia'] != 0]
input_df.drop(input_df[input_df['NetInc'] == '-'].index, inplace=True)
input_df["NetInc"] = pd.to_numeric(input_df["NetInc"], downcast="float")
# print(input_df.info())

# check if any col contain '-'
# tmp = input_df.isin(['-']).any()
# print(tmp)

### Evaluate performance with test set

In [5]:
model = bankrupt_prediction(bankrupt_t1, non_bankrupt_t1, input_df, False)

#t1_results = model.svm()
#t1_results = model.decision_tree()
#t1_results = model.random_forest()
t1_results = model.knn()

Confusion Matrix using Decision Tree: 
 [[7 2]
 [2 9]]
Classification Report using Decision Tree: 
               precision    recall  f1-score   support

           0       0.78      0.78      0.78         9
           1       0.82      0.82      0.82        11

    accuracy                           0.80        20
   macro avg       0.80      0.80      0.80        20
weighted avg       0.80      0.80      0.80        20

knn_pred
[0 1 1 0 1 0 0 1 1 1 0 0 0 1 1 1 0 1 1 0]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


### Inferencing

In [7]:
model.inferencing = True # set Inferencing as True

#t1_results = model.svm()
#t1_results = model.decision_tree()
#t1_results = model.random_forest()
t1_results = model.knn()
t1_results.to_csv("./results/t1_results-knn.csv", header='column_names', index=False) # save inferencing results

Inferencing done


### Bankruptcy prediction at t-2
Predict whether the company will go bankrupt in 2 years. (0 = bankrupt, 1 = survive)

In [8]:
# load dataset and input file
bankrupt_t2 = pd.read_csv("./ucla-dataset/bankrupt_t2.csv").fillna(value = 1)
non_bankrupt_t2 = pd.read_csv("./ucla-dataset/non_bankrupt_t2.csv").fillna(value = 1)

input_df = pd.read_csv("input-data-ML.csv", dtype={'Ticker': 'int'})

input_df.dropna() # drop rows with empty entries
input_df.fillna(value = 1)
input_df = input_df[input_df['ToLia'] != 0]

# data pre-processing
input_df.dropna() # drop rows with empty entries
input_df.fillna(value = 1)
input_df = input_df[input_df['ToLia'] != 0]
input_df.drop(input_df[input_df['NetInc'] == '-'].index, inplace=True)
input_df["NetInc"] = pd.to_numeric(input_df["NetInc"], downcast="float")
# print(input_df.info())

# m = input_df.isin(['-']).any()
# print(m)

# print(input_df.dtypes)

### Evaluate performance with test set

In [9]:
model = bankrupt_prediction(bankrupt_t2, non_bankrupt_t2, input_df, False)

#t2_results = model.svm()
#t2_results = model.decision_tree()
#t2_results = model.random_forest()
t2_results = model.knn()

Confusion Matrix using Decision Tree: 
 [[7 2]
 [3 8]]
Classification Report using Decision Tree: 
               precision    recall  f1-score   support

         0.0       0.70      0.78      0.74         9
         1.0       0.80      0.73      0.76        11

    accuracy                           0.75        20
   macro avg       0.75      0.75      0.75        20
weighted avg       0.76      0.75      0.75        20

knn_pred
[0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 0. 0. 0. 0. 1. 1. 0. 1. 1. 1.]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


### Inferencing

In [11]:
model.inferencing = True # set Inferencing as True

#t2_results = model.svm()
#t2_results = model.decision_tree()
#t2_results = model.random_forest()
t2_results = model.knn()
t2_results.to_csv("./results/t2_results-knn.csv", header='column_names', index=False) # save inferencing results

Inferencing done


### Bankruptcy prediction at t-3
Predict whether the company will go bankrupt in 3 years. (0 = bankrupt, 1 = survive)

In [12]:
# load dataset and input file
bankrupt_t3 = pd.read_csv("./ucla-dataset/bankrupt_t3.csv").fillna(value = 1)
non_bankrupt_t3 = pd.read_csv("./ucla-dataset/non_bankrupt_t3.csv").fillna(value = 1)

input_df = pd.read_csv("input-data-ML.csv", dtype={'Ticker': 'int'})

# data pre-processing
input_df.dropna() # drop rows with empty entries
input_df.fillna(value = 1)
input_df = input_df[input_df['ToLia'] != 0]
input_df.drop(input_df[input_df['NetInc'] == '-'].index, inplace=True)
input_df["NetInc"] = pd.to_numeric(input_df["NetInc"], downcast="float")
# print(input_df.info())
# m = input_df.isin(['-']).any()
# print(m)

# print(input_df.dtypes)

### Evaluate performance with test set

In [14]:
model = bankrupt_prediction(bankrupt_t3, non_bankrupt_t3, input_df, False)

#t3_results = model.svm()
#t3_results = model.decision_tree()
#t3_results = model.random_forest()
t3_results = model.knn()

Confusion Matrix using Decision Tree: 
 [[5 4]
 [4 7]]
Classification Report using Decision Tree: 
               precision    recall  f1-score   support

           0       0.56      0.56      0.56         9
           1       0.64      0.64      0.64        11

    accuracy                           0.60        20
   macro avg       0.60      0.60      0.60        20
weighted avg       0.60      0.60      0.60        20

knn_pred
[1 1 1 1 1 0 0 1 0 0 0 0 0 1 1 1 0 1 1 0]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


### Inferencing

In [15]:
model.inferencing = True # set Inferencing as True

#t3_results = model.svm()
#t3_results = model.decision_tree()
#t3_results = model.random_forest()
t3_results = model.knn()
t3_results.to_csv("./results/t3_results-knn.csv", header='column_names', index=False) # save inferencing results

Inferencing done
