# Import libraries

In [10]:
import os

import pandas as pd

from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split

# Data Fetching

In [2]:
file_path= os.path.join("../data/data_bankruptcy.csv") # Filepath of CSV file

In [3]:
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,1,0.370594,0.424389,0.40575,0.601457,0.601457,0.998969,0.796887,0.808809,0.302646,...,0.716845,0.009219,0.622879,0.601453,0.82789,0.290202,0.026601,0.56405,1,0.016469
1,1,0.464291,0.538214,0.51673,0.610235,0.610235,0.998946,0.79738,0.809301,0.303556,...,0.795297,0.008323,0.623652,0.610237,0.839969,0.283846,0.264577,0.570175,1,0.020794
2,1,0.426071,0.499019,0.472295,0.60145,0.601364,0.998857,0.796403,0.808388,0.302035,...,0.77467,0.040003,0.623841,0.601449,0.836774,0.290189,0.026555,0.563706,1,0.016474
3,1,0.399844,0.451265,0.457733,0.583541,0.583541,0.9987,0.796967,0.808966,0.30335,...,0.739555,0.003252,0.622929,0.583538,0.834697,0.281721,0.026697,0.564663,1,0.023982
4,1,0.465022,0.538432,0.522298,0.598783,0.598783,0.998973,0.797366,0.809304,0.303475,...,0.795016,0.003878,0.623521,0.598782,0.839973,0.278514,0.024752,0.575617,1,0.03549


# Data Cleaning

In [13]:
# drop the space before title of columns
df.columns = [c.replace(" ","", 1) if c.startswith(' ') else c for c in df.columns]

In [14]:
# drop 'Net Income Flag' feature
df = df.drop('Net Income Flag', axis=1)

# Feature Selections

In [15]:
X = df.iloc[:, 1:]  # List of features which are required for model training
y = df['Bankrupt?']  # Target feature for prediction.

# ML Modeling

## TPOT Classifier

### Data Splitting

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25)

### Preprocessing pipelines

In [20]:
pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5, verbosity=2, scoring='recall')
pipeline_optimizer.fit(X_train, y_train)

Optimization Progress:   0%|          | 0/60 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.3653409090909091

Generation 2 - Current best internal CV score: 0.7731060606060607

Generation 3 - Current best internal CV score: 0.8465909090909092

Generation 4 - Current best internal CV score: 0.8465909090909092

Generation 5 - Current best internal CV score: 0.9450757575757576

Best pipeline: GaussianNB(input_matrix)


TPOTClassifier(generations=5, population_size=10, scoring='recall', verbosity=2)

### Performance metrics

In [21]:
print(pipeline_optimizer.score(X_test, y_test))

0.9642857142857143


### Model export

In [23]:
pipeline_optimizer.export('../models/tpot_company_bankruptcy.py')

### Model Exploration