In [159]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve, f1_score

In [160]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,1,0.370594,0.424389,0.40575,0.601457,0.601457,0.998969,0.796887,0.808809,0.302646,...,0.716845,0.009219,0.622879,0.601453,0.82789,0.290202,0.026601,0.56405,1,0.016469
1,1,0.464291,0.538214,0.51673,0.610235,0.610235,0.998946,0.79738,0.809301,0.303556,...,0.795297,0.008323,0.623652,0.610237,0.839969,0.283846,0.264577,0.570175,1,0.020794
2,1,0.426071,0.499019,0.472295,0.60145,0.601364,0.998857,0.796403,0.808388,0.302035,...,0.77467,0.040003,0.623841,0.601449,0.836774,0.290189,0.026555,0.563706,1,0.016474
3,1,0.399844,0.451265,0.457733,0.583541,0.583541,0.9987,0.796967,0.808966,0.30335,...,0.739555,0.003252,0.622929,0.583538,0.834697,0.281721,0.026697,0.564663,1,0.023982
4,1,0.465022,0.538432,0.522298,0.598783,0.598783,0.998973,0.797366,0.809304,0.303475,...,0.795016,0.003878,0.623521,0.598782,0.839973,0.278514,0.024752,0.575617,1,0.03549


In [161]:
df.shape

(6819, 96)

In [168]:
X = df.drop(['Bankrupt?'], axis=1)
y = df['Bankrupt?']
X = preprocessing.StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [169]:
lm = LogisticRegression(max_iter=10000)
lm.fit(X_train, y_train)

LogisticRegression(max_iter=10000)

In [170]:
pred = lm.predict(X_test)

acc = accuracy_score(y_test, pred)
matrix = confusion_matrix(y_test, pred)
rocaucscore = roc_auc_score(y_test, pred)
fpr, tpr, thresh = roc_curve(y_test, pred)
f1 = f1_score(y_test, pred)

In [171]:
print(f'Accuracy score: {acc:.4f}, F1 score: {f1:.4f}')

Accuracy score: 0.9619, F1 score: 0.2121


In [172]:
matrix

array([[1305,    9],
       [  43,    7]], dtype=int64)

In [173]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X, y)

In [174]:
X_resampled = preprocessing.StandardScaler().fit_transform(X_resampled)
X_train_res, X_test_res, y_train_res, y_test_res = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=1)

In [175]:
lm.fit(X_train_res, y_train_res)
pred = lm.predict(X_test_res)

acc = accuracy_score(y_test_res, pred)
f1 = f1_score(y_test_res, pred)

print(f'Accuracy score: {acc:.4f}, F1 score: {f1:.4f}')

Accuracy score: 0.8879, F1 score: 0.8880


In [176]:
pred = lm.predict(X_test)

acc = accuracy_score(y_test, pred)
f1 = f1_score(y_test, pred)

print(f'Accuracy score: {acc:.4f}, F1 score: {f1:.4f}')

Accuracy score: 0.5726, F1 score: 0.1439


In [177]:
y_train_pred = lm.predict(X_train)

acc = accuracy_score(y_train, y_train_pred)
f1 = f1_score(y_train, y_train_pred)

print(f'Accuracy score: {acc:.4f}, F1 score: {f1:.4f}')

Accuracy score: 0.5481, F1 score: 0.1200


In [178]:
from imblearn.over_sampling import ADASYN
ada = ADASYN()
X_resampled, y_resampled = ada.fit_resample(X, y)

In [179]:
X_resampled = preprocessing.StandardScaler().fit_transform(X_resampled)
X_train_res, X_test_res, y_train_res, y_test_res = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=1)

In [180]:
lm.fit(X_train_res, y_train_res)
pred = lm.predict(X_test_res)

acc = accuracy_score(y_test_res, pred)
f1 = f1_score(y_test_res, pred)

print(f'Accuracy score: {acc:.4f}, F1 score: {f1:.4f}')

Accuracy score: 0.8938, F1 score: 0.8960


In [181]:
pred = lm.predict(X_test)

acc = accuracy_score(y_test, pred)
f1 = f1_score(y_test, pred)

print(f'Accuracy score: {acc:.4f}, F1 score: {f1:.4f}')

Accuracy score: 0.5770, F1 score: 0.1452


In [182]:
confusion_matrix(y_test, pred)

array([[738, 576],
       [  1,  49]], dtype=int64)