In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

In [13]:
df = pd.read_csv("data.csv").copy()

In [14]:
corr = df.corrwith(df["Bankrupt?"])
abs_corr = np.abs(corr)  # Take absolute values
sorted_indices = np.argsort(-abs_corr)  # Sort in descending order of absolute values

sorted_corr = corr[sorted_indices]
sorted_features = df.columns[sorted_indices]

num_features_to_keep = int(len(sorted_features) * 0.5)  # Keep top 50%

sorted_corr_top = sorted_corr[:num_features_to_keep]
sorted_features_top = sorted_features[:num_features_to_keep]

result_df = pd.DataFrame({"Feature": sorted_features_top, "Correlation": sorted_corr_top})
new_df = df[sorted_features_top]
result_df

Unnamed: 0,Feature,Correlation
Bankrupt?,Bankrupt?,1.0
Net Income to Total Assets,Net Income to Total Assets,-0.315457
ROA(A) before interest and % after tax,ROA(A) before interest and % after tax,-0.282941
ROA(B) before interest and depreciation after tax,ROA(B) before interest and depreciation after...,-0.273051
ROA(C) before interest and depreciation before interest,ROA(C) before interest and depreciation befor...,-0.260807
Net worth/Assets,Net worth/Assets,-0.250161
Debt ratio %,Debt ratio %,0.250161
Persistent EPS in the Last Four Seasons,Persistent EPS in the Last Four Seasons,-0.21956
Retained Earnings to Total Assets,Retained Earnings to Total Assets,-0.217779
Net profit before tax/Paid-in capital,Net profit before tax/Paid-in capital,-0.207857


In [15]:
# Define attributes and target variable
features = sorted_features_top
target = "Bankrupt?"

X = new_df[features]
y = df[target]

In [16]:
# Split the data into training set and temporary set
X_train_temp, X_test, y_train_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Split the temporary set into validation set and final training set
X_train, X_val, y_train, y_val = train_test_split(X_train_temp, y_train_temp, test_size=0.25, random_state=42)

In [17]:
# Build the naive bayes model
naive_bayes = GaussianNB()
naive_bayes.fit(X_train, y_train)

# Predict the labels for the validation data
y_val_pred = naive_bayes.predict(X_val)

# Show the results on validation set
print("Confusion Matrix:", "\n", confusion_matrix(y_val, y_val_pred))
print("Classification Report:","\n", classification_report(y_val, y_val_pred))

Confusion Matrix: 
 [[   0 1323]
 [   0   41]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.00      0.00      0.00      1323
           1       0.03      1.00      0.06        41

    accuracy                           0.03      1364
   macro avg       0.02      0.50      0.03      1364
weighted avg       0.00      0.03      0.00      1364



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
# Feature selection and further remove unless attributes to improve the performance
updated_num_features_to_keep = int(len(sorted_features) * 0.3)  # Keep top 30%

updated_sorted_corr_top = sorted_corr[:updated_num_features_to_keep]
updated_sorted_features_top = sorted_features[:updated_num_features_to_keep]

updated_result_df = pd.DataFrame({"Feature": updated_sorted_features_top, "Correlation": updated_sorted_corr_top})
updated_new_df = df[updated_sorted_features_top]
updated_result_df

Unnamed: 0,Feature,Correlation
Bankrupt?,Bankrupt?,1.0
Net Income to Total Assets,Net Income to Total Assets,-0.315457
ROA(A) before interest and % after tax,ROA(A) before interest and % after tax,-0.282941
ROA(B) before interest and depreciation after tax,ROA(B) before interest and depreciation after...,-0.273051
ROA(C) before interest and depreciation before interest,ROA(C) before interest and depreciation befor...,-0.260807
Net worth/Assets,Net worth/Assets,-0.250161
Debt ratio %,Debt ratio %,0.250161
Persistent EPS in the Last Four Seasons,Persistent EPS in the Last Four Seasons,-0.21956
Retained Earnings to Total Assets,Retained Earnings to Total Assets,-0.217779
Net profit before tax/Paid-in capital,Net profit before tax/Paid-in capital,-0.207857


In [19]:
# Define updated attributes and target variable
updated_features = updated_sorted_features_top
target = "Bankrupt?"

updated_X = updated_new_df[updated_features]
updated_y = df[target]

In [20]:
# Split the data into training set and temporary set
updated_X_train_temp, updated_X_test, updated_y_train_temp, updated_y_test = train_test_split(updated_X, updated_y, test_size=0.2, random_state=42)

# Split the temporary set into validation set and final training set
updated_X_train, updated_X_val, updated_y_train, updated_y_val = train_test_split(updated_X_train_temp, updated_y_train_temp, test_size=0.25, random_state=42)

In [21]:
# Build the updated naive bayes model
naive_bayes = GaussianNB()
naive_bayes.fit(updated_X_train, updated_y_train)

# Predict the labels for the validation data
updated_y_val_pred = naive_bayes.predict(updated_X_val)

# Show the results on validation set
print("Confusion Matrix:", "\n", confusion_matrix(updated_y_val, updated_y_val_pred))
print("Classification Report:","\n", classification_report(updated_y_val, updated_y_val_pred))

Confusion Matrix: 
 [[1323    0]
 [   0   41]]
Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1323
           1       1.00      1.00      1.00        41

    accuracy                           1.00      1364
   macro avg       1.00      1.00      1.00      1364
weighted avg       1.00      1.00      1.00      1364



In [29]:
updated_y_test_pred = naive_bayes.predict(updated_X_test)
print("Confusion Matrix:", "\n", confusion_matrix(updated_y_test, updated_y_test_pred))
print("Classification Report:","\n", classification_report(updated_y_test, updated_y_test_pred))

Confusion Matrix: 
 [[1313    0]
 [   0   51]]
Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1313
           1       1.00      1.00      1.00        51

    accuracy                           1.00      1364
   macro avg       1.00      1.00      1.00      1364
weighted avg       1.00      1.00      1.00      1364



In [25]:
# Build the logistic regression model
try_grid={"C":np.logspace(-4,4,40), "penalty":["l1","l2"]}
logicReg = LogisticRegression(solver='liblinear', max_iter=1000)

# define your Model using GridSearchCV
lr_gridsearch = GridSearchCV(logicReg, try_grid, cv=10, scoring='accuracy')
lr_gridsearch.fit(updated_X_train, updated_y_train)
lr_gridsearch.best_params_

{'C': 0.018047217668271703, 'penalty': 'l1'}

In [26]:
# Fit the training data into the logistic model
log_model = LogisticRegression(solver='liblinear', max_iter=1000, C=0.018047217668271703, penalty="l1")
log_model.fit(updated_X_train, updated_y_train)

In [27]:
updated_y_val_pred= log_model.predict(updated_X_val)
print("Confusion Matrix:", "\n", confusion_matrix(updated_y_val, updated_y_val_pred))
print("Classification Report:","\n", classification_report(updated_y_val, updated_y_val_pred))

Confusion Matrix: 
 [[1323    0]
 [   0   41]]
Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1323
           1       1.00      1.00      1.00        41

    accuracy                           1.00      1364
   macro avg       1.00      1.00      1.00      1364
weighted avg       1.00      1.00      1.00      1364



In [28]:
updated_y_test_pred= log_model.predict(updated_X_test)
print("Confusion Matrix:", "\n", confusion_matrix(updated_y_test, updated_y_test_pred))
print("Classification Report:","\n", classification_report(updated_y_test, updated_y_test_pred))

Confusion Matrix: 
 [[1313    0]
 [   0   51]]
Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1313
           1       1.00      1.00      1.00        51

    accuracy                           1.00      1364
   macro avg       1.00      1.00      1.00      1364
weighted avg       1.00      1.00      1.00      1364



In [None]:
# Define the cost of false negatives (FN) and false positives (FP)
cost_fn = 10_000_000
cost_fp = 1_000_000

# Calculate the F1 scores at different thresholds
thresholds = np.arange(0, 1.0, 0.05)
total_cost = []
for threshold in thresholds:
    # Convert probabilities to binary predictions based on the threshold
    predicted_labels = np.where(updated_y_val_pred >= threshold, 1, 0)

    # Calculate the number of false positives and false negatives
    fp = ((y_val == 0) & (predicted_labels == 1)).sum()
    fn = ((y_val == 1) & (predicted_labels == 0)).sum()

    # Calculate the costs
    cost = fp * cost_fp + fn * cost_fn
    total_cost.append(cost)

# Find the threshold with the minimum cost
optimal_threshold = thresholds[np.argmin(total_cost)]
min_cost = np.min(total_cost)

print("Optimal Threshold:", optimal_threshold)
print("Minimum Cost:", min_cost)