In [1]:
#Imports for artificial neural network
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree


np.random.seed(42)  #Seed is necessary to ensure consistent replication of results

df = pd.read_csv('data.csv')

In [2]:
#Now build decision tree classifiers
def data_info():
    df.info()
    null_counts = df.isnull().sum()
    null_columns = null_counts[null_counts > 0]
    return df.describe().transpose()

df.isnull().sum().sum()

Y=df["Bankrupt?"]
X=df.drop(["Bankrupt?"],axis=1)

from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
rf.fit(X,Y)

importance_features = rf.feature_importances_
important_features_df = pd.DataFrame({'Feature': X.columns, 'Importance': importance_features})
important_features_df_desc =important_features_df.sort_values(by='Importance', ascending=False)
important_features_df_asc =important_features_df.sort_values(by='Importance', ascending=True)
important_features_df_desc.head(40)
required = important_features_df_desc[important_features_df_desc.Importance>0.0105]
required_features = df[required.Feature]  
correlation_matrix = df.corr()
target_corr = correlation_matrix["Bankrupt?"].abs().sort_values(ascending=False)
target_corr = target_corr.drop("Bankrupt?")
selected_features = target_corr[target_corr>=0.1]
selected_columns = list(selected_features.index)
df_selected = df[selected_columns]
actual_corr = correlation_matrix["Bankrupt?"][selected_features.index]
positive_corr = actual_corr[actual_corr >= 0].sort_values(ascending=False)
negative_corr = actual_corr[actual_corr < 0].sort_values(ascending=True)
sorted_correlations = pd.concat([positive_corr, negative_corr])

import seaborn as sns

selexted_X = df_selected
selected_y = df["Bankrupt?"]
X_train,X_test,y_train,y_test = train_test_split(selexted_X,selected_y,test_size = 0.2,random_state=42)

from sklearn.model_selection import GridSearchCV
params = {
    'max_depth': [2, 3, 5, 10, 20],
    'min_samples_leaf': [10, 15, 20, 50, 100],
    'criterion': ["gini", "entropy"]
}

from sklearn.tree import DecisionTreeClassifier

dt_classifier = DecisionTreeClassifier(max_depth =5,min_samples_split = 20,random_state=42)
grid_search_dt = GridSearchCV(estimator=dt_classifier, 
                           param_grid=params, 
                           cv=4, n_jobs=-1, verbose=1, scoring = "accuracy")

grid_search_dt.fit(X_train,y_train)

grid_search_dt.best_estimator_

best_dt = grid_search_dt.best_estimator_

dt_train_pred = best_dt.predict(X_train)
dt_test_pred = best_dt.predict(X_test)

tree_info = best_dt.tree_
num_nodes = tree_info.node_count
num_leaves = tree_info.n_leaves
num_decision = num_nodes - num_leaves

rf_classifier = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(estimator=rf_classifier, 
                           param_grid=params, 
                           cv=4, n_jobs=-1, verbose=1, scoring = "accuracy")

grid_search_rf.fit(X_train,y_train)


grid_search_rf.best_estimator_

best_rf = grid_search_rf.best_estimator_

y_train_pred = best_rf.predict(X_train)
y_test_pred = best_rf.predict(X_test)

#Classifier 3
classifier1 = pd.DataFrame(y_train_pred, index=X_train.index, columns=['Random Forest'])
#np.set_printoptions(threshold=np.inf)

#print('Classifier 1: ', classifier1)
#print('Classifier 3: ', classifier3)

Fitting 4 folds for each of 50 candidates, totalling 200 fits
Fitting 4 folds for each of 50 candidates, totalling 200 fits


In [3]:
#Additional imports for linear regression model
import statsmodels.api as sm
from tqdm import tqdm
from sklearn.metrics import mean_squared_error

target = 'Bankrupt?'
predictors = [col for col in df.columns if col != target]

#forward selection
def forward_selection (df, target, predictors):
    features = []
    best_features = []
    while True: 
        features_left = list(set(predictors) - set(features))
        new_val = pd.Series(index = features_left)
        for new_col in tqdm(features_left, desc="Processing Columns"):
            model = sm.OLS(df[target], sm.add_constant(df[features + [new_col]])).fit()
            new_val[new_col] = model.pvalues[new_col]
        min_p_value = new_val.min()
        if min_p_value < 0.05:
            best_feature = new_val.idxmin()
            features.append(best_feature)
            best_features.append(best_feature)
        else:
            break
        return best_features

best_predictors = forward_selection(df, target, list(X_train.columns))

final_model = sm.OLS(y_train, sm.add_constant(X_train[best_predictors])).fit()

#find accuracy 
linRegPred = final_model.predict(sm.add_constant(X_train[best_predictors]))

#Classifier 2
classifier2 = pd.DataFrame(linRegPred, columns=['linear Regression'])

assert classifier1.index.equals(classifier2.index), "Indices do not match!"



  new_val = pd.Series(index = features_left)
Processing Columns: 100%|█████████████████████████████████████████████████████████████| 32/32 [00:00<00:00, 323.24it/s]


In [4]:
#Now we get the classifier for logistic regression

In [5]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Load the dataset
#data = pd.read_csv("/Users/ambrasina/Downloads/data3.csv")

# Define the features (X) and the target (y)
#X = data.drop(columns=['Bankrupt?'])
#y = data['Bankrupt?']

# Split the dataset into training and testing sets
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the logistic regression model with class_weight='balanced'
model = LogisticRegression(max_iter=1000, class_weight='balanced')

# Train the model
model.fit(X_train, y_train)

# Predict the target on the test set
y_pred = model.predict(X_test)

classifier3 = pd.DataFrame(model.predict(X_train), index=X_train.index, columns=['Logistic Regression'])

print('classifier1: ', classifier1)
print('classifier2: ', classifier2)
print('classifier3: ', classifier3)

assert classifier3.index.equals(classifier2.index), "Indices do not match!"

combined_df = pd.concat([classifier1, classifier2, classifier3], axis=1)

print(combined_df.head())

classifier1:        Random Forest
3759              0
1782              0
5013              0
5412              0
3066              0
...             ...
3772              0
5191              0
5226              0
5390              0
860               0

[5455 rows x 1 columns]
classifier2:        linear Regression
3759           0.039881
1782           0.027167
5013           0.028720
5412           0.028267
3066           0.035477
...                 ...
3772          -0.006723
5191          -0.016226
5226           0.024276
5390          -0.032160
860           -0.024261

[5455 rows x 1 columns]
classifier3:        Logistic Regression
3759                    0
1782                    0
5013                    0
5412                    0
3066                    0
...                   ...
3772                    0
5191                    0
5226                    0
5390                    0
860                     0

[5455 rows x 1 columns]
      Random Forest  linear Regression  Log