In [15]:
import pandas as pd
import numpy as np
import joblib

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split

### Reading the data

In [17]:
RNDM_SEED = 12
df = pd.read_csv('cluster_1_train_data.csv')
df.head()

Unnamed: 0,Index,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,...,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability,Cluster_ID
0,2,0,0.57115,0.620148,0.624177,0.612275,0.612282,0.999163,0.797654,0.809533,...,0.006022,0.625306,0.612271,0.843294,0.278927,0.026816,0.565276,1,0.033034,1
1,9,0,0.522742,0.579536,0.575566,0.619114,0.619114,0.999075,0.797525,0.809443,...,0.002259,0.624266,0.619116,0.841468,0.275937,0.026791,0.565158,1,0.0873,1
2,13,0,0.590114,0.611753,0.634991,0.611403,0.611403,0.999206,0.797672,0.809558,...,0.009663,0.62283,0.611404,0.842483,0.275332,0.026791,0.565158,1,0.159042,1
3,14,0,0.501877,0.5477,0.562557,0.647782,0.647782,0.999076,0.797459,0.809416,...,0.000787,0.62435,0.647779,0.840367,0.27646,0.027138,0.566402,1,0.064648,1
4,16,0,0.480866,0.546991,0.529953,0.595994,0.594863,0.998926,0.797466,0.809373,...,0.007207,0.624691,0.595993,0.840578,0.276414,0.026797,0.565186,1,0.066087,1


In [18]:
y = df['Bankrupt?']
X = df.drop(columns = ['Index', 'Bankrupt?', 'Cluster_ID'])

print(f'shape of y: {y.shape}')
print(f'shape of X: {X.shape}')

shape of y: (1693,)
shape of X: (1693, 95)


### Preprocessing


In [19]:
# Lets scale the data  
median_imputer = SimpleImputer(strategy='median')
X_no_missing = median_imputer.fit_transform(X)

std_scaler = StandardScaler()
X_final = std_scaler.fit_transform(X_no_missing)

n_features = X.shape[1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = RNDM_SEED)

In [20]:
n_features

95

In [21]:
# Lets check how many values actually are 1
ones = y.value_counts()
ones

Bankrupt?
0    1690
1       3
Name: count, dtype: int64

### Building model

In [22]:
# Let's use random forest and logistic regression since they are robust to imbalance data
estimators = [
    ('rf', RandomForestClassifier(
        n_estimators=50, 
        random_state=42, 
        class_weight='balanced'
    )),
    ('lr', LogisticRegression(
        random_state=42, 
        class_weight='balanced', 
        max_iter=1000
    ))
]

clf = VotingClassifier(estimators=estimators, voting='soft')

clf.fit(X_train, y_train)
y_proba = clf.predict_proba(X_test)[:, 1]

In [23]:
def get_constrained_threshold(y_true, y_proba, constraint=0.20):
    """
    This function basically finds threshold while making sure
    that Recall is maximised and Positive Prediction Rate is low.
    """
    best_thresh = 0.5
    best_recall = -1
    
    # Iteratinf from high to low
    thresholds = np.arange(0.1, 0.99, 0.05)
    
    print(f"\n--- Tuning Threshold (Max Positive Rate: {constraint:.0%}) ---")

    for thresh in thresholds:
        y_pred = (y_proba >= thresh).astype(int)
        
        #Calculating positive rate
        pos_rate = np.mean(y_pred)
        
        if pos_rate >= constraint:
            continue 
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0,1]).ravel()
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        
        if recall > best_recall:
            best_recall = recall
            best_thresh = thresh

    print(f"  > Best Threshold: {best_thresh:.2f} (Recall: {best_recall:.3f})")
    return best_thresh

best_threshold = get_constrained_threshold(y_test, y_proba)


--- Tuning Threshold (Max Positive Rate: 20%) ---
  > Best Threshold: 0.25 (Recall: 0.000)


### Making predictions

In [24]:
y_final_pred = (y_proba >= best_threshold).astype(int)
tn, fp, fn, tp = confusion_matrix(y_test, y_final_pred, labels=[0,1]).ravel()

results_data = {
    "Subgroup_ID": 1,
    "Num_Companies": len(y),
    "Num_Bankrupt": y.sum(),
    "Stacking_TT_Correct": tp,
    "Stacking_TF_Missed": fn,
    "N_Features": n_features
}

results_df = pd.DataFrame([results_data])

print("\n=== TABLE 3 DATA ===")
print(results_df.T)


=== TABLE 3 DATA ===
                        0
Subgroup_ID             1
Num_Companies        1693
Num_Bankrupt            3
Stacking_TT_Correct     0
Stacking_TF_Missed      0
N_Features             95


### Saving the model

In [25]:
artifact = {
    'model': clf,
    'preprocessing': {
        'scaler': std_scaler,
        'imputer': median_imputer
    },
    'params': {
        'threshold': best_threshold,
        'columns': X.columns.tolist()
    }
}

# Saving
fname = f"member_model_cluster_1.joblib"
joblib.dump(artifact, fname)
print(f"Saved: {fname}")

Saved: member_model_cluster_1.joblib
