## Finding fast-growing firms 2025

In [None]:
import os
import datetime
import pandas as pd
import numpy as np
import sys
import patsy
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
import sklearn.metrics as metrics
from sklearn.metrics import brier_score_loss, roc_curve, auc, confusion_matrix, roc_auc_score, mean_squared_error
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

## Data

In [None]:
# Relative location of the input CSV: <parent directory>/data/bisnode_firms_clean.csv.
# os.path.join inserts the path separator appropriate for the current operating system.
path = os.path.join(os.path.join(os.pardir, 'data'), 'bisnode_firms_clean.csv')
path

In [None]:
# Load the CSV located at `path` into the DataFrame `df`.
df = pd.read_csv(path)

In [None]:
# Show the first rows of `df` (DataFrame.head defaults to 5 rows).
df.head()

In [None]:
# List every column name of `df` as a plain Python list.
df.columns.tolist()

## Define helper functions

In [None]:
# ROC curve
def plot_roc_curve(y_true, probs):
    """Draw the ROC curve for a set of predictions and label it with its AUC.

    Parameters
    ----------
    y_true : array-like
        Observed outcomes, passed unchanged to ``roc_curve`` and
        ``roc_auc_score``.
    probs : array-like
        Predicted scores, passed unchanged to the same two functions.

    Returns
    -------
    tuple
        ``(fig, ax)``: the matplotlib figure and axes that hold the plot.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, probs)
    # Named `auc_value` so the local does not hide the `auc` function that the
    # file imports from sklearn.metrics.
    auc_value = roc_auc_score(y_true, probs)

    fig, ax = plt.subplots(figsize=(6, 6))

    # The model's curve first, then the dotted diagonal from (0, 0) to (1, 1).
    ax.plot(
        false_pos_rate,
        true_pos_rate,
        color='k',
        linewidth=0.8,
        label=f"AUC = {auc_value:.3f}",
    )
    ax.plot([0, 1], [0, 1], linestyle=':', color='black')

    # Both axes show rates, so both are clipped to the unit interval.
    for clip_axis in (ax.set_xlim, ax.set_ylim):
        clip_axis(0, 1)
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")

    ax.legend(frameon=False)
    ax.grid(True, alpha=0.2)
    plt.tight_layout()
    return fig, ax


In [None]:
# Expected loss
def expected_loss(y_true, y_pred, fp_cost, fn_cost):
    """Return the total misclassification cost of a set of binary predictions.

    Observations are paired by position. Both inputs are converted to NumPy
    arrays first, so pandas objects are never aligned on their index labels
    (which would silently mis-count when the two indexes differ), and plain
    Python lists are accepted as well.

    Parameters
    ----------
    y_true : array-like
        Observed outcomes; 0 and 1 are the values that are counted.
    y_pred : array-like
        Predicted classes, same length and order as ``y_true``.
    fp_cost : number
        Cost charged per false positive (predicted 1, observed 0).
    fn_cost : number
        Cost charged per false negative (predicted 0, observed 1).

    Returns
    -------
    number
        ``fp_cost * #false positives + fn_cost * #false negatives``. This is a
        sum over observations, not a per-observation average.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)

    false_positives = ((predicted == 1) & (actual == 0)).sum()
    false_negatives = ((predicted == 0) & (actual == 1)).sum()
    return fp_cost * false_positives + fn_cost * false_negatives


In [None]:
# Finding the optimal threshold
def find_optimal_threshold(y_true, probs, fp_cost, fn_cost, grid=None):
    """Return the classification threshold with the lowest loss on the given data.

    Each candidate threshold ``t`` turns ``probs`` into classes via
    ``probs >= t``; the resulting loss is computed with ``expected_loss``.

    Parameters
    ----------
    y_true : array-like
        Observed binary outcomes.
    probs : array-like
        Predicted probabilities, same length and order as ``y_true``.
    fp_cost, fn_cost : number
        Costs per false positive / false negative, forwarded to
        ``expected_loss``.
    grid : sequence of float, optional
        Candidate thresholds. Defaults to 91 evenly spaced values from 0.05
        to 0.95.

    Returns
    -------
    tuple
        ``(best_threshold, best_loss)``. On ties the candidate that appears
        first in ``grid`` is returned (``np.argmin`` picks the first minimum).

    Raises
    ------
    ValueError
        If ``grid`` is empty (raised by ``np.argmin``).
    """
    if grid is None:
        grid = np.linspace(0.05, 0.95, 91)

    # Convert once, outside the loop. Working on plain arrays pairs outcomes
    # and probabilities by position even when they arrive as pandas objects
    # with different index labels, and lets list inputs support `>=`.
    actual = np.asarray(y_true)
    scores = np.asarray(probs)

    losses = [
        expected_loss(actual, (scores >= t).astype(int), fp_cost, fn_cost)
        for t in grid
    ]

    best_idx = np.argmin(losses)
    return grid[best_idx], losses[best_idx]


In [None]:
# CV expected loss
def cv_expected_loss(model, X, y, fp_cost, fn_cost, k=5):
    """Average the per-fold minimum loss of ``model`` over a shuffled k-fold split.

    For every fold the model is fitted on the training rows, positive-class
    probabilities (column 1 of ``predict_proba``) are produced for the held-out
    rows, and ``find_optimal_threshold`` picks the loss-minimising threshold on
    those same held-out rows.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict_proba``. It is refitted in place
        on every fold, so it is left fitted on the last training split.
    X, y : pandas objects
        Features and outcomes; both are indexed with ``.iloc``.
    fp_cost, fn_cost : number
        Costs forwarded to ``find_optimal_threshold``.
    k : int, default 5
        Number of folds. The split is shuffled with ``random_state=1``.

    Returns
    -------
    float
        Mean over folds of the lowest loss found in each fold.
    """
    splitter = KFold(n_splits=k, shuffle=True, random_state=1)

    def _fold_loss(fit_rows, eval_rows):
        # Fit on the training rows, then score the held-out rows of this fold.
        model.fit(X.iloc[fit_rows], y.iloc[fit_rows])
        held_out_probs = model.predict_proba(X.iloc[eval_rows])[:, 1]
        _, lowest_loss = find_optimal_threshold(
            y.iloc[eval_rows], held_out_probs, fp_cost, fn_cost
        )
        return lowest_loss

    return np.mean(
        [_fold_loss(fit_rows, eval_rows) for fit_rows, eval_rows in splitter.split(X)]
    )


In [None]:
# Confusion matrix
def confusion_table(y_true, probs, threshold):
    """Cross-tabulate observed outcomes against thresholded predictions.

    Observations are paired by position. Both inputs are converted to NumPy
    arrays before the cross-tabulation, because ``pd.crosstab`` aligns pandas
    inputs on their index labels and would drop or mismatch rows when
    ``y_true`` and ``probs`` carry different indexes.

    Parameters
    ----------
    y_true : array-like
        Observed outcomes.
    probs : array-like
        Predicted probabilities, same length and order as ``y_true``.
    threshold : float
        An observation is predicted positive when ``probs >= threshold``.

    Returns
    -------
    pandas.DataFrame
        Counts with rows labelled 'Actual fast growth' and columns labelled
        'Predicted fast growth'. Only classes that actually occur appear as
        rows/columns.
    """
    actual = np.asarray(y_true)
    predicted = (np.asarray(probs) >= threshold).astype(int)
    return pd.crosstab(
        actual, predicted,
        rownames=['Actual fast growth'],
        colnames=['Predicted fast growth']
    )


## Define variable sets 

In [None]:
# Named groups of column names. How the groups are combined is not shown in
# this part of the notebook.
rawvars = [
    "curr_assets",
    "curr_liab",
    "extra_exp",
    "extra_inc",
    "extra_profit_loss",
    "fixed_assets",
    "inc_bef_tax",
    "intang_assets",
    "inventories",
    "liq_assets",
    "material_exp",
    "personnel_exp",
    "profit_loss_year",
    "sales",
    "share_eq",
    "subscribed_cap",
]

qualityvars = [
    "balsheet_flag",
    "balsheet_length",
    "balsheet_notfullyear",
]

# NOTE(review): the `_bs` / `_pl` suffixes presumably mark balance-sheet and
# profit-and-loss derived columns -- confirm against the data preparation step.
engvar = [
    "total_assets_bs",
    "fixed_assets_bs",
    "liq_assets_bs",
    "curr_assets_bs",
    "share_eq_bs",
    "subscribed_cap_bs",
    "intang_assets_bs",
    "extra_exp_pl",
    "extra_inc_pl",
    "extra_profit_loss_pl",
    "inc_bef_tax_pl",
    "inventories_pl",
    "material_exp_pl",
    "profit_loss_year_pl",
    "personnel_exp_pl",
]

engvar2 = [
    "extra_profit_loss_pl_quad",
    "inc_bef_tax_pl_quad",
    "profit_loss_year_pl_quad",
    "share_eq_bs_quad",
]

# Every column of `df` whose name ends with one of the flag suffixes, kept in
# the DataFrame's column order.
engvar3 = [
    name
    for name in df.columns
    if name.endswith(('flag_low', 'flag_high', 'flag_error', 'flag_zero'))
]

# Column names built from `d1_sales_mil_log`: the `_mod` term, its `_sq` term,
# and the low/high flags.
d1 = [
    "d1_sales_mil_log_mod",
    "d1_sales_mil_log_mod_sq",
    "flag_low_d1_sales_mil_log",
    "flag_high_d1_sales_mil_log",
]

# Column names for CEO / management / labor variables and their flags.
hr = [
    "female",
    "ceo_age",
    "flag_high_ceo_age",
    "flag_low_ceo_age",
    "flag_miss_ceo_age",
    "ceo_count",
    "labor_avg_mod",
    "flag_miss_labor_avg",
    "foreign_management",
]