In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# --- 1. CONFIGURATION & PREPARATION ---
# Asymmetric misclassification costs consumed by the business-loss function.
COST_FN = 5  # charged for every fast-growth firm the model fails to flag
COST_FP = 1  # charged for every firm flagged as fast-growth that is not

# Firm-level input data.
df = pd.read_csv('bisnode_firms_clean_growth.csv')

# Industry codes (column `ind2`) that make up each sector of interest.
mfg_codes = [26, 27, 28, 29, 30]
srv_codes = [33, 55, 56]

# Start everyone at 'Other', then overwrite the two sectors. Manufacturing is
# written last so it wins if a code were ever listed in both groups.
df['industry_group'] = 'Other'
df.loc[df['ind2'].isin(srv_codes), 'industry_group'] = 'Services'
df.loc[df['ind2'].isin(mfg_codes), 'industry_group'] = 'Manufacturing'

# Keep only the firms that belong to one of the two modelled sectors.
df_analysis = df.loc[df['industry_group'].isin(['Manufacturing', 'Services'])].copy()


# Candidate predictors handed to the per-sector models.
features_to_use = [
    'sales_mil_log',
    'd1_sales_mil_log',
    'age',
    'ceo_count',
    'gender_m',
    'flag_asset_problem',
    'foreign_management',
    'ind2_cat',
    'm_region_loc',
]

# Binary outcome column.
TARGET = 'fast_growth'

# --- 2. CUSTOM SCORER & UTILS ---

def calculate_business_loss(y_true, y_pred, cost_fn=None, cost_fp=None):
    """Return the total misclassification cost of a set of binary predictions.

    The loss is ``false_negatives * cost_fn + false_positives * cost_fp``.
    Errors are counted directly with boolean masks, so the result is well
    defined even when only one class is present in ``y_true`` and/or
    ``y_pred`` (no special-case branch is needed), and any array-like input
    (list, NumPy array, pandas Series) is accepted.

    Args:
        y_true: Array-like of actual labels, encoded as 0 (negative) and
            1 (positive).
        y_pred: Array-like of predicted labels with the same encoding and
            the same number of elements as ``y_true``. Elements are compared
            positionally; a pandas index is ignored.
        cost_fn: Cost of one false negative. Defaults to the module-level
            ``COST_FN``.
        cost_fp: Cost of one false positive. Defaults to the module-level
            ``COST_FP``.

    Returns:
        The total loss as a number (0 for empty input).

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` differ in length.
    """
    # Resolve the defaults at call time so later changes to the module-level
    # constants are still honoured.
    if cost_fn is None:
        cost_fn = COST_FN
    if cost_fp is None:
        cost_fp = COST_FP

    actual = np.asarray(y_true).ravel()
    predicted = np.asarray(y_pred).ravel()
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length "
            f"(got {actual.size} and {predicted.size})"
        )

    # False negative: actually positive, predicted negative.
    fn = int(np.count_nonzero((actual == 1) & (predicted == 0)))
    # False positive: actually negative, predicted positive.
    fp = int(np.count_nonzero((actual == 0) & (predicted == 1)))

    return (fn * cost_fn) + (fp * cost_fp)

def find_optimal_threshold_cv(y_true, y_proba_cv):
    """Pick the probability cut-off with the lowest business loss.

    Scans 99 evenly spaced cut-offs from 0.01 to 0.99. For each one, an
    observation is predicted positive when its probability is at or above
    the cut-off, and the resulting predictions are scored with
    ``calculate_business_loss``.

    Args:
        y_true: Actual binary labels.
        y_proba_cv: Positive-class probabilities for the same observations
            (the caller passes cross-validated, out-of-fold probabilities).

    Returns:
        A ``(threshold, loss)`` tuple for the best cut-off. When several
        cut-offs share the minimum loss, the smallest cut-off is returned.
    """
    candidate_cutoffs = np.linspace(0.01, 0.99, 99)

    losses = [
        calculate_business_loss(y_true, (y_proba_cv >= cutoff).astype(int))
        for cutoff in candidate_cutoffs
    ]

    # np.argmin returns the first minimum, i.e. the lowest tied cut-off.
    winner = np.argmin(losses)
    return candidate_cutoffs[winner], losses[winner]

# --- 3. MODELING LOOP ---
# For each sector: hold out a stratified 20% test set, tune a random forest on
# ROC AUC with 5-fold CV, pick the classification threshold that minimises the
# business loss on out-of-fold training probabilities, then evaluate that
# model + threshold once on the hold-out set.

results = {}

print(f"{'Sector':<15} | {'AUC':<6} | {'Threshold':<10} | {'Test Avg Loss':<15}")
print("-" * 65)

for sector in ['Manufacturing', 'Services']:

    # A. Data Subset
    data_sector = df_analysis[df_analysis['industry_group'] == sector].copy()

    X = data_sector[features_to_use]
    y = data_sector[TARGET]

    # Detect column types per sector to avoid "string to float" errors.
    # Every numeric dtype (not only int64/float64) goes to the numeric branch
    # and every remaining column to the categorical branch, so no feature is
    # left unassigned: ColumnTransformer drops unassigned columns by default.
    num_cols = X.select_dtypes(include='number').columns.tolist()
    cat_cols = [col for col in X.columns if col not in num_cols]

    # B. Split Data (Hold-out Test Set)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # C. Build Pipeline (Prevents Leakage: imputers/encoder are fit inside
    #    each CV fold on that fold's training part only)
    # Numeric: Impute Median (no scaling step; the random forest does not need it)
    # Categorical: Impute Constant -> OneHot
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', SimpleImputer(strategy='median'), num_cols),
            ('cat', Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                ('onehot', OneHotEncoder(handle_unknown='ignore'))
            ]), cat_cols)
        ])

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced'))
    ])

    # D. Hyperparameter Tuning
    param_grid = {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [3, 5, 10],
        'classifier__min_samples_leaf': [5, 10]
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Scorer logic: optimization is on AUC, business logic is applied on threshold later
    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)

    try:
        grid_search.fit(X_train, y_train)
    except ValueError as e:
        # Best-effort: report the problem and move on to the next sector.
        print(f"Skipping {sector} due to data error: {e}")
        continue

    best_model = grid_search.best_estimator_

    # E. Threshold Tuning
    # Out-of-fold probabilities on the training set keep the threshold choice
    # independent of the hold-out test set.
    y_train_proba_cv = cross_val_predict(best_model, X_train, y_train, cv=cv, method='predict_proba')[:, 1]
    best_threshold, min_train_loss = find_optimal_threshold_cv(y_train, y_train_proba_cv)

    # F. Final Evaluation on HOLD-OUT TEST SET
    y_test_proba = best_model.predict_proba(X_test)[:, 1]
    y_test_pred = (y_test_proba >= best_threshold).astype(int)

    # Metrics
    test_loss = calculate_business_loss(y_test, y_test_pred)
    avg_test_loss = test_loss / len(y_test)
    auc = roc_auc_score(y_test, y_test_proba)
    cm = confusion_matrix(y_test, y_test_pred)

    print(f"{sector:<15} | {auc:.3f}  | {best_threshold:.3f}      | {avg_test_loss:.4f}")

    # G. Extract Feature Importance
    rf_model = best_model.named_steps['classifier']
    preprocessor_step = best_model.named_steps['preprocessor']
    n_importances = len(rf_model.feature_importances_)

    # Rebuild the transformed column names in ColumnTransformer output order:
    # numeric columns first, then the one-hot encoded categorical columns.
    all_feature_names = list(num_cols)
    if cat_cols:
        try:
            cat_names = preprocessor_step.named_transformers_['cat']['onehot'].get_feature_names_out(cat_cols)
            all_feature_names += list(cat_names)
        except (AttributeError, KeyError, ValueError):
            # e.g. the installed scikit-learn has no get_feature_names_out, or
            # the encoder was not fitted; fall through to generic names below.
            all_feature_names = []

    # Generic names whenever the reconstructed list cannot label the
    # importances one-to-one (pd.Series would raise on a length mismatch).
    if len(all_feature_names) != n_importances:
        all_feature_names = [f"Feature {i}" for i in range(n_importances)]

    results[sector] = {
        'model': best_model,
        'cm': cm,
        'feature_importances': pd.Series(rf_model.feature_importances_, index=all_feature_names),
        'params': grid_search.best_params_,
        # The fitted model is only meaningful together with its tuned cut-off.
        'threshold': best_threshold
    }

# --- 4. DETAILED OUTPUT ---
# One report per successfully modelled sector: the hold-out confusion matrix
# followed by the five features with the largest random-forest importances.
for sector in results:
    res = results[sector]
    print(f"\n>>> {sector.upper()} ANALYSIS")
    print("Confusion Matrix (Test Set):")
    print(res['cm'])
    print("Top 5 Drivers:")
    ranked_importances = res['feature_importances'].sort_values(ascending=False)
    print(ranked_importances.iloc[:5])

Sector          | AUC    | Threshold  | Test Avg Loss  
-----------------------------------------------------------------
Manufacturing   | 0.626  | 0.420      | 0.6741
Services        | 0.672  | 0.460      | 0.6715

>>> MANUFACTURING ANALYSIS
Confusion Matrix (Test Set):
[[240 361]
 [ 30 127]]
Top 5 Drivers:
age                 0.283741
d1_sales_mil_log    0.282239
sales_mil_log       0.250923
ind2_cat            0.077625
ceo_count           0.024400
dtype: float64

>>> SERVICES ANALYSIS
Confusion Matrix (Test Set):
[[1365 1113]
 [ 187  385]]
Top 5 Drivers:
age                 0.370706
sales_mil_log       0.310829
d1_sales_mil_log    0.174474
ind2_cat            0.065455
gender_m_male       0.014296
dtype: float64
