In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, log_loss
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy.special import softmax
import json
#from ucimlrepo import fetch_ucirepo
#spambase = fetch_ucirepo(id=94)

# === User configuration ===
# These module-level constants are the defaults used by main() and by the
# keyword arguments of evaluate_random_forests().
FILE_PATH = "/content/loan_data-noise-20.csv"  # Path to the data file (.csv or .data; both are read as CSV)
TARGET_COLUMN = 'loan_status'          # Name of the target column, or None to fall back to detect_target_column()
TEST_SIZE = 0.2                    # Fraction of ALL rows held out for the final test set
VAL_SIZE = 0.25                    # Fraction of the remaining train+val rows held out for validation
                                   # (with TEST_SIZE = 0.2 this is 0.25 * 0.8 = 20% of all rows)
N_ESTIMATORS = 30                 # Number of trees in the Random Forest
RANDOM_STATE = 42                  # Random seed shared by both splits and the forest, for reproducibility
TEMPERATURE = 1.0                  # Softmax temperature; per-tree composite scores are divided by it
                                   # (<1 -> sharper weights; >1 -> smoother weights; must not be 0)
COMPOSITE_WEIGHTS = [1.0, 1.0, 1.0, 1.0]  # Per-metric weights, in the order [accuracy, precision, recall, f1]


def load_data(file_path):
    """
    Read a tabular dataset from disk into a pandas DataFrame.

    The format is chosen from the file extension, compared case-insensitively.
    Both ``.csv`` and ``.data`` files are parsed by ``pd.read_csv`` with its
    default settings. Any other extension (including none at all) raises
    ``ValueError``.
    """
    _, suffix = os.path.splitext(file_path)
    ext = suffix.lower()

    # Guard clause: reject anything that is not known to be CSV-formatted.
    if ext not in ('.csv', '.data'):
        raise ValueError(f"Unsupported file extension: {ext}")

    return pd.read_csv(file_path)


def detect_target_column(df):
    """
    Guess which column of ``df`` holds the labels.

    A column named 'target' wins, then one named 'class'. If neither is
    present, the last column of the frame is returned.
    """
    # Preferred names, checked in priority order.
    for candidate in ('target', 'class'):
        if candidate in df.columns:
            return candidate
    return df.columns[-1]


def _per_tree_validation_metrics(trees, X_val, y_val):
    """
    Score every tree of a fitted forest on the validation split.

    Returns a float array of shape (n_trees, 4) whose columns are, in order,
    [accuracy, weighted precision, weighted recall, weighted F1].
    """
    rows = []
    for tree in trees:
        preds_val = tree.predict(X_val)
        rows.append([
            accuracy_score(y_val, preds_val),
            precision_score(y_val, preds_val, average='weighted', zero_division=0),
            recall_score(y_val, preds_val, average='weighted', zero_division=0),
            f1_score(y_val, preds_val, average='weighted', zero_division=0),
        ])
    return np.asarray(rows, dtype=float).reshape(-1, 4)


def evaluate_random_forests(df, target_column,
                            test_size=TEST_SIZE, val_size=VAL_SIZE,
                            n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE,
                            temperature=TEMPERATURE, composite_weights=COMPOSITE_WEIGHTS):
    """
    Train one Random Forest and evaluate it on a held-out test set two ways:
      - Standard RF: the forest's own ``predict_proba``
      - Composite-metric weighted RF: per-tree probabilities combined with
        softmax weights derived from each tree's validation metrics

    Parameters:
      df                 DataFrame containing the features and the target.
      target_column      Name of the label column in ``df``.
      test_size          Fraction of all rows held out for testing.
      val_size           Fraction of the remaining rows held out for validation.
      n_estimators       Number of trees in the forest.
      random_state       Seed used for both splits and for the forest.
      temperature        Softmax temperature; must be > 0.
      composite_weights  Four weights, ordered [accuracy, precision, recall, f1].

    Returns a 13-tuple, in this order:
      metrics_standard, metrics_weighted   classification_report dicts
      log_std, log_weighted                test-set log-loss of each variant
      composite_scores, weights            per-tree score and softmax weight
      X_train_processed, y_train           preprocessed training split
      X_test_processed, y_test             preprocessed test split
      le, rf, preprocessor                 fitted LabelEncoder, forest, ColumnTransformer

    Raises:
      ValueError  if ``temperature`` is not strictly positive, or if
                  ``composite_weights`` does not contain exactly four values.
    """
    # Validate up front: a zero temperature would silently produce inf/NaN
    # weights, and a negative one would invert the ranking of the trees.
    if not temperature > 0:
        raise ValueError(f"temperature must be > 0, got {temperature!r}")
    metric_weights = np.asarray(composite_weights, dtype=float)
    if metric_weights.shape != (4,):
        raise ValueError(
            "composite_weights must contain exactly 4 values "
            "[accuracy, precision, recall, f1], "
            f"got shape {metric_weights.shape}"
        )

    # Prepare features and labels
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Identify categorical columns for one-hot encoding
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns
    numerical_cols = X.select_dtypes(include=np.number).columns

    # Numeric columns pass through unchanged; categoricals are one-hot encoded.
    # Categories unseen during fit are encoded as all-zero rows.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', numerical_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
        ],
        remainder='passthrough'  # Keep other columns (if any)
    )

    # Encode string labels to integers
    le = LabelEncoder()
    y_enc = le.fit_transform(y)

    # Split into train+val and test
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y_enc, test_size=test_size, random_state=random_state, stratify=y_enc
    )
    # Split train and val
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=val_size, random_state=random_state, stratify=y_train_val
    )

    # Fit the preprocessor on the training split only, so nothing from the
    # validation or test rows leaks into the encoding.
    X_train_processed = preprocessor.fit_transform(X_train)
    X_val_processed = preprocessor.transform(X_val)
    X_test_processed = preprocessor.transform(X_test)

    # Train a single Random Forest
    rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    rf.fit(X_train_processed, y_train)

    # --- Standard evaluation ---
    proba_std = rf.predict_proba(X_test_processed)
    log_std = log_loss(y_test, proba_std)
    y_pred_std = np.argmax(proba_std, axis=1)
    metrics_standard = classification_report(
        le.inverse_transform(y_test),
        le.inverse_transform(y_pred_std),
        output_dict=True
    )

    # --- Composite-metric weighting ---
    # One composite score per tree: weighted sum of its validation metrics.
    metrics_matrix = _per_tree_validation_metrics(rf.estimators_, X_val_processed, y_val)
    composite_scores = metrics_matrix.dot(metric_weights)
    # Softmax turns the scores into positive tree weights that sum to 1, so
    # the weighted average below is still a probability distribution.
    weights = softmax(composite_scores / temperature)

    # Aggregate weighted probabilities
    proba_weighted = np.zeros_like(proba_std)
    for w, tree in zip(weights, rf.estimators_):
        proba_weighted += w * tree.predict_proba(X_test_processed)
    log_weighted = log_loss(y_test, proba_weighted)
    y_pred_w = np.argmax(proba_weighted, axis=1)
    metrics_weighted = classification_report(
        le.inverse_transform(y_test),
        le.inverse_transform(y_pred_w),
        output_dict=True
    )

    return (metrics_standard, metrics_weighted, log_std, log_weighted,
            composite_scores, weights, X_train_processed, y_train, X_test_processed, y_test, le, rf, preprocessor)


def print_summary(metrics, title, logloss=None):
    """
    Print a compact summary of a classification-report dict.

    Shows the overall accuracy, the weighted-average precision, recall and
    F1-score, and the log-loss when one is supplied. Any value that is missing
    (``None``) is skipped.
    """
    averaged = metrics.get('weighted avg', {})

    # (line prefix, value) pairs in the order they are printed.
    summary_rows = (
        ("  Accuracy : ", metrics.get('accuracy')),
        ("  Precision: ", averaged.get('precision')),
        ("  Recall   : ", averaged.get('recall')),
        ("  F1-score : ", averaged.get('f1-score')),
        ("  Log-loss : ", logloss),
    )

    print(f"\n{title} Summary:")
    for prefix, value in summary_rows:
        if value is None:
            continue
        print(f"{prefix}{value:.4f}")


def main():
    """
    Run the full experiment using the module-level configuration.

    Loads the dataset at FILE_PATH, picks the target column (TARGET_COLUMN if
    set, otherwise auto-detected), trains and evaluates the forest, then prints
    both full classification reports, the first ten per-tree composite scores
    and softmax weights, and a concise summary of each variant.
    """
    # Load data
    df = load_data(FILE_PATH)

    # Report honestly where the target column came from: only claim
    # auto-detection when no column was configured.
    if TARGET_COLUMN:
        target_col = TARGET_COLUMN
        print(f"Using configured target column: {target_col}\n")
    else:
        target_col = detect_target_column(df)
        print(f"Auto-detected target column: {target_col}\n")

    # Evaluate the forest. The trailing elements of the result (data splits,
    # label encoder, forest, preprocessor) are not needed for reporting.
    (metrics_standard, metrics_weighted,
     log_std, log_weighted,
     composite_scores, weights,
     *_unused) = evaluate_random_forests(df, target_col)

    # Full reports for RF models
    print("Standard Random Forest Performance:")
    print(json.dumps(metrics_standard, indent=2))
    print(f"\nStandard RF Log Loss: {log_std:.4f}\n")

    print("Composite-Metric Softmax RF Performance:")
    print(json.dumps(metrics_weighted, indent=2))
    print(f"\nComposite Softmax RF Log Loss: {log_weighted:.4f}\n")

    # Display composite scores and weight distribution
    print(f"Composite Scores (first 10 trees): {np.round(composite_scores[:10], 4)}")
    print(f"Softmax Weights   (first 10 trees): {np.round(weights[:10], 4)}")

    # Concise summaries including log-loss
    print_summary(metrics_standard, "Standard Random Forest", log_std)
    print_summary(metrics_weighted, "Composite-Metric RF (softmax)", log_weighted)


if __name__ == "__main__":
    main()

Auto-detected target column: loan_status

Standard Random Forest Performance:
{
  "0": {
    "precision": 0.9244249726177437,
    "recall": 0.9645714285714285,
    "f1-score": 0.9440715883668904,
    "support": 7000.0
  },
  "1": {
    "precision": 0.8537735849056604,
    "recall": 0.724,
    "f1-score": 0.7835497835497836,
    "support": 2000.0
  },
  "accuracy": 0.9111111111111111,
  "macro avg": {
    "precision": 0.889099278761702,
    "recall": 0.8442857142857143,
    "f1-score": 0.863810685958337,
    "support": 9000.0
  },
  "weighted avg": {
    "precision": 0.9087246642372807,
    "recall": 0.9111111111111111,
    "f1-score": 0.9084000761853112,
    "support": 9000.0
  }
}

Standard RF Log Loss: 0.2448

Composite-Metric Softmax RF Performance:
{
  "0": {
    "precision": 0.9267419443679427,
    "recall": 0.9614285714285714,
    "f1-score": 0.943766652643388,
    "support": 7000.0
  },
  "1": {
    "precision": 0.8446490218642118,
    "recall": 0.734,
    "f1-score": 0.78544676