In [None]:
import os
import time
import copy
import joblib
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer, load_iris

import warnings
warnings.filterwarnings("ignore")

In [None]:
def benchmark_model(model, X_test, y_test, model_name, n_runs=100):
    """Measure accuracy, per-sample inference latency and serialized size.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict(X)``.
    X_test
        Test samples; must support ``len()`` and be non-empty.
    y_test
        Ground-truth labels matching ``X_test``.
    model_name : str
        Stem of the output file. The model is dumped to
        ``f"{model_name}.joblib"`` and the file is left on disk.
    n_runs : int, default 100
        Number of timed ``predict`` calls the latency is averaged over.

    Returns
    -------
    dict
        ``"accuracy"``: accuracy score on the test set,
        ``"avg_inference_time_us"``: mean wall-clock time per sample in µs,
        ``"model_size_bytes"``: size of the dumped file in bytes.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1 or ``X_test`` is empty (either would
        otherwise end in a division by zero).
    """
    if n_runs < 1:
        raise ValueError(f"n_runs must be >= 1, got {n_runs!r}")

    n_samples = len(X_test)
    if n_samples == 0:
        raise ValueError("X_test must contain at least one sample")

    # This untimed call provides the predictions for the accuracy score and
    # also acts as a warm-up before the timed loop.
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    start = time.perf_counter()
    for _ in range(n_runs):
        model.predict(X_test)
    total_inference_time = time.perf_counter() - start

    # Total time spread over every sample of every run, seconds -> microseconds.
    avg_time_per_sample_us = (
        total_inference_time / (n_runs * n_samples)
    ) * 1e6

    # The size is measured on the serialized artifact, not in memory.
    filename = f"{model_name}.joblib"
    joblib.dump(model, filename)
    model_size_bytes = os.path.getsize(filename)

    return {
        "accuracy": acc,
        "avg_inference_time_us": avg_time_per_sample_us,
        "model_size_bytes": model_size_bytes
    }

# With return_X_y=True the loader returns an (X, y) tuple. The former trailing
# `[:1000]` sliced that 2-tuple itself, not the rows, so it never limited the
# sample count. The misleading no-op is removed; the full dataset is used,
# exactly as before.
X_bin, y_bin = load_breast_cancer(return_X_y=True)

# 80/20 hold-out split with a fixed seed for reproducibility.
Xb_train, Xb_test, yb_train, yb_test = train_test_split(
    X_bin, y_bin, test_size=0.2, random_state=42
)

# Baseline forest: 200 unrestricted-depth trees, fitted on all available cores.
rf_binary = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)

rf_binary.fit(Xb_train, yb_train)

# Also writes rf_binary.joblib as a side effect of measuring the model size.
binary_results = benchmark_model(
    rf_binary, Xb_test, yb_test, "rf_binary"
)

print("=== Binary Classification (Breast Cancer) ===")
print(f"Accuracy: {binary_results['accuracy']:.4f}")
print(f"Avg inference time per sample: {binary_results['avg_inference_time_us']:.2f} µs")
print(f"Model size: {binary_results['model_size_bytes'] / 1024:.2f} KB")

# With return_X_y=True the loader returns an (X, y) tuple. The former trailing
# `[:1000]` sliced that 2-tuple itself, not the rows, so it never limited the
# sample count. The misleading no-op is removed; the full dataset is used,
# exactly as before.
X_multi, y_multi = load_iris(return_X_y=True)

# 80/20 hold-out split with a fixed seed for reproducibility.
Xm_train, Xm_test, ym_train, ym_test = train_test_split(
    X_multi, y_multi, test_size=0.2, random_state=42
)

# Same forest configuration as the binary experiment so results are comparable.
rf_multiclass = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)

rf_multiclass.fit(Xm_train, ym_train)

# Also writes rf_multiclass.joblib as a side effect of measuring the model size.
multi_results = benchmark_model(
    rf_multiclass, Xm_test, ym_test, "rf_multiclass"
)

print("\n=== Multiclass Classification (Iris) ===")
print(f"Accuracy: {multi_results['accuracy']:.4f}")
print(f"Avg inference time per sample: {multi_results['avg_inference_time_us']:.2f} µs")
print(f"Model size: {multi_results['model_size_bytes'] / 1024:.2f} KB")

In [None]:
def subsample_forest(rf_model, n_trees):
    """Return a copy of a fitted forest that keeps only its first ``n_trees`` trees.

    The input model is deep-copied, so it is left untouched and the returned
    model shares no state with it.

    Parameters
    ----------
    rf_model
        Fitted ensemble exposing ``estimators_`` (a sliceable sequence of
        trees) and ``n_estimators``.
    n_trees : int
        Number of leading trees to keep; must satisfy
        ``1 <= n_trees <= len(rf_model.estimators_)``.

    Returns
    -------
    A deep copy of ``rf_model`` whose ``estimators_`` holds the first
    ``n_trees`` trees and whose ``n_estimators`` equals ``n_trees``.

    Raises
    ------
    ValueError
        If ``n_trees`` is smaller than 1.
    AssertionError
        If ``n_trees`` exceeds the number of fitted trees.
    """
    # Zero would leave an empty forest. A negative value would slice from the
    # end and store a negative n_estimators, giving an inconsistent model.
    if n_trees < 1:
        raise ValueError(f"n_trees must be >= 1, got {n_trees!r}")
    assert n_trees <= len(rf_model.estimators_), "n_trees exceeds forest size"

    new_model = copy.deepcopy(rf_model)
    new_model.estimators_ = new_model.estimators_[:n_trees]
    # Keep the hyper-parameter in sync with the actual number of trees.
    new_model.n_estimators = n_trees

    return new_model

In [None]:
def benchmark_rf(model, X_test, y_test, name):
    """Benchmark a forest and return ``(accuracy, µs per sample, size in KB)``.

    Thin tuple-returning wrapper around ``benchmark_model``. The measurement
    logic used to be duplicated here line for line; delegating keeps a single
    implementation.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict(X)``.
    X_test, y_test
        Test samples and their ground-truth labels.
    name : str
        Stem of the file the model is dumped to (``f"{name}.joblib"``).

    Returns
    -------
    tuple
        ``(accuracy, avg_inference_time_us, model_size_kb)`` where the size is
        the dumped file's byte count divided by 1024.
    """
    results = benchmark_model(model, X_test, y_test, name)

    return (
        results["accuracy"],
        results["avg_inference_time_us"],
        results["model_size_bytes"] / 1024,
    )

In [None]:
# Forest sizes to evaluate for the binary model, largest first.
tree_counts = [200, 150, 100, 50, 25]

# Table header followed by its separator rule.
print(
    "Trees | Accuracy | Inference (µs/sample) | Size (KB)",
    "-" * 50,
    sep="\n",
)

for n in tree_counts:
    # Truncated copy of the fitted binary forest with only the first n trees.
    sub_rf = subsample_forest(rf_binary, n)
    acc, speed, size = benchmark_rf(sub_rf, Xb_test, yb_test, f"rf_binary_{n}")

    print(f"{n:5d} | {acc:.4f}   | {speed:8.2f}              | {size:7.1f}")

In [None]:
# Forest sizes to evaluate for the multiclass model, down to a single tree.
tree_counts = [200, 150, 100, 50, 25, 1]

# Table header followed by its separator rule.
print(
    "Trees | Accuracy | Inference (µs/sample) | Size (KB)",
    "-" * 55,
    sep="\n",
)

for n in tree_counts:
    # Truncated copy of the fitted multiclass forest with only the first n trees.
    sub_rf = subsample_forest(rf_multiclass, n)
    acc, speed, size = benchmark_rf(sub_rf, Xm_test, ym_test, f"rf_multiclass_{n}")

    print(f"{n:5d} | {acc:.4f}   | {speed:10.2f}         | {size:7.1f}")