In [None]:
from utils.pkl_preprocessor import PickleBatchLoader
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import pandas as pd

# import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import time
from collections import defaultdict
from utils.cnn import (
    train_cnn_with_batch_loader,
    clear_gpu_memory,
)


In [None]:
def train_cnn_incrementally(
    batch_loader,
    epochs=30,
    batch_size=128,
    patience=8,
    learning_rate=0.001,
):
    """
    Train the CNN on batches from a loader and return its reported accuracy.

    Thin wrapper around ``train_cnn_with_batch_loader`` so the CNN can be run
    through the same ``func(batch_loader)`` call shape as the other trainers
    in this notebook.

    Args:
        batch_loader: Loader object passed through unchanged to
            ``train_cnn_with_batch_loader``.
        epochs: Forwarded as ``epochs`` (default 30).
        batch_size: Forwarded as ``batch_size`` (default 128).
        patience: Forwarded as ``patience`` (default 8); presumably the
            early-stopping patience -- confirm against ``utils.cnn``.
        learning_rate: Forwarded as ``learning_rate`` (default 0.001).

    Returns:
        Whatever ``train_cnn_with_batch_loader`` returns.
        NOTE(review): callers print this value with a "%" suffix, so it is
        presumably already a percentage -- confirm against ``utils.cnn``.

    Raises:
        Any exception raised by ``train_cnn_with_batch_loader`` propagates
        after GPU memory has been cleared.
    """
    print("--- Starting CNN Training ---")

    try:
        return train_cnn_with_batch_loader(
            batch_loader,
            epochs=epochs,
            batch_size=batch_size,
            patience=patience,
            learning_rate=learning_rate,
        )
    finally:
        # Clear GPU memory on failure as well as on success: the comparison
        # loop catches training errors and moves on to the next delta, so a
        # failed run must not leave memory allocated for the following runs.
        clear_gpu_memory()


In [None]:
def train_xgboost_incrementally(batch_loader):
    """
    Fit an XGBoost classifier batch by batch and score it on the test set.

    One classifier is created up front. Every batch yielded by
    ``batch_loader.batch_generator()`` triggers a ``fit`` call that is handed
    the booster produced by the previous call through ``xgb_model`` (``None``
    for the first batch).

    Args:
        batch_loader: Object providing ``batch_generator()``, ``__len__`` and
            ``get_test_set()``.

    Returns:
        ``accuracy_score`` on the loader's test set, multiplied by 100.
    """
    classifier = xgb.XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=6,
        objective="binary:logistic",
        random_state=42,
    )

    # Booster carried over from the previous fit; nothing to resume from yet.
    previous_booster = None

    print("--- Starting Incremental XGBoost Training ---")

    batches = enumerate(batch_loader.batch_generator(), start=1)
    for batch_number, (features, labels) in batches:
        print(f"  Training XGBoost on batch {batch_number}/{len(batch_loader)}...")

        # Hand the prior booster back in so this fit starts from it.
        classifier.fit(features, labels, xgb_model=previous_booster)
        previous_booster = classifier.get_booster()

    # Score the final model on the hold-out test set.
    print("  Evaluating final XGBoost model...")
    test_features, test_labels = batch_loader.get_test_set()
    test_accuracy = accuracy_score(test_labels, classifier.predict(test_features))

    return test_accuracy * 100


In [None]:
def train_random_forest_incrementally(batch_loader):
    """
    Grow a warm-started Random Forest across batches and score it.

    The forest starts with 5 estimators and ``warm_start=True``. After each
    ``fit`` on a batch, ``n_estimators`` is raised by 5 so the next ``fit``
    has room to add trees.

    Args:
        batch_loader: Object providing ``batch_generator()``, ``__len__`` and
            ``get_test_set()``.

    Returns:
        ``accuracy_score`` on the loader's test set, multiplied by 100.
    """
    trees_per_batch = 5
    forest = RandomForestClassifier(
        n_estimators=trees_per_batch, random_state=42, warm_start=True
    )
    print("--- Starting Incremental Random Forest Training ---")

    batches = enumerate(batch_loader.batch_generator(), start=1)
    for batch_number, (features, labels) in batches:
        print(f"  Training Random Forest on batch {batch_number}/{len(batch_loader)}...")
        forest.fit(features, labels)
        # Make room for the trees the next batch will contribute.
        forest.n_estimators += trees_per_batch

    # Score the final forest on the hold-out test set.
    print("  Evaluating final Random Forest model...")
    test_features, test_labels = batch_loader.get_test_set()
    test_accuracy = accuracy_score(test_labels, forest.predict(test_features))
    return test_accuracy * 100


In [None]:
def train_gradient_boosting_incrementally(batch_loader):
    """
    Grow a warm-started Gradient Boosting model across batches and score it.

    The model starts with 5 estimators and ``warm_start=True``. After each
    ``fit`` on a batch, ``n_estimators`` is raised by 5 so the next ``fit``
    has room to add boosting stages.

    Args:
        batch_loader: Object providing ``batch_generator()``, ``__len__`` and
            ``get_test_set()``.

    Returns:
        ``accuracy_score`` on the loader's test set, multiplied by 100.
    """
    stages_per_batch = 5
    booster = GradientBoostingClassifier(
        n_estimators=stages_per_batch,
        learning_rate=0.1,
        max_depth=6,
        random_state=42,
        warm_start=True,
    )
    print("--- Starting Incremental Gradient Boosting Training ---")

    batches = enumerate(batch_loader.batch_generator(), start=1)
    for batch_number, (features, labels) in batches:
        print(f"  Training Gradient Boosting on batch {batch_number}/{len(batch_loader)}...")
        booster.fit(features, labels)
        # Make room for the stages the next batch will contribute.
        booster.n_estimators += stages_per_batch

    # Score the final model on the hold-out test set.
    print("  Evaluating final Gradient Boosting model...")
    test_features, test_labels = batch_loader.get_test_set()
    test_accuracy = accuracy_score(test_labels, booster.predict(test_features))
    return test_accuracy * 100


In [None]:
def time_algorithm(func, *args, **kwargs):
    """
    Call ``func(*args, **kwargs)`` and measure how long the call takes.

    Args:
        func: Callable to execute.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        A ``(result, execution_time)`` tuple: the value returned by ``func``
        and the elapsed time in seconds as a float.

    Raises:
        Any exception raised by ``func`` propagates unchanged; no timing is
        returned in that case.
    """
    # perf_counter is monotonic, so the measured duration cannot go negative
    # or jump when the system clock is adjusted during a long training run.
    start_time = time.perf_counter()
    result = func(*args, **kwargs)
    execution_time = time.perf_counter() - start_time
    return result, execution_time


# Configuration


In [None]:
# Experiment configuration.
BATCH_SIZE = 100_096  # batch_size handed to PickleBatchLoader
TEST_ROUND = 8  # round number used in dataset and output file names
TEST_DELTAS = [24, 40, 56]  # delta values to train and compare on

config_rule = "=" * 60
print(
    f"\n{config_rule}",
    f"Testing Round {TEST_ROUND} of HIGHT with Deltas: {TEST_DELTAS}",
    f"{config_rule}\n",
    sep="\n",
)


In [None]:
# Column-oriented accumulator: one list per column, extended by one entry for
# every delta that finishes training. Key order fixes the DataFrame column order.
results_data = {
    column: []
    for column in (
        "delta",
        "xgboost_accuracy",
        "random_forest_accuracy",
        "gradient_boosting_accuracy",
        "cnn_accuracy",
        "xgboost_time",
        "random_forest_time",
        "gradient_boosting_time",
        "cnn_time",
    )
}


In [None]:
# Train and time all four models on every configured delta. A delta whose
# dataset is missing, or whose run raises, is reported and skipped, so
# results_data may end up with fewer rows than TEST_DELTAS has entries.
for delta in TEST_DELTAS:
    pickle_file = (
        f"dataset_pkl_round_{TEST_ROUND}/"
        f"HIGHT_{TEST_ROUND}_round_delta-{delta}_combined.pkl"
    )

    section_rule = "=" * 50
    print(
        f"\n{section_rule}",
        f"Processing Delta {delta} from file: {pickle_file}",
        f"{section_rule}\n",
        sep="\n",
    )

    try:
        batch_loader = PickleBatchLoader(pickle_file, batch_size=BATCH_SIZE)

        # The same loader instance is handed to each trainer in turn.
        print("Training XGBoost...")
        xgboost_accuracy, xgboost_time = time_algorithm(
            train_xgboost_incrementally, batch_loader
        )

        print("Training Random Forest...")
        random_forest_accuracy, random_forest_time = time_algorithm(
            train_random_forest_incrementally, batch_loader
        )

        print("Training Gradient Boosting...")
        gradient_boosting_accuracy, gradient_boosting_time = time_algorithm(
            train_gradient_boosting_incrementally, batch_loader
        )

        print("Training CNN...")
        cnn_accuracy, cnn_time = time_algorithm(train_cnn_incrementally, batch_loader)

        # Record the row only after every model has finished, so all columns
        # of results_data stay the same length.
        delta_record = {
            "delta": delta,
            "xgboost_accuracy": xgboost_accuracy,
            "random_forest_accuracy": random_forest_accuracy,
            "gradient_boosting_accuracy": gradient_boosting_accuracy,
            "cnn_accuracy": cnn_accuracy,
            "xgboost_time": xgboost_time,
            "random_forest_time": random_forest_time,
            "gradient_boosting_time": gradient_boosting_time,
            "cnn_time": cnn_time,
        }
        for column, value in delta_record.items():
            results_data[column].append(value)

        print(f"\nDelta {delta} Results:")
        for label, accuracy, seconds in (
            ("XGBoost", xgboost_accuracy, xgboost_time),
            ("Random Forest", random_forest_accuracy, random_forest_time),
            ("Gradient Boosting", gradient_boosting_accuracy, gradient_boosting_time),
            ("CNN", cnn_accuracy, cnn_time),
        ):
            print(f"{label} - Accuracy: {accuracy:.4f}%, Time: {seconds:.2f}s")

    except FileNotFoundError:
        print(f"File not found: {pickle_file}. Skipping delta {delta}.")
    except Exception as e:
        # Best-effort run: report the failure and carry on with the next delta.
        print(f"An error occurred during delta {delta}: {e}")


In [None]:
# Turn the accumulated columns into a table indexed by delta.
results_df = pd.DataFrame(results_data).set_index("delta")

table_rule = "=" * 60
print(f"\n{table_rule}", "Final Results DataFrame:", table_rule, sep="\n")
print(results_df)


In [None]:
# Persist the results table in both pickle and CSV form under a shared stem.
output_stem = f"model_comparison_results_round_{TEST_ROUND}_with_CNN"
results_df.to_pickle(f"{output_stem}.pkl")
results_df.to_csv(f"{output_stem}.csv")
print(f"\nResults saved as '{output_stem}.pkl' and .csv")


In [None]:
# Reload the results table from the pickle file written for this round.
saved_results_path = f"model_comparison_results_round_{TEST_ROUND}_with_CNN.pkl"
results_df = pd.read_pickle(saved_results_path)

In [None]:
# Display label -> timing column in results_df.
time_columns = {
    "XGBoost": "xgboost_time",
    "Random Forest": "random_forest_time",
    "Gradient Boosting": "gradient_boosting_time",
    "CNN": "cnn_time",
}

# Mean training time per algorithm across the recorded deltas.
avg_times = {
    label: results_df[column].mean() for label, column in time_columns.items()
}

print("\nAverage Training Times:")
for algorithm_name, mean_seconds in avg_times.items():
    print(f"{algorithm_name}: {mean_seconds:.2f}s")


In [None]:
# Plot styling: start from matplotlib's default style sheet, then apply the
# seaborn palette and context on top of it (order matters).
plt.style.use("default")
sns.set_palette(palette="husl")
sns.set_context(context="notebook", font_scale=1.2)


In [None]:
# Create the figure for the visualizations; nothing is drawn on it in this cell.
figure_width, figure_height = 16, 8
fig = plt.figure(figsize=(figure_width, figure_height))


In [None]:
print(f"\n{'=' * 80}")
print("Detailed Comparison Summary")
print(f"{'=' * 80}")

# Display label -> accuracy column in results_df.
accuracy_columns = {
    "XGBoost": "xgboost_accuracy",
    "Random Forest": "random_forest_accuracy",
    "Gradient Boosting": "gradient_boosting_accuracy",
    "CNN": "cnn_accuracy",
}

# Aggregate over the rows actually present in results_df instead of looking up
# every entry of TEST_DELTAS: a delta that was skipped during training has no
# row, so a per-delta .loc lookup would raise KeyError.
best_by_algorithm = {
    label: results_df[column].max() for label, column in accuracy_columns.items()
}
worst_by_algorithm = {
    label: results_df[column].min() for label, column in accuracy_columns.items()
}

comparison_summary = pd.DataFrame(
    {
        "Algorithm": list(accuracy_columns),
        "Avg Time (s)": [avg_times[label] for label in accuracy_columns],
        "Best Accuracy (%)": [best_by_algorithm[label] for label in accuracy_columns],
        "Worst Accuracy (%)": [worst_by_algorithm[label] for label in accuracy_columns],
    }
)

print(comparison_summary.to_string(index=False))

# (algorithm, best accuracy) pairs consumed by the performance-ranking cell.
best_accuracies = list(best_by_algorithm.items())


In [None]:
# Performance ranking: order the algorithms by speed, then by best accuracy.
ranking_rule = "=" * 60
print(f"\n{ranking_rule}", "Performance Rankings", ranking_rule, sep="\n")

print("Fastest to Slowest (by average time):")
sorted_by_time = sorted(avg_times.items(), key=lambda entry: entry[1])
for rank, (algorithm_name, mean_seconds) in enumerate(sorted_by_time, start=1):
    print(f"{rank}. {algorithm_name}: {mean_seconds:.2f}s")

print("\nBest to Worst (by best accuracy achieved):")
sorted_by_accuracy = sorted(
    best_accuracies, key=lambda entry: entry[1], reverse=True
)
for rank, (algorithm_name, best_accuracy) in enumerate(sorted_by_accuracy, start=1):
    print(f"{rank}. {algorithm_name}: {best_accuracy:.4f}%")
