# Model optimization results analysis

### This notebook analyzes the results of the model optimization experiments run in `run_experiments.ipynb` and generates the table shown in the paper


In [22]:
from dotenv import load_dotenv
from pathlib import Path
import sys
import os

# Walk up from the working directory until we find the project root
# (the first folder containing a .env file), then make it importable.
current_path = Path().resolve()
for parent in [current_path, *current_path.parents]:
    env_file = parent / ".env"
    if not env_file.exists():
        continue
    load_dotenv(env_file)
    # Fall back to the folder holding the .env when PROJECT_ROOT is unset,
    # so that None is never appended to sys.path.
    project_root = os.getenv("PROJECT_ROOT") or str(parent)
    print(project_root)
    # Re-running this cell must not pile up duplicate sys.path entries.
    if project_root not in sys.path:
        sys.path.append(project_root)
    break
else:
    # The loop finished without a break: no .env anywhere up the tree.
    print("Warning: no .env file found in the working directory or its parents; sys.path left unchanged.")


# IPython autoreload, mode 2: modules are reloaded before each cell runs,
# so edits to project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

/Users/emmanuel/Documents/belugas/beluga-call-pipeline
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
from sklearn.metrics import f1_score
import numpy as np
import json
import pandas as pd
import os
import matplotlib.pyplot as plt
from training.utils import calculate_detection_f1

# Lift pandas' default display limits so the wide metrics tables are not elided.
for display_option in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(display_option, None)



In [24]:
from models.utils import aggregate_folds_testing_metrics


results_dir = "../results/model_optimization"
# Every sub-directory of results_dir is treated as one run.
runs = [d for d in os.listdir(results_dir) if os.path.isdir(os.path.join(results_dir, d))]

# Load cross validation results for each run.
# A run is registered only when BOTH artifacts exist, because
# create_metrics_df reads "test_results" as well as "test_predictions_df";
# a half-populated entry would fail there with a KeyError.
cross_val_results = {}
for run in runs:
    run_dir = os.path.join(results_dir, run)

    # Runs before the JSON existence check below; presumably it (re)generates
    # cross_val_test_results.json -- TODO confirm in models.utils.
    aggregate_folds_testing_metrics(run_dir)

    results_path = os.path.join(run_dir, "cross_val_test_results.json")
    predictions_path = os.path.join(run_dir, "test_predictions.csv")
    missing = [
        os.path.basename(path)
        for path in (results_path, predictions_path)
        if not os.path.exists(path)
    ]
    if missing:
        # Report the skip instead of silently dropping the run.
        print(f"Skipping run '{run}': missing {', '.join(missing)}")
        continue

    with open(results_path, 'r', encoding="utf-8") as f:
        test_results = json.load(f)

    cross_val_results[run] = {
        "test_results": test_results,
        "test_predictions_df": pd.read_csv(predictions_path),
    }
    



In [25]:
def create_metrics_df(cross_val_results, round_to=3):
    """Flatten per-run cross-validation results into a single DataFrame.

    Each run contributes one row for its main results (``test_type`` is
    ``'fp32'``) plus one row per entry of its ``other_tests``. The run-level
    detection F1 is repeated on every row of that run.

    Args:
        cross_val_results: Mapping of run name to a dict holding
            ``"test_results"`` (with ``'main_results'`` and, optionally,
            ``'model_info'`` and ``'other_tests'``) and
            ``"test_predictions_df"`` (passed to ``calculate_detection_f1``).
        round_to: Number of decimals kept for every metric mean/std,
            including detection F1 on all rows.

    Returns:
        A DataFrame with ``run`` and ``test_type`` as the first two columns,
        followed by the remaining columns in insertion order. An empty
        input yields an empty DataFrame with just those two columns.
    """
    # Metrics and categories tracked for every row
    metrics = ('f1', 'precision', 'recall', 'accuracy')
    categories = ('Labels_Average', 'ECHO', 'HFPC', 'CC', 'Whistle')
    leading_cols = ['run', 'test_type']

    def rounded_or_none(value, ndigits=None):
        # Model sizes may be absent; keep None rather than crash in round().
        return None if value is None else round(value, ndigits)

    def metric_columns(results):
        # Turn results[metric][category] into flat "<category>_<metric>" columns.
        # The 'Labels_Average' category is stored under the bare metric name.
        columns = {}
        for metric in metrics:
            for category in categories:
                base_name = metric if category == 'Labels_Average' else f"{category}_{metric}"
                stats = results[metric][category]
                columns[base_name] = round(stats['mean'], round_to)
                columns[f"{base_name}_std"] = round(stats['std'], round_to)
        return columns

    rows = []
    for run, res_obj in cross_val_results.items():
        test_results = res_obj["test_results"]

        # Detection F1 is computed once per run and shared by all of its rows,
        # always rounded with round_to (the other-tests rows used a fixed 3).
        detection_f1 = calculate_detection_f1(res_obj["test_predictions_df"])
        detection_columns = {
            'detection_f1': round(detection_f1["mean"], round_to),
            'detection_f1_std': round(detection_f1["std"], round_to),
        }
        model_info = test_results.get('model_info') or {}

        # Main (fp32) results
        rows.append({
            'run': run,
            'test_type': 'fp32',
            **detection_columns,
            'model_size_mb': rounded_or_none(model_info.get('model_size_mb'), 2),
            'model_size_kb': rounded_or_none(model_info.get('model_size_kb')),
            **metric_columns(test_results['main_results']),
        })

        # Other tests; only the quantized test carries its own model size
        for test_name, test_metrics in (test_results.get('other_tests') or {}).items():
            row = {'run': run, 'test_type': test_name, **detection_columns}
            if test_name == "quantized":
                row['model_size_mb'] = rounded_or_none(model_info.get('quantized_model_size_mb'), 2)
                row['model_size_kb'] = rounded_or_none(model_info.get('quantized_model_size_kb'))
            row.update(metric_columns(test_metrics))
            rows.append(row)

    df = pd.DataFrame(rows)

    # Put run and test_type first. reindex also copes with an empty frame,
    # where those columns do not exist yet.
    remaining = [col for col in df.columns if col not in leading_cols]
    return df.reindex(columns=leading_cols + remaining)

# Build the combined metrics table for every loaded run (metric values rounded to 3 decimals)
metrics_df = create_metrics_df(cross_val_results, round_to=3)



In [26]:
# Keep only the runs whose name matches "qat" or "resnet" (regex alternation)
metrics_df = metrics_df.loc[metrics_df["run"].str.contains("qat|resnet")]

# Identifier and model-size columns first, then every column whose name mentions the metric
f1_columns = ['run', 'test_type', 'model_size_mb', 'model_size_kb']
f1_columns.extend(col for col in metrics_df.columns if 'f1' in col.lower())
accuracy_columns = ['run', 'test_type', 'model_size_mb', 'model_size_kb']
accuracy_columns.extend(col for col in metrics_df.columns if 'accuracy' in col.lower())

# Display metrics with only f1-related columns
metrics_df[f1_columns]

Unnamed: 0,run,test_type,model_size_mb,model_size_kb,detection_f1,detection_f1_std,f1,f1_std,ECHO_f1,ECHO_f1_std,HFPC_f1,HFPC_f1_std,CC_f1,CC_f1_std,Whistle_f1,Whistle_f1_std
0,resnet_hp_200_resize,fp32,,,0.965,0.003,0.924,0.005,0.928,0.008,0.902,0.015,0.917,0.008,0.949,0.004
9,mobile_net_quant_pretrained_qat_10_layers,fp32,2.96,2892.0,0.963,0.007,0.924,0.004,0.925,0.005,0.912,0.012,0.911,0.016,0.949,0.005
10,mobile_net_quant_pretrained_qat_10_layers,quantized,0.83,807.0,0.963,0.007,0.926,0.003,0.925,0.005,0.917,0.009,0.915,0.011,0.947,0.004
16,mobile_net_quant_pretrained_qat_6_layers,fp32,0.89,871.0,0.967,0.004,0.919,0.005,0.923,0.007,0.904,0.008,0.902,0.008,0.95,0.007
17,mobile_net_quant_pretrained_qat_6_layers,quantized,0.26,252.0,0.967,0.004,0.917,0.003,0.917,0.007,0.904,0.015,0.901,0.012,0.948,0.004
21,mobile_net_quant_pretrained_qat_8_layers,fp32,1.22,1188.0,0.965,0.003,0.926,0.006,0.927,0.004,0.92,0.011,0.908,0.012,0.951,0.003
22,mobile_net_quant_pretrained_qat_8_layers,quantized,0.35,343.0,0.965,0.003,0.927,0.006,0.925,0.004,0.922,0.01,0.907,0.009,0.953,0.006
23,mobile_net_quant_pretrained_qat_4_layers,fp32,0.31,304.0,0.945,0.005,0.865,0.008,0.894,0.012,0.84,0.016,0.801,0.018,0.925,0.002
24,mobile_net_quant_pretrained_qat_4_layers,quantized,0.09,90.0,0.945,0.005,0.86,0.011,0.889,0.017,0.833,0.012,0.792,0.026,0.925,0.004
27,mobile_net_quant_pretrained_qat,fp32,4.96,4847.0,0.962,0.008,0.926,0.006,0.922,0.006,0.921,0.016,0.911,0.011,0.948,0.004


In [None]:
# Runs whose name contains '000' are left out of both rankings.
ranked_runs = metrics_df[~metrics_df['run'].str.contains('000', na=False)]

# A cell renders only its last expression by default, so the F1 ranking has to be
# displayed explicitly; otherwise it is computed and silently discarded.
display(ranked_runs[f1_columns].sort_values('f1', ascending=False))
ranked_runs[accuracy_columns].sort_values('accuracy', ascending=False)
