# Hyperparameter Sweep Error Analysis

This notebook scans `logs/hparam_sweep/<dataset_subset>/` for training logs, extracts the best error metrics, and visualizes error-focused summaries per dataset/subset. Run it after each sweep to see how the new jobs performed.

In [91]:
from pathlib import Path
from typing import Dict, Iterable, List, Tuple
import re

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [92]:
# Root of the project on the cluster filesystem; all other paths derive from it.
PROJECT_ROOT = Path('/clusterfs/jgi/scratch/gentech/genome_analysis/brandonimstepf')
# Directory holding one sub-directory of training logs per combo.
LOG_ROOT = PROJECT_ROOT.joinpath('logs', 'hparam_sweep')
# Subset names recognised as the trailing part of a combo directory name.
KNOWN_SUBSETS = [
    'full',
    'floats_only',
    'onehot_only',
    'floats_plus_onehot_no_middle',
]
# Every directory directly under LOG_ROOT, in sorted path order.
LOG_COMBOS = [child for child in LOG_ROOT.iterdir() if child.is_dir()]
LOG_COMBOS.sort()
LOG_COMBOS

[PosixPath('/clusterfs/jgi/scratch/gentech/genome_analysis/brandonimstepf/logs/hparam_sweep/ncbi_sorted_floats_only'),
 PosixPath('/clusterfs/jgi/scratch/gentech/genome_analysis/brandonimstepf/logs/hparam_sweep/ncbi_sorted_floats_plus_onehot_no_middle'),
 PosixPath('/clusterfs/jgi/scratch/gentech/genome_analysis/brandonimstepf/logs/hparam_sweep/ncbi_sorted_full'),
 PosixPath('/clusterfs/jgi/scratch/gentech/genome_analysis/brandonimstepf/logs/hparam_sweep/ncbi_sorted_intersect_floats_only'),
 PosixPath('/clusterfs/jgi/scratch/gentech/genome_analysis/brandonimstepf/logs/hparam_sweep/ncbi_sorted_intersect_floats_plus_onehot_no_middle'),
 PosixPath('/clusterfs/jgi/scratch/gentech/genome_analysis/brandonimstepf/logs/hparam_sweep/ncbi_sorted_onehot_only'),
 PosixPath('/clusterfs/jgi/scratch/gentech/genome_analysis/brandonimstepf/logs/hparam_sweep/ncbi_training_floats_only'),
 PosixPath('/clusterfs/jgi/scratch/gentech/genome_analysis/brandonimstepf/logs/hparam_sweep/ncbi_training_floats_plus_

In [93]:
# Matches a "Best: ERR= .. WER= .. FPR= .. FNR= .. CTF= .." summary; the named groups
# err/wer/fpr/fnr/ctf capture runs of digits and dots (ctf additionally allows '-').
BEST_PATTERN = re.compile(r"Best:\s+ERR=\s+(?P<err>[0-9.]+)\s+WER=\s+(?P<wer>[0-9.]+)\s+FPR=\s+(?P<fpr>[0-9.]+)\s+FNR=\s+(?P<fnr>[0-9.]+)\s+CTF=\s+(?P<ctf>[0-9.\-]+)")
# Captures key=value pairs: key is letters/underscores only, value is built from
# digits, '.', 'e'/'E', '+' and '-'. Keys containing digits are not matched as a whole.
CMD_PATTERN = re.compile(r"([A-Za-z_]+)=([0-9.eE+-]+)")

# NOTE(review): PREFIXES is not referenced by the code visible in this notebook;
# extract_param_value carries its own (shorter) tuple — confirm which list is authoritative.
PREFIXES = ("general_","mse_","legacy_","triage_","wbce_","tversky_")
# Turns a trailing "high"/"low" label token into a multiplier string.
SUFFIX_MAP = {"high":"*1.2","low":"*0.8"}

def split_combo(name: str) -> Tuple[str,str]:
    """Split a combo directory name into ``(dataset, subset)``.

    The first entry of ``KNOWN_SUBSETS`` for which ``name`` ends with
    ``"_" + subset`` is used. The dataset is the text before that suffix,
    with one trailing underscore dropped if present. If no subset matches,
    ``name`` is returned unchanged and the subset is ``'unknown'``.
    """
    matched = next(
        (candidate for candidate in KNOWN_SUBSETS if name.endswith(f"_{candidate}")),
        None,
    )
    if matched is None:
        return name, 'unknown'
    # +1 accounts for the underscore that joins dataset and subset.
    dataset = name[: -(len(matched) + 1)]
    return (dataset[:-1] if dataset.endswith('_') else dataset), matched


def parse_command(line: str) -> Dict[str,str]:
    """Collect every ``key=value`` pair that ``CMD_PATTERN`` finds in ``line``.

    Values are kept as the matched text. If a key occurs more than once,
    the last occurrence wins.
    """
    settings: Dict[str, str] = {}
    for hit in CMD_PATTERN.finditer(line):
        key, value = hit.groups()
        settings[key] = value
    return settings


def extract_param_value(label: str, cmd_map: Dict[str,str]) -> Tuple[str,str]:
    """Derive ``(parameter_name, parameter_value)`` from a log label.

    ``label`` is split on underscores: the first token is treated as a prefix,
    the last as a suffix, and whatever lies between as the parameter name
    (the second token when there are exactly two, the only token when there
    is one).

    * Triage labels (prefix or name equal to ``'triage'``) return the name
      ``'triage'`` and a combined ``ptriage=..,ntriage=..`` value when both
      keys are present in ``cmd_map``, otherwise the raw suffix.
    * Otherwise the value is looked up in ``cmd_map`` under the parameter
      name; if absent, the suffix is translated through ``SUFFIX_MAP`` and
      falls back to the suffix itself.
    """
    tokens = label.split('_')
    head, tail = tokens[0], tokens[-1]
    if len(tokens) > 2:
        name = '_'.join(tokens[1:-1])
    elif len(tokens) > 1:
        name = tokens[1]
    else:
        name = tokens[0]

    if 'triage' in (head, name):
        positive = cmd_map.get('ptriage')
        negative = cmd_map.get('ntriage')
        if positive and negative:
            return 'triage', f"ptriage={positive},ntriage={negative}"
        return 'triage', tail

    # Strip loss-family markers left at the front of the name, in this fixed order.
    for marker in ('general_', 'mse_', 'legacy_', 'wbce_', 'tversky_'):
        if name.startswith(marker):
            name = name[len(marker):]

    # If stripping emptied the name, fall back to the second token;
    # a single-token label always uses that token.
    if len(tokens) > 1:
        name = name or tokens[1]
    else:
        name = tokens[0]

    resolved = cmd_map.get(name)
    if resolved:
        return name, resolved
    # No explicit value on the command line: describe the run by its suffix.
    return name, SUFFIX_MAP.get(tail, tail)


def parse_log(path: Path) -> Dict:
    """Summarise one training log as a flat record.

    The first line of the file is scanned for ``key=value`` settings, and the
    whole text is searched for the first ``BEST_PATTERN`` match. A log with a
    match is marked ``'SUCCESS'`` and gains float-valued ``err``/``wer``/
    ``fpr``/``fnr``/``ctf`` entries; one without is marked ``'FAIL'`` and has
    no metric keys. Dataset and subset come from the parent directory name,
    and the parameter label from the file stem.
    """
    text = path.read_text(errors='replace')
    first_line = next(iter(text.splitlines()), None)
    cmd_map = {} if first_line is None else parse_command(first_line)

    best = BEST_PATTERN.search(text)
    combo = path.parent.name
    label = path.stem
    dataset, subset = split_combo(combo)
    param_name, param_value = extract_param_value(label, cmd_map)

    record = {
        'combo': combo,
        'dataset': dataset,
        'subset': subset,
        'parameter': label,
        'parameter_name': param_name,
        'parameter_value': param_value,
        'status': 'FAIL' if best is None else 'SUCCESS',
    }
    if best is not None:
        # Metric keys are appended after the descriptive ones, in pattern order.
        for metric, raw in best.groupdict().items():
            record[metric] = float(raw)
    return record


def collect_records(dirs: Iterable[Path]) -> pd.DataFrame:
    """Parse every ``*.log`` file directly inside each directory of ``dirs``.

    Returns a DataFrame with one row per log, built from ``parse_log``
    records. The frame is empty when no log files are found.
    """
    rows: List[Dict] = [
        parse_log(log_file)
        for directory in dirs
        for log_file in directory.glob('*.log')
    ]
    return pd.DataFrame(rows)


# Build one row per *.log file found under the combo directories, then preview it.
df = collect_records(LOG_COMBOS)
df.head()

Unnamed: 0,combo,dataset,subset,parameter,parameter_name,parameter_value,status,err,wer,fpr,fnr,ctf
0,ncbi_sorted_floats_only,ncbi_sorted,floats_only,activation_msig_high,msig,0.024,FAIL,,,,,
1,ncbi_sorted_floats_only,ncbi_sorted,floats_only,activation_msig_low,msig,0.016,FAIL,,,,,
2,ncbi_sorted_floats_only,ncbi_sorted,floats_only,activation_rslog_high,rslog,0.024,FAIL,,,,,
3,ncbi_sorted_floats_only,ncbi_sorted,floats_only,activation_rslog_low,rslog,0.016,FAIL,,,,,
4,ncbi_sorted_floats_only,ncbi_sorted,floats_only,activation_sig_high,sig,0.72,FAIL,,,,,


In [94]:
# Count runs per dataset/subset, one column per status plus a row total.
if df.empty:
    print('No logs found yet.')
else:
    subset_summary = (
        df.groupby(['dataset','subset','status'])
          .size()
          .unstack(fill_value=0)
          .assign(total=lambda x: x.sum(axis=1))
          .sort_values('total', ascending=False)
    )
    # Only the last *top-level* expression of a cell is auto-rendered; a bare
    # name nested inside if/else is silently discarded, so display it explicitly.
    display(subset_summary)

In [95]:
# List dataset/subset combos with at least one successful run
if df.empty:
    print('No logs available.')
else:
    success_sets = (
        df[df['status']=='SUCCESS']
        .groupby(['dataset','subset'])
        .size()
        .reset_index(name='success_count')
        .sort_values('success_count', ascending=False)
    )
    if success_sets.empty:
        print('No successful runs yet.')
    else:
        print(f"Total successful dataset/subset combos: {len(success_sets)}")
        # A bare expression inside a nested block is not rendered by the notebook
        # (only the printed line was showing), so display the table explicitly.
        display(success_sets)


Total successful dataset/subset combos: 2


In [96]:
# Spread of the best ERR across successful runs, per dataset/subset.
if df.empty:
    print('No data to summarize.')
else:
    success_df = df[df['status']=='SUCCESS']
    if success_df.empty:
        print('No successful runs yet.')
    else:
        err_summary = (
            success_df.groupby(['dataset','subset'])['err']
                      .agg(['min','median','max','count'])
                      .sort_values('median')
        )
        # Nested bare expressions are not auto-rendered; display explicitly.
        display(err_summary)

In [97]:
# Export the lowest ERR per dataset/subset/parameter name/relative value.
if df.empty:
    print('No logs found to export.')
else:
    success_df = df[df['status']=='SUCCESS'].copy()
    if success_df.empty:
        print('No successful runs to export.')
    else:
        # Split "<base>_<suffix>" on the last underscore of the log label.
        success_df[['parameter_base','param_suffix']] = success_df['parameter'].str.rsplit('_', n=1, expand=True)
        # These two columns replace the parameter_name/parameter_value that parse_log
        # produced, using the label alone rather than the command-line values.
        # NOTE(review): only general_/mse_/legacy_/triage_ are stripped here, while
        # PREFIXES also lists wbce_ and tversky_ — confirm whether that is intended.
        success_df['parameter_name'] = success_df['parameter_base'].str.replace('^general_|^mse_|^legacy_|^triage_', '', regex=True)
        success_df['parameter_value'] = success_df['param_suffix'].map(SUFFIX_MAP).fillna(success_df['param_suffix'])
        export_df = (success_df.groupby(['dataset','subset','parameter_name','parameter_value'])['err']
                     .min().reset_index())
        # Written to its own file: a later cell writes 'hparam_error_summary.csv' with a
        # different schema, which previously overwrote this export on every full run.
        export_path = LOG_ROOT / 'hparam_error_summary_by_value.csv'
        export_df.to_csv(export_path, index=False)
        # Nested bare expressions are not auto-rendered; display explicitly.
        display(export_path)

In [98]:
# Highlight the top-performing parameter for each subset
if df.empty:
    print('No data to highlight.')
else:
    success_df = df[df['status']=='SUCCESS']
    if success_df.empty:
        print('No successful runs to highlight.')
    else:
        # After sorting by err ascending, the first row of each group is its best run.
        top_params = (
            success_df.sort_values('err')
            .groupby(['dataset','subset'])
            .first()[['parameter','err']]
        )
        # Nested bare expressions are not auto-rendered; display explicitly.
        display(top_params)

In [99]:
# Export the lowest ERR per dataset/subset/log label to CSV.
if df.empty:
    print('No logs found to export.')
else:
    success_df = df[df['status']=='SUCCESS']
    if success_df.empty:
        print('No successful runs to export.')
    else:
        export_df = success_df.groupby(['dataset','subset','parameter'])['err'].min().reset_index()
        export_path = PROJECT_ROOT / 'logs' / 'hparam_sweep' / 'hparam_error_summary.csv'
        export_df.to_csv(export_path, index=False)
        # Nested bare expressions are not auto-rendered; display explicitly
        # so the reader sees where the file was written.
        display(export_path)

In [100]:
# Best run (lowest ERR) and its metrics per dataset/subset/parameter name/relative value.
if df.empty:
    print('No logs yet.')
else:
    success_df = df[df['status']=='SUCCESS'].copy()
    if success_df.empty:
        print('No successful runs yet.')
    else:
        # Split "<base>_<suffix>" on the last underscore of the log label.
        success_df[['parameter_base','param_suffix']] = success_df['parameter'].str.rsplit('_', n=1, expand=True)
        # These two columns replace the parameter_name/parameter_value that parse_log
        # produced, using the label alone rather than the command-line values.
        success_df['parameter_name'] = success_df['parameter_base'].str.replace('^general_|^mse_|^legacy_|^triage_','',regex=True)
        success_df['parameter_value'] = success_df['param_suffix'].map(SUFFIX_MAP).fillna(success_df['param_suffix'])
        # Sorting by err first makes .first() pick each group's lowest-error run.
        best_table = (success_df.sort_values('err')
                      .groupby(['dataset','subset','parameter_name','parameter_value'])[['err','wer','fpr','fnr','ctf']]
                      .first()
                      .reset_index())
        # Nested bare expressions are not auto-rendered; display explicitly.
        display(best_table.head(50))