In [5]:
import itertools
import json
from pathlib import Path

import pandas as pd

In [6]:
# Root of the evaluation outputs; summaries are looked up underneath it as
# <BASE_DIR>/<system>/<direction>/results_summary.jsonl.
BASE_DIR = "../../evaluation/output_evals/libristutter"

# Language directions to load, one per line so additions diff cleanly.
DIRECTION_PAIRS = [
    'en_de',
    'en_es',
    'en_fr',
    'en_it',
    'en_nl',
    'en_pt',
    'en_zh',
]

# Systems whose result summaries are collected for every direction above.
SYSTEM_NAMES = [
    'canary-v2',
    'desta2-8b',
    'gemma_canary-v2',
    'gemma_seamlessm4t',
    'gemma_whisper',
    'owsm4.0-ctc',
    'phi4multimodal',
    'qwen2audio-7b',
    'seamlessm4t',
    'voxtral-small-24b',
]

In [7]:
def load_results_summaries(base_dir, direction_pairs, system_names):
    """
    Loads all result summaries from a directory structure.

    For every (direction, system) combination the file
    ``<base_dir>/<system>/<direction>/results_summary.jsonl`` is read as
    JSON Lines: one JSON document per line. Whitespace-only lines (for
    example a trailing blank line) are skipped instead of invalidating the
    whole file.

    Args:
        base_dir (str or Path): The base directory for the evaluation outputs.
        direction_pairs (list): A list of language direction strings (e.g., 'en_de').
        system_names (list): A list of system name strings.

    Returns:
        dict: A nested dictionary containing the loaded data, structured as
              {direction: {system: [results]}}. The value for a system is
              ``None`` when its summary file is missing or contains a line
              that is not valid JSON; a warning is printed in both cases.
    """
    base_path = Path(base_dir)
    all_results = {}

    # Use itertools.product to cleanly iterate over all combinations
    for direction, system in itertools.product(direction_pairs, system_names):
        summary_path = base_path / system / direction / 'results_summary.jsonl'

        # Create the per-direction mapping on first use.
        per_direction = all_results.setdefault(direction, {})

        try:
            with summary_path.open('r', encoding='utf-8') as f:
                # `line.strip()` filters blank lines, which json.loads would
                # otherwise reject with a JSONDecodeError.
                per_direction[system] = [
                    json.loads(line) for line in f if line.strip()
                ]
        except FileNotFoundError:
            print(f"Warning: File not found, skipping: {summary_path}")
            per_direction[system] = None
        except json.JSONDecodeError as e:
            # A single malformed line discards the file: partial results
            # would be indistinguishable from a complete run.
            print(f"Error decoding JSON in {summary_path}: {e}")
            per_direction[system] = None

    return all_results

In [8]:
def convert_results_to_dataframe(results_data):
    """
    Converts the nested dictionary of results into a single pandas DataFrame.

    Each row corresponds to a single entry, with 'direction' and 'system'
    columns added, and all 'metrics' unpacked into separate columns.

    The input is left untouched, so the function can be called repeatedly on
    the same data (e.g. when a notebook cell is re-run) with identical output.

    Args:
        results_data (dict): Mapping of {direction: {system: records}} where
            ``records`` is a list of dicts or ``None``. Systems whose value
            is ``None`` are skipped.

    Returns:
        pandas.DataFrame: One row per record, with 'direction' and 'system'
        as the first two columns. An empty DataFrame is returned (and a
        message printed) when there are no records at all.
    """
    all_records = []
    for direction, systems in results_data.items():
        for system, records in systems.items():
            if records is None:
                continue
            for record in records:
                # Read the metrics without popping: popping would strip them
                # from the caller's data and break any later call.
                # `or {}` also covers an explicit `"metrics": null`.
                metrics = record.get("metrics") or {}
                other_fields = {
                    key: value for key, value in record.items() if key != "metrics"
                }
                # Merge everything into one flat dict; later keys win, so
                # metric names take precedence over same-named record fields.
                all_records.append(
                    {
                        "direction": direction,
                        "system": system,
                        **other_fields,
                        **metrics,
                    }
                )

    if not all_records:
        print("No records were found to create a DataFrame.")
        return pd.DataFrame()

    df = pd.DataFrame(all_records)

    # Put identifying info up front
    original_cols = [c for c in df.columns if c not in ["direction", "system"]]
    df = df[["direction", "system"] + original_cols]

    return df

In [9]:
# Read every <system>/<direction>/results_summary.jsonl under BASE_DIR;
# combinations that are missing or unparsable are stored as None.
results = load_results_summaries(BASE_DIR, DIRECTION_PAIRS, SYSTEM_NAMES)



In [10]:
# Flatten the nested {direction: {system: records}} mapping into one table.
df = convert_results_to_dataframe(results)

In [11]:
# Show the combined table (all directions and systems).
df

Unnamed: 0,direction,system,XCOMET-QE,QEMetricX_24,LinguaPy,QEMetricX_24-Strict-linguapy,XCOMET-QE-Strict-linguapy
0,en_de,canary-v2,0.6913,6.7141,0.2110,6.7167,0.6911
1,en_de,desta2-8b,0.6820,6.8960,0.2110,6.9362,0.6816
2,en_de,gemma_canary-v2,0.7060,6.0532,0.2110,6.0616,0.7059
3,en_de,gemma_whisper,0.7591,5.1795,0.0000,5.1795,0.7591
4,en_de,owsm4.0-ctc,0.5564,11.9300,0.0000,11.9300,0.5564
...,...,...,...,...,...,...,...
59,en_zh,owsm4.0-ctc,0.3785,10.4948,0.0000,10.4948,0.3785
60,en_zh,phi4multimodal,0.5126,7.4305,15.4008,9.6437,0.4191
61,en_zh,qwen2audio-7b,0.4861,6.3431,1.4768,6.5145,0.4818
62,en_zh,seamlessm4t,0.3340,13.1322,3.7975,13.3919,0.3236


In [19]:
# Export one CSV per language direction into the current working directory.
# `sub_df` is intentionally left bound after the loop (the next cell shows it).
for pair in DIRECTION_PAIRS:
    sub_df = df.loc[df['direction'] == pair]
    sub_df.to_csv(f"libristutter_{pair}.csv", index=False)

In [20]:
# `sub_df` is the variable left over from the export loop, so this shows only
# the slice for the last entry of DIRECTION_PAIRS.
sub_df

Unnamed: 0,direction,system,XCOMET-QE,QEMetricX_24,LinguaPy,QEMetricX_24-Strict-linguapy,XCOMET-QE-Strict-linguapy
54,en_zh,canary-v2,0.6365,9.7694,100.0,25.0,0.0
55,en_zh,desta2-8b,0.4061,9.1766,26.1603,11.6898,0.3531
56,en_zh,gemma_canary-v2,0.55,5.2605,0.4219,5.3206,0.5493
57,en_zh,gemma_seamlessm4t,0.5282,5.584,0.211,5.5943,0.5279
58,en_zh,gemma_whisper,0.5847,4.9168,0.0,4.9168,0.5847
59,en_zh,owsm4.0-ctc,0.3785,10.4948,0.0,10.4948,0.3785
60,en_zh,phi4multimodal,0.5126,7.4305,15.4008,9.6437,0.4191
61,en_zh,qwen2audio-7b,0.4861,6.3431,1.4768,6.5145,0.4818
62,en_zh,seamlessm4t,0.334,13.1322,3.7975,13.3919,0.3236
63,en_zh,voxtral-small-24b,0.6147,4.7396,0.0,4.7396,0.6147


In [21]:
# Manifest file handed to get_disfluence_ids below.
# NOTE(review): only the en-de manifest is read; presumably the stutter flags
# are the same for every direction — confirm.
MANIFEST_PATH = '../../manifests/libristutter/en-de.jsonl'

In [25]:
import json

def get_disfluence_ids(manifest):
    """
    Collects the ids of manifest entries that are flagged as stuttered.

    The manifest is read as JSON Lines (one JSON object per line); blank
    lines are ignored. An entry is selected when
    ``entry['benchmark_metadata']['has_stutter']`` is the string ``'True'``
    or the JSON boolean ``true``.

    Args:
        manifest (str or Path): Path to the manifest file.

    Returns:
        list: The ``sample_id`` value of every selected entry, in file order.
            Assumes each entry stores its identifier under the "sample_id"
            key — TODO confirm against the manifest schema.

    Raises:
        KeyError: If an entry lacks 'benchmark_metadata' / 'has_stutter', or
            a selected entry lacks 'sample_id'.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    ids = []
    with open(manifest, encoding='utf-8') as m:
        # Iterate the file lazily instead of materialising it with readlines().
        for line in m:
            if not line.strip():
                continue
            utt = json.loads(line)
            has_stutter = utt['benchmark_metadata']['has_stutter']
            if has_stutter is True or has_stutter == 'True':
                # Append the entry's own id, not the literal key name.
                ids.append(utt['sample_id'])
    return ids
            

In [26]:
# One list element per manifest entry flagged with has_stutter.
ids = get_disfluence_ids(MANIFEST_PATH)

In [27]:
# Number of manifest entries flagged as stuttered.
len(ids)

351