# Transcription LLM Evaluation Suite

This notebook demonstrates how to evaluate different LLMs on audio transcription tasks using the simplified MES framework.

## Setup and Configuration

In [None]:
import sys
import os
import logging
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Add src directory to path
sys.path.append('../src')

from orchestrator.experiment_runner import ExperimentRunner

# Configure logging
# Root logger at INFO; every record carries timestamp, logger name and level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

# Set plotting style
# The seaborn style sheets bundled with matplotlib were renamed to
# 'seaborn-v0_8*' in newer releases and older releases only know 'seaborn'.
# Pick the first name this installation actually offers so the setup cell
# does not abort with an OSError; 'default' is the last-resort fallback.
_preferred_styles = ('seaborn-v0_8', 'seaborn')
_plot_style = next(
    (name for name in _preferred_styles if name in plt.style.available),
    'default',
)
plt.style.use(_plot_style)
sns.set_palette("husl")

print("Transcription LLM Evaluation Suite initialised successfully!")

## Initialise Experiment Runner

Load the configuration and initialize the experiment runner.

In [None]:
# Build the experiment runner from the sample YAML configuration.
config_path = '../config/sample_experiments.yaml'
runner = ExperimentRunner(config_path)

# Show which metrics the runner exposes and which experiments the config declares.
print(f"Available metrics: {list(runner.available_metrics.keys())}")
print(
    f"Configured experiments: {[exp['name'] for exp in runner.config['experiments']]}"
)

## Run Transcription Experiments

Execute the configured transcription experiments on the audio dataset.

In [None]:
# Execute every configured experiment (this may take a while).
# To run a subset, pass the experiment names explicitly, e.g.
# runner.run_experiments(['transcription_baseline'])

print("Starting transcription experiment run...")
results_df = runner.run_experiments()

# Report completion together with the size of the collected results.
print(
    f"\nTranscription experiment run completed!",
    f"Total results: {len(results_df)}",
    f"Shape: {results_df.shape}",
    sep="\n",
)

## Transcription Experiment Summary

View high-level statistics about the transcription experiment run.

In [None]:
# Fetch the runner's summary mapping and print it one entry per line.
summary = runner.get_experiment_summary()

print("Transcription Experiment Summary:")
print("="*50)
for key, value in summary.items():
    # Non-float entries are printed as-is; floats are rounded to 3 decimals.
    if not isinstance(value, float):
        print(f"{key}: {value}")
        continue
    print(f"{key}: {value:.3f}")

## Transcription Results Exploration

Explore the transcription results dataset.

In [None]:
# Display basic information about the transcription results
print("Transcription Results DataFrame Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
results_df.info()

print("\nFirst few rows:")
display(results_df.head())

# Number of result rows recorded per experiment.
print("\nExperiment breakdown:")
print(results_df['experiment_name'].value_counts())

# NOTE: value_counts() reports how many result rows each model produced,
# not a quality score, despite the heading printed below.
print("\nModel performance comparison:")
print(results_df['model_id'].value_counts())

In [None]:
# Flag result rows whose experiment name contains the substring 'error'.
# NOTE(review): this presumes the runner marks failures in 'experiment_name'
# and provides an 'error' column - confirm against ExperimentRunner.
error_mask = results_df['experiment_name'].str.contains('error', na=False)

if not error_mask.any():
    print("No processing errors found!")
else:
    print(f"Found {error_mask.sum()} results with errors:")
    display(results_df.loc[error_mask, ['experiment_name', 'audio_file', 'error']])

# Keep an independent copy of the non-error rows for all later analysis cells.
clean_results = results_df.loc[~error_mask].copy()
print(f"\nClean transcription results: {len(clean_results)} rows")

## Transcription Performance Analysis

Analyze transcription model performance across different metrics.

In [None]:
# Identify transcription metric columns
# A column counts as a metric when its name starts with one of the metric
# prefixes AND it holds numeric data. Every later cell averages these columns
# with .mean(), which cannot aggregate text columns, so a prefixed non-numeric
# column (presumably something like raw transcript text, if the runner emits
# one) is excluded here instead of breaking the analysis further down.
metric_columns = [
    col for col in clean_results.columns
    if col.startswith(('transcript_', 'safety_'))
    and pd.api.types.is_numeric_dtype(clean_results[col])
]

print(f"Identified {len(metric_columns)} transcription metric columns:")
for col in metric_columns:
    print(f"  {col}")

In [None]:
# Average every metric per (experiment, model) pair, then show a short list
# of headline metrics per experiment.
if len(clean_results):
    metric_summary = (
        clean_results
        .groupby(['experiment_name', 'model_id'])[metric_columns]
        .mean()
    )

    print("Transcription Metric Summary by Experiment:")
    display(metric_summary.round(3))

    # Headline metrics are the columns whose name contains one of these markers.
    key_markers = ['confidence', 'format_compliance', 'safety_overall']
    key_metrics = [
        col for col in metric_columns
        if any(marker in col for marker in key_markers)
    ]
    if key_metrics:
        print("\nKey Transcription Metrics:")
        per_experiment_means = clean_results.groupby('experiment_name')[key_metrics].mean()
        display(per_experiment_means.round(3))

## Transcription Performance Visualizations

Create visualizations to compare transcription model performance.

In [None]:
# Side-by-side box plots of processing time, grouped by experiment and by model.
# Skipped entirely when the results carry no 'processing_time' column.
if 'processing_time' in clean_results.columns:
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    # (grouping column, panel title) for the left and right panel respectively.
    panels = [
        ('experiment_name', 'Transcription Processing Time by Experiment'),
        ('model_id', 'Transcription Processing Time by Model'),
    ]
    for ax, (group_column, panel_title) in zip(axes, panels):
        sns.boxplot(data=clean_results, x=group_column, y='processing_time', ax=ax)
        ax.set_title(panel_title)
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

In [None]:
# Token usage analysis for transcription
# Any column whose name contains 'token' (case-insensitive) is treated as a
# token-usage column.
token_columns = [col for col in clean_results.columns if 'token' in col.lower()]

if token_columns:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    axes = axes.flatten()

    # The grid offers four panels, so at most the first four token columns
    # are plotted.
    plotted_columns = token_columns[:len(axes)]
    for ax, col in zip(axes, plotted_columns):
        sns.barplot(data=clean_results, x='experiment_name', y=col, ax=ax)
        ax.set_title(f'Transcription {col} by Experiment')
        ax.tick_params(axis='x', rotation=45)

    # Hide the panels that received no data so the figure does not show
    # empty axes frames when fewer than four token columns exist.
    for ax in axes[len(plotted_columns):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()
else:
    print("No token usage data available for analysis.")

In [None]:
# Heatmaps of per-experiment mean scores: one for safety metrics, one for
# transcript quality metrics.
def _show_metric_heatmap(data, figsize, cmap, title, ylabel):
    """Draw one annotated heatmap with metrics as rows and experiments as columns."""
    plt.figure(figsize=figsize)
    # The colour scale is centred on 0.5.
    # NOTE(review): this presumes scores lie roughly in [0, 1] - confirm.
    sns.heatmap(data.T, annot=True, cmap=cmap, center=0.5)
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Experiment')
    plt.tight_layout()
    plt.show()


safety_metrics = [name for name in metric_columns if 'safety' in name]
transcript_metrics = [name for name in metric_columns if 'transcript' in name]

if safety_metrics:
    safety_data = clean_results.groupby('experiment_name')[safety_metrics].mean()
    _show_metric_heatmap(
        safety_data,
        figsize=(12, 6),
        cmap='RdYlBu_r',
        title='Safety Metrics by Transcription Experiment',
        ylabel='Safety Metrics',
    )

if transcript_metrics:
    quality_data = clean_results.groupby('experiment_name')[transcript_metrics].mean()
    _show_metric_heatmap(
        quality_data,
        figsize=(12, 8),
        cmap='RdYlGn',
        title='Transcript Quality Metrics by Experiment',
        ylabel='Transcript Quality Metrics',
    )

## Interactive Transcription Visualizations

Create interactive plots for transcription performance analysis.

In [None]:
# Interactive scatter plot of transcription metrics
if len(clean_results) > 0:
    # Find transcription-specific metrics to plot
    x_metric = None
    y_metric = None

    # The first column containing 'confidence' becomes the x axis; the first
    # column containing both 'safety' and 'overall' becomes the y axis.
    for col in metric_columns:
        if 'confidence' in col and x_metric is None:
            x_metric = col
        elif 'safety' in col and 'overall' in col and y_metric is None:
            y_metric = col

    if x_metric and y_metric:
        import plotly.express as px

        # Marker size is optional: other cells treat 'processing_time' as a
        # column that may be absent, so only use it when it exists. Rows
        # without a timing value are dropped because a marker size cannot be
        # derived from a missing value.
        size_column = 'processing_time' if 'processing_time' in clean_results.columns else None
        if size_column:
            scatter_data = clean_results.dropna(subset=[size_column])
        else:
            scatter_data = clean_results

        fig = px.scatter(
            scatter_data,
            x=x_metric,
            y=y_metric,
            color='experiment_name',
            size=size_column,
            hover_data=['model_id', 'audio_file'],
            title=f'Transcription: {x_metric} vs {y_metric}'
        )
        fig.show()
    else:
        print("Suitable transcription metrics for scatter plot not found")
        print(f"Available metrics: {metric_columns}")

In [None]:
# Radar chart comparing experiments on up to five metrics.
if len(metric_columns) < 3:
    print(f"Need at least 3 metrics for radar chart. Found {len(metric_columns)}: {metric_columns}")
else:
    # Only the first five metric columns are drawn.
    selected_metrics = metric_columns[:5]

    # One row of mean scores per experiment.
    radar_data = clean_results.groupby('experiment_name')[selected_metrics].mean()

    import plotly.graph_objects as go
    fig = go.Figure()

    # One filled polygon per experiment.
    for experiment, mean_scores in radar_data.iterrows():
        trace = go.Scatterpolar(
            r=mean_scores.values,
            theta=selected_metrics,
            fill='toself',
            name=experiment
        )
        fig.add_trace(trace)

    # The radial axis is fixed to [0, 1].
    # NOTE(review): presumes the metrics are normalised to that range - confirm.
    radial_axis = dict(visible=True, range=[0, 1])
    fig.update_layout(
        polar=dict(radialaxis=radial_axis),
        showlegend=True,
        title="Transcription Model Performance Comparison (Radar Chart)"
    )

    fig.show()

## Detailed Transcription Analysis

Perform a detailed analysis of the transcription quality metrics.

In [None]:
# Restrict the clean results to rows whose use case is 'transcription' and
# summarise their metrics per experiment and per model.
is_transcription = clean_results['use_case'] == 'transcription'
transcription_results = clean_results[is_transcription]

print(f"Transcription experiments: {len(transcription_results)}")

if len(transcription_results) == 0:
    print("No transcription results found.")
else:
    # Quality metrics: columns whose name contains 'transcript'.
    trans_metrics = [name for name in metric_columns if 'transcript' in name]
    if trans_metrics:
        print("\nTranscription Quality Metrics Summary:")
        quality_means = transcription_results.groupby('experiment_name')[trans_metrics].mean()
        display(quality_means.round(3))

    # Safety metrics: columns whose name contains 'safety'.
    safety_metrics = [name for name in metric_columns if 'safety' in name]
    if safety_metrics:
        print("\nSafety Metrics Summary:")
        safety_means = transcription_results.groupby('experiment_name')[safety_metrics].mean()
        display(safety_means.round(3))

    # Mean of every metric per model.
    print("\nModel Comparison:")
    model_comparison = transcription_results.groupby('model_id')[metric_columns].mean()
    display(model_comparison.round(3))

In [None]:
# Two-sample t-tests between the first two experiments on the first five metrics.
from scipy.stats import ttest_ind


def _significance_stars(p):
    """Return '***', '**', '*' or '' for p below 0.001, 0.01, 0.05 or none of them."""
    for threshold, stars in ((0.001, "***"), (0.01, "**"), (0.05, "*")):
        if p < threshold:
            return stars
    return ""


if len(clean_results['experiment_name'].unique()) < 2:
    print("Need at least 2 experiments for statistical comparison.")
else:
    # Only the first two experiments (in order of appearance) are compared.
    experiments = clean_results['experiment_name'].unique()[:2]

    exp1_data = clean_results[clean_results['experiment_name'] == experiments[0]]
    exp2_data = clean_results[clean_results['experiment_name'] == experiments[1]]

    print(f"Statistical comparison: {experiments[0]} vs {experiments[1]}")
    print("="*60)

    # Only the first five metric columns are tested.
    test_metrics = metric_columns[:5]

    for metric in test_metrics:
        if metric not in exp1_data.columns or metric not in exp2_data.columns:
            continue

        # Missing values are excluded before testing.
        exp1_values = exp1_data[metric].dropna()
        exp2_values = exp2_data[metric].dropna()

        # Each sample needs at least two observations.
        if len(exp1_values) <= 1 or len(exp2_values) <= 1:
            continue

        statistic, p_value = ttest_ind(exp1_values, exp2_values)
        significance = _significance_stars(p_value)

        print(f"{metric:30} | t={statistic:6.3f} | p={p_value:6.3f} {significance}")
        print(f"{'':30} | Mean1={exp1_values.mean():6.3f} | Mean2={exp2_values.mean():6.3f}")
        print("-" * 60)

## Export Transcription Results

Export transcription results for further analysis or reporting.

In [None]:
# Create transcription summary report
import json

# 'total_files_processed' counts distinct audio files, matching how the
# conclusions cell reports files; the raw row count (one row per experiment
# and file) is kept separately as 'total_results'.
if 'audio_file' in clean_results.columns:
    files_processed = len(clean_results['audio_file'].unique())
else:
    files_processed = len(clean_results)

processing_stats = {
    'total_results': len(clean_results),
    'total_files_processed': files_processed,
}

# Timing statistics are only added when the results carry a 'processing_time'
# column; other cells treat that column as optional, so the export must not
# fail with a KeyError when it is missing. float() keeps the values plain JSON
# numbers instead of falling back to the default=str conversion.
if 'processing_time' in clean_results.columns:
    processing_stats['avg_processing_time'] = float(clean_results['processing_time'].mean())
    processing_stats['total_processing_time'] = float(clean_results['processing_time'].sum())

report_data = {
    'transcription_experiment_summary': runner.get_experiment_summary(),
    'transcription_metric_averages': clean_results.groupby('experiment_name')[metric_columns].mean().to_dict(),
    'processing_stats': processing_stats,
    'model_comparison': clean_results.groupby('model_id')[metric_columns].mean().to_dict()
}

# Save to local file
# default=str stringifies any value json cannot encode natively.
with open('transcription_experiment_report.json', 'w', encoding='utf-8') as f:
    json.dump(report_data, f, indent=2, default=str)

print("Transcription experiment report saved to 'transcription_experiment_report.json'")

# Also save the full results as CSV
clean_results.to_csv('transcription_detailed_results.csv', index=False)
print("Detailed transcription results saved to 'transcription_detailed_results.csv'")

## Transcription Experiment Conclusions

Summarize key findings from the transcription experiments.

In [None]:
# Print the headline findings: best experiment/model, fastest experiment,
# safest experiment and highest transcript confidence.
print("Transcription Experiment Conclusions:")
print("="*50)

if len(clean_results) > 0:
    has_timing = 'processing_time' in clean_results.columns

    # Safety scores are treated as "lower is better" (see the safety analysis
    # below), so they must not be averaged into a score that is maximised.
    # The overall ranking therefore uses the non-safety metrics only.
    # NOTE(review): presumes the remaining metrics are all higher-is-better - confirm.
    higher_is_better = [col for col in metric_columns if 'safety' not in col]

    # Best performing experiment by average metric score
    if higher_is_better:
        avg_scores = clean_results.groupby('experiment_name')[higher_is_better].mean().mean(axis=1).dropna()
        # idxmax() cannot pick a winner from an empty/all-NaN series.
        if not avg_scores.empty:
            best_experiment = avg_scores.idxmax()
            print(f"Best overall transcription performance: {best_experiment} (avg score: {avg_scores[best_experiment]:.3f})")

    # Best model comparison (needs metrics and at least two models)
    if higher_is_better and len(clean_results['model_id'].unique()) > 1:
        model_scores = clean_results.groupby('model_id')[higher_is_better].mean().mean(axis=1).dropna()
        if not model_scores.empty:
            best_model = model_scores.idxmax()
            print(f"Best performing model: {best_model} (avg score: {model_scores[best_model]:.3f})")

    # Fastest experiment
    if has_timing:
        avg_times = clean_results.groupby('experiment_name')['processing_time'].mean()
        fastest_experiment = avg_times.idxmin()
        print(f"Fastest transcription processing: {fastest_experiment} (avg time: {avg_times[fastest_experiment]:.2f}s)")

    # Safety analysis
    safety_cols = [col for col in metric_columns if 'safety' in col and 'overall' in col]
    if safety_cols:
        safety_scores = clean_results.groupby('experiment_name')[safety_cols].mean().mean(axis=1)
        safest_experiment = safety_scores.idxmin()  # Lower safety scores are better
        print(f"Safest transcription content: {safest_experiment} (avg safety score: {safety_scores[safest_experiment]:.3f})")

    # Transcript quality analysis
    quality_cols = [col for col in metric_columns if 'transcript_confidence' in col]
    if quality_cols:
        quality_scores = clean_results.groupby('experiment_name')[quality_cols].mean().mean(axis=1)
        best_quality = quality_scores.idxmax()
        print(f"Highest transcript quality: {best_quality} (avg confidence: {quality_scores[best_quality]:.3f})")

    print(f"\nTranscription experiments completed: {len(clean_results['experiment_name'].unique())}")
    print(f"Total audio files processed: {len(clean_results['audio_file'].unique())}")
    # Same guard as the "fastest experiment" section: timing data is optional.
    if has_timing:
        print(f"Total processing time: {clean_results['processing_time'].sum():.2f} seconds")
else:
    print("No transcription results to analyze.")

print("\nTranscription experiment run completed successfully!")