# Call Summarization Experiment

This notebook runs the Call Summarization Experiment across multiple LLM configurations, evaluates each run with advanced metrics (ROUGE and LLM-as-a-Judge), and writes the results to BigQuery.

In [None]:
import os
import sys
import pandas as pd

# Make the project root (the parent of this file's directory) importable so
# the `agents`, `orchestration` and `core` packages resolve.
try:
    _base_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # `__file__` is not defined when this code runs as a notebook cell or in
    # an interactive session; fall back to the current working directory.
    _base_dir = os.getcwd()

project_root = os.path.abspath(os.path.join(_base_dir, '..'))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from agents.call_summarization_agent import CallSummarizationAgent
from orchestration.experiment_runner import ExperimentRunner
from orchestration.bigquery_writer import BigQueryWriter
from core.gcp_client import (
    get_gcs_audio_bucket_name,
    get_gcs_audio_dataset_paths,
    get_bq_summarization_table_id,
    config as main_global_config
)

def run_summarization_experiment():
    """Run the call summarization experiment and store its results.

    The function takes no arguments and always returns ``None``; progress and
    errors are reported with ``print``. It performs these steps in order:

    1. Build an ``ExperimentRunner`` for ``CallSummarizationAgent`` with ROUGE
       and LLM-as-a-Judge evaluation enabled.
    2. Resolve the GCS bucket and the ``'sample_summarization_set'`` audio
       folder, returning early if either is missing or still a placeholder.
    3. Keep only the requested LLM configuration names that are defined in
       the main config, returning early if none remain.
    4. Run the experiment with a global ``temperature`` override.
    5. Print a sample of the metric columns and of the result entries.
    6. Write the results to BigQuery; a failure here is printed, not raised.

    Exceptions raised by the runner itself are not caught and propagate to
    the caller.
    """
    print("--- Starting Call Summarization Experiment (Multi-LLM with Advanced Metrics) ---")

    # 1. Initialize Experiment Runner with Advanced Metrics
    # NOTE(review): the judge config below is also the first configuration
    # under test in step 3, so that model would grade its own summaries.
    # Confirm this is intended, or point the judge at a different config.
    runner = ExperimentRunner(
        agent_class=CallSummarizationAgent,
        use_advanced_metrics=True,  # Enable ROUGE and LLM-as-a-Judge
        judge_llm_config_name="gemini_1_5_flash_default",
        include_llm_judge=True,
        include_rouge=True,
    )
    print("Initialized ExperimentRunner with advanced metrics (ROUGE + LLM-as-a-Judge).")

    # 2. Define Experiment Parameters
    gcs_bucket = get_gcs_audio_bucket_name()
    # Tolerate a missing dataset-paths section instead of failing on `.get`.
    dataset_paths = get_gcs_audio_dataset_paths() or {}
    dataset_key = 'sample_summarization_set'
    audio_folder_path = dataset_paths.get(dataset_key)

    # Check for missing values first: the placeholder tests use `in`, which
    # raises TypeError when the bucket or path is None.
    if (
        not gcs_bucket
        or not audio_folder_path
        or 'your-gcs-bucket' in gcs_bucket
        or 'path/to' in audio_folder_path
    ):
        print(f"ERROR: Please configure GCS bucket and '{dataset_key}' path in configs/main_config.yaml.")
        return
    print(f"Targeting GCS: gs://{gcs_bucket}/{audio_folder_path}")

    # 3. Specify LLM configurations to test (names from main_config.yaml)
    # A missing or empty section anywhere along the path is treated the same
    # as "no configurations", so the explicit error below is what the user
    # sees rather than a bare KeyError/TypeError.
    try:
        llm_configurations = main_global_config['gcp']['vertex_ai']['llm_configurations']
    except (KeyError, TypeError):
        llm_configurations = None
    all_defined_llm_configs = list(llm_configurations.keys()) if llm_configurations else []
    if not all_defined_llm_configs:
        print("ERROR: No LLM configurations found in 'configs/main_config.yaml' under 'vertex_ai.llm_configurations'.")
        return

    # Test specific configurations with different parameters; names that are
    # not defined in the config are silently dropped.
    requested_llm_configs = ["gemini_1_5_flash_default", "gemini_1_0_pro_concise"]
    llm_configs_to_test = [
        name for name in requested_llm_configs if name in all_defined_llm_configs
    ]
    if not llm_configs_to_test:
        print(f"ERROR: None of the specified llm_configs_to_test were found in main_config.yaml. Available: {all_defined_llm_configs}")
        return

    print(f"LLM configurations to be tested: {llm_configs_to_test}")

    # 4. Optional: Global overrides for this experiment run.
    # Further overrides (for example "max_output_tokens") can be added here.
    global_llm_params_override = {
        "temperature": 0.15,
    }
    print(f"Global LLM parameter overrides: {global_llm_params_override}")

    # None means "no override"; the values are passed through to the runner.
    global_prompt_override = None
    global_system_prompt_override = None

    # 5. Run the Experiment
    print("\nStarting experiment with advanced evaluation metrics...")
    results = runner.run_experiment(
        gcs_audio_folder_path=audio_folder_path,
        gcs_bucket_name=gcs_bucket,
        llm_config_names=llm_configs_to_test,
        global_llm_parameters_override=global_llm_params_override,
        global_prompt_override=global_prompt_override,
        global_system_prompt_override=global_system_prompt_override,
    )

    if not results:
        print("Experiment finished but produced no results.")
        return
    print(f"\nExperiment completed. Generated {len(results)} result entries.")

    # Display sample results with new metrics (at most 10 columns, 2 rows).
    results_df = pd.DataFrame(results)
    print("\nSample of new evaluation metrics:")
    metric_columns = [col for col in results_df.columns if 'rouge' in col or 'llm_judge' in col]
    if metric_columns:
        print(results_df[metric_columns[:10]].head(2).to_string())

    # Only select identifying columns that actually exist: this preview is
    # informational and must not raise KeyError before the results are
    # persisted in step 6.
    print("\nFirst 2 experiment result entries:")
    preview_columns = [
        col
        for col in ('run_id', 'agent_type', 'llm_config_name_attempted')
        if col in results_df.columns
    ]
    if preview_columns:
        print(results_df.head(2)[preview_columns].to_string())

    # 6. Write Results to BigQuery
    # Best-effort boundary: any failure is reported and the function still
    # finishes normally.
    try:
        bq_writer = BigQueryWriter()
        summarization_table_id = get_bq_summarization_table_id()
        print(f"\nAttempting to write {len(results)} results to BigQuery table: {bq_writer.dataset_id}.{summarization_table_id}")
        bq_writer.write_results(results, summarization_table_id)
        print("Successfully wrote results with advanced metrics to BigQuery.")
    except Exception as e:
        print(f"Error writing results to BigQuery: {e}")

    print("--- Call Summarization Experiment Finished ---")

# Run the experiment.
# This call sits at module top level, so it executes as soon as the cell (or
# the file) is run. The function takes no arguments and reports its progress
# and any configuration errors via print().
run_summarization_experiment()