# Audio Transcription and Summarization

This notebook processes audio files from GCS, generates summaries and transcriptions using the Gemini API, and displays them side-by-side.

In [None]:
import os
import sys
import yaml
import pandas as pd
from IPython.display import display, HTML

# Add src directory to path
sys.path.append('../src')

from clients.gcs_client import GCSClient
from clients.gemini_client import GeminiClient
from utils.prompt_manager import PromptManager

In [None]:
# --- CONFIGURATION ---
CONFIG_PATH = '../config/sample_experiments.yaml'

# Fallback generation parameters, used for every key that the YAML file's
# 'generation_config' section does not set.
DEFAULT_GENERATION_CONFIG = {
    "temperature": 0.2,
    "top_p": 1.0,
    "top_k": 32,
    "max_output_tokens": 8192,
}

# Load YAML config
try:
    with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document; normalise to a dict so
        # the .get() lookups below keep working.
        config = yaml.safe_load(f) or {}
    print("Configuration loaded successfully.")
except FileNotFoundError:
    # Best effort: continue with an empty config so the notebook still runs;
    # every setting read below will then be None or a default.
    print(f"ERROR: Configuration file not found at {CONFIG_PATH}")
    config = {}

project_id = config.get('project')
location = config.get('location')
bucket_name = config.get('bucket')

SUMMARY_PROMPT_ID = config.get('summary_prompt_id')
TRANSCRIPTION_PROMPT_ID = config.get('transcription_prompt_id')

MODEL_ID = config.get('model_id')

# Values from the YAML file take precedence over the defaults. Previously the
# configured value was read and then immediately overwritten by the hard-coded
# dict, so it never had any effect.
GENERATION_CONFIG = {
    **DEFAULT_GENERATION_CONFIG,
    **(config.get('generation_config') or {}),
}

In [None]:
# Build the three service clients used by the remaining cells.
gcs_client = GCSClient(bucket_name=bucket_name)

prompt_manager = PromptManager(
    project_id=project_id,
    location=location,
)

# The Gemini client receives the project settings as a config dict rather
# than as separate keyword arguments.
gemini_settings = {'project_id': project_id, 'location': location}
gemini_client = GeminiClient(model_id=MODEL_ID, config=gemini_settings)

print("Clients initialised.")

In [None]:
# Read the 'gcs_files' patterns from the config (empty list when absent) and
# ask the GCS client for the matching audio files.
gcs_patterns = config.get('gcs_files', [])
audio_files = gcs_client.list_audio_files(gcs_patterns)

# Report how many files were found, then show them as a one-column table.
audio_file_count = len(audio_files)
print(f"Found {audio_file_count} audio files.")

audio_file_table = pd.DataFrame(audio_files, columns=['Audio File URI'])
display(audio_file_table)

In [None]:
# Load prompts
#
# Both prompts are required by the processing cell below. A failure here is
# reported and then re-raised: continuing would leave `summary_prompt` /
# `transcription_prompt` undefined (or stale from an earlier run of this cell)
# and every audio file would then fail, or be processed with the wrong prompt.
try:
    summary_prompt = prompt_manager.load(SUMMARY_PROMPT_ID)
    transcription_prompt = prompt_manager.load(TRANSCRIPTION_PROMPT_ID)
except Exception as e:
    print(f"Error loading prompts: {e}")
    raise
else:
    print("Successfully loaded prompts.")
    print("\n--- Summary Prompt ---")
    print(summary_prompt)
    print("\n--- Transcription Prompt ---")
    print(transcription_prompt)

In [None]:
# One record per processed audio file; reset each time this cell is run.
results = []

# File extension (lower-case, without the dot) -> audio MIME type.
# Built once here instead of on every call.
_AUDIO_MIME_TYPES = {
    'wav': 'audio/wav',
    'mp3': 'audio/mpeg',
    'm4a': 'audio/mp4',
    'flac': 'audio/flac',
    'ogg': 'audio/ogg',
    'aac': 'audio/aac',
    'aiff': 'audio/aiff',
    'aif': 'audio/aiff',
}

# Returned when the extension is missing or not in the table above.
_DEFAULT_AUDIO_MIME_TYPE = 'audio/wav'


def get_mime_type_from_path(path: str) -> str:
    """Return the audio MIME type implied by the extension of *path*.

    The extension is the text after the last '.' in *path*, compared
    case-insensitively.

    Args:
        path: File path or object name, e.g. ``"calls/2024/rec.MP3"``.

    Returns:
        The MIME type mapped to the extension, or ``'audio/wav'`` when the
        path has no extension or the extension is not recognised.
    """
    # rpartition yields the whole string as the last element when there is no
    # '.', which then simply misses the table and falls back to the default.
    extension = path.lower().rpartition('.')[2]
    return _AUDIO_MIME_TYPES.get(extension, _DEFAULT_AUDIO_MIME_TYPE)

# Run both prompts against every audio file. Each file gets exactly one entry
# in `results`: either the generated texts or, if any step raised, the error
# message in both text fields.
for audio_file in audio_files:
    print(f"Processing {audio_file}...")
    try:
        # Only the second element of the parsed URI is used for the download.
        _bucket, blob_path = gcs_client.parse_gcs_uri(audio_file)
        audio_bytes = gcs_client.download_bytes(blob_path)
        audio_mime = get_mime_type_from_path(blob_path)

        # First request: summary.
        summary_text = gemini_client.generate_from_audio(
            audio_data=audio_bytes,
            prompt=summary_prompt,
            generation_config=GENERATION_CONFIG,
            mime_type=audio_mime,
        )['response_text']

        # Second request: transcription of the same audio bytes.
        transcription_text = gemini_client.generate_from_audio(
            audio_data=audio_bytes,
            prompt=transcription_prompt,
            generation_config=GENERATION_CONFIG,
            mime_type=audio_mime,
        )['response_text']

        record = {
            'audio_file': audio_file,
            'summary': summary_text,
            'transcription': transcription_text,
        }
        results.append(record)
        print(f"Successfully processed {audio_file}")

    except Exception as e:
        # Keep going with the remaining files; the failure is recorded in
        # place of both texts so it shows up in the results table.
        print(f"Error processing {audio_file}: {e}")
        failure_text = f"Error: {e}"
        results.append({
            'audio_file': audio_file,
            'summary': failure_text,
            'transcription': failure_text,
        })

print("\nFinished processing all audio files.")

In [None]:
# Display results in a side-by-side format
if results:
    # Imported in the cell so it is self-contained. Summaries, transcriptions
    # and error messages are free text; escaping stops '<', '>' and '&' in
    # them from being interpreted as markup and breaking the table.
    from html import escape

    table_header = """
    <style>
        table { width: 100%; border-collapse: collapse; }
        th, td { border: 1px solid #ddd; padding: 8px; }
        th { background-color: #f2f2f2; }
        td { vertical-align: top; white-space: pre-wrap; }
    </style>
    <h1>Summarization and Transcription Results</h1>
    <table>
        <tr>
            <th>Audio File</th>
            <th>Summary</th>
            <th>Transcription</th>
        </tr>
    """

    # One <tr> per result. str() keeps non-string values (e.g. None)
    # renderable, matching what f-string interpolation did before.
    table_rows = [
        f"""
        <tr>
            <td>{escape(str(result['audio_file']))}</td>
            <td>{escape(str(result['summary']))}</td>
            <td>{escape(str(result['transcription']))}</td>
        </tr>
        """
        for result in results
    ]

    results_html = table_header + "".join(table_rows) + "</table>"

    display(HTML(results_html))
else:
    print("No results to display.")