<a href="https://colab.research.google.com/github/angelatyk/tinytutor/blob/dev/notebooks/02_audio_generation_experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AudioWriter agent for converting scripts to speech

This agent reads the script generated by the ScriptWriter agent (e.g., script.txt) and converts it into high-quality audio using Google Cloud Text-to-Speech. It includes secure credential handling in Colab, chunking for long scripts, configurable voice settings, and saves an MP3 ready for playback or download.

## Install and import libraries

In [1]:
# Install Google Cloud Text-to-Speech client and helpers
!pip install -q google-cloud-texttospeech pydub

# System and Google Cloud imports
import os
import json
from pathlib import Path
from typing import List, Tuple

from google.colab import userdata
from google.cloud import texttospeech

# Audio handling (optional: combine segments safely)
from pydub import AudioSegment

# For quick playback in Colab
from IPython.display import Audio, display

[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/192.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[90m‚ï∫[0m[90m‚îÅ[0m [32m184.3/192.2 kB[0m [31m12.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m192.2/192.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h

  m = re.match('([su]([0-9]{1,2})p?) \(([0-9]{1,2}) bit\)$', token)
  m2 = re.match('([su]([0-9]{1,2})p?)( \(default\))?$', token)
  elif re.match('(flt)p?( \(default\))?$', token):
  elif re.match('(dbl)p?( \(default\))?$', token):


## Credentials setup and configuration


In [2]:
# Load service account JSON securely from Colab secrets
SERVICE_ACCOUNT_JSON = userdata.get('GCP_VI_SERVICE_ACCOUNT_JSON')
if not SERVICE_ACCOUNT_JSON:
    raise RuntimeError("Secret 'GCP_VI_SERVICE_ACCOUNT_JSON' not found in Colab. Add your service account JSON to Colab secrets.")

# Fail early, with a readable message, if the secret is not parseable JSON;
# otherwise the problem would only surface later when the client loads the
# credentials file.
try:
    json.loads(SERVICE_ACCOUNT_JSON)
except json.JSONDecodeError as err:
    raise RuntimeError(
        "Secret 'GCP_VI_SERVICE_ACCOUNT_JSON' does not contain valid JSON. "
        "Paste the full contents of the service account key file into the Colab secret."
    ) from err

# Persist credentials to a file for the SDK. The file holds a private key, so
# restrict it to the current user *before* the secret is written: create it
# owner-only, and tighten permissions in case it already existed from an
# earlier run with looser ones.
CRED_PATH = Path("gcp_tts_sa.json")
CRED_PATH.touch(mode=0o600, exist_ok=True)
CRED_PATH.chmod(0o600)
CRED_PATH.write_text(SERVICE_ACCOUNT_JSON, encoding="utf-8")

# Point the Google SDK to your credentials
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = str(CRED_PATH)

# Optional: set your project ID manually if needed for quota scoping (not required for TTS client)
# os.environ["GOOGLE_CLOUD_PROJECT"] = "your-project-id"

# Initialize the Text-to-Speech client
tts_client = texttospeech.TextToSpeechClient()
print("‚úÖ Google Cloud TTS client initialized.")

‚úÖ Google Cloud TTS client initialized.


## Utility: robust text chunking for long scripts

In [3]:
def chunk_text(
    text: str,
    max_chars: int = 4500,
    prefer_breaks: Tuple[str, ...] = ("\n\n", "\n", ". ")
) -> List[str]:
    """
    Split text into whitespace-trimmed chunks of at most ``max_chars`` characters.

    The text is stripped first. Text that already fits (including empty text)
    is returned unchanged as a single-element list, so callers that must not
    send empty input downstream should check for it themselves.

    For longer text, each chunk is cut at the last occurrence, within the
    first ``max_chars`` characters, of the first separator in ``prefer_breaks``
    that occurs there at all. Separators are tried in order, so an earlier
    separator wins even if a later one would allow a longer chunk. When no
    separator is found the text is hard-split at ``max_chars``.

    Args:
        text: The text to split.
        max_chars: Maximum length of each chunk, in characters. Must be >= 1.
        prefer_breaks: Separators to break on, in priority order.

    Returns:
        The chunks, in order, each stripped of surrounding whitespace.

    Raises:
        ValueError: If ``max_chars`` is less than 1.

    NOTE(review): the limit is counted in characters; if the speech service
    limits requests by encoded byte size, non-ASCII scripts may need a smaller
    ``max_chars`` - confirm against the service's quota documentation.
    """
    # A non-positive limit would make every split empty, so the loop below
    # would never consume any text and never terminate.
    if max_chars < 1:
        raise ValueError(f"max_chars must be a positive integer, got {max_chars!r}")

    text = text.strip()
    if len(text) <= max_chars:
        return [text]

    chunks: List[str] = []
    remaining = text

    while len(remaining) > max_chars:
        # Default to a hard split; replaced below if a preferred separator
        # occurs entirely inside the first max_chars characters.
        break_idx = max_chars
        for sep in prefer_breaks:
            idx = remaining.rfind(sep, 0, max_chars)
            if idx != -1:
                # Keep the separator with the chunk being closed; the strip()
                # calls below drop any whitespace it contributes.
                break_idx = idx + len(sep)
                break

        chunks.append(remaining[:break_idx].strip())
        remaining = remaining[break_idx:].strip()

    # Whatever is left fits in one chunk; skip it if stripping emptied it.
    if remaining:
        chunks.append(remaining)

    return chunks

## Core synthesis function

In [4]:
def synthesize_speech_segment(
    client: texttospeech.TextToSpeechClient,
    text: str,
    language_code: str = "en-US",
    voice_name: str = "en-US-Neural2-C",
    speaking_rate: float = 1.0,
    pitch: float = 0.0,
    volume_gain_db: float = 0.0,
    audio_encoding: texttospeech.AudioEncoding = texttospeech.AudioEncoding.MP3
) -> bytes:
    """
    Convert one piece of text to speech with a single synthesis request.

    The text, voice selection (language code plus voice name) and audio
    settings (encoding, speaking rate, pitch, volume gain) are wrapped in the
    corresponding `texttospeech` request objects and sent through `client`.

    Returns the `audio_content` of the response, i.e. the encoded audio bytes
    (MP3 unless another `audio_encoding` is passed).
    """
    # Build the three request parts inline, in the order the API names them.
    tts_response = client.synthesize_speech(
        input=texttospeech.SynthesisInput(text=text),
        voice=texttospeech.VoiceSelectionParams(
            language_code=language_code,
            name=voice_name,
        ),
        audio_config=texttospeech.AudioConfig(
            audio_encoding=audio_encoding,
            speaking_rate=speaking_rate,
            pitch=pitch,
            volume_gain_db=volume_gain_db,
        ),
    )
    return tts_response.audio_content

## High-level agent: read script, chunk, synthesize, and combine


In [5]:
def audio_writer_agent(
    script_path: str = "script.txt",
    output_mp3_path: str = "script_audio.mp3",
    language_code: str = "en-US",
    voice_name: str = "en-US-Journey-F",
    speaking_rate: float = 0.94,
    pitch: float = 0.0,
    volume_gain_db: float = 0.0
) -> str:
    """
    Reads the script from `script_path`, converts it to audio, and saves an MP3 to `output_mp3_path`.

    The script is split with `chunk_text`, each chunk is synthesized through
    the module-level `tts_client`, and every chunk's audio is written to
    `segment_NN.mp3` in the current working directory. The segments are then
    joined with pydub (250 ms of silence up front, 200 ms after every segment)
    and exported as a single MP3.

    Args:
        script_path: Path of the UTF-8 text file containing the script.
        output_mp3_path: Path the combined MP3 is written to.
        language_code: Language code passed to the voice selection.
        voice_name: Voice name passed to the voice selection.
        speaking_rate: Speaking rate passed to the audio config.
        pitch: Pitch passed to the audio config.
        volume_gain_db: Volume gain passed to the audio config.

    Returns:
        The output file path (`output_mp3_path`).

    Raises:
        FileNotFoundError: If `script_path` does not exist.
        ValueError: If the script contains no text other than whitespace.
    """

    # Load the script
    if not Path(script_path).exists():
        raise FileNotFoundError(f"Script file not found: {script_path}")

    with open(script_path, "r", encoding="utf-8") as f:
        script_text = f.read()

    # Reject an empty script here, before any request is made: there is
    # nothing to speak, and a local error names the offending file.
    if not script_text.strip():
        raise ValueError(f"Script file is empty: {script_path}")

    # Chunk the script safely
    chunks = chunk_text(script_text)
    print(f"üß© Chunking: {len(chunks)} segment(s) prepared for TTS.")

    # Synthesize each chunk
    segment_files = []
    for i, chunk in enumerate(chunks, start=1):
        audio_bytes = synthesize_speech_segment(
            client=tts_client,
            text=chunk,
            language_code=language_code,
            voice_name=voice_name,
            speaking_rate=speaking_rate,
            pitch=pitch,
            volume_gain_db=volume_gain_db,
            audio_encoding=texttospeech.AudioEncoding.MP3
        )
        # Save each segment MP3 (for safety, keep per-segment files)
        seg_path = Path(f"segment_{i:02d}.mp3")
        with open(seg_path, "wb") as f:
            f.write(audio_bytes)
        segment_files.append(seg_path)
        print(f"üéôÔ∏è Generated segment {i}: {seg_path}")

    # Combine segments into one MP3 using pydub
    combined = AudioSegment.silent(duration=250)  # small lead-in
    for seg in segment_files:
        combined += AudioSegment.from_mp3(seg) + AudioSegment.silent(duration=200)  # short pause

    combined.export(output_mp3_path, format="mp3")
    print(f"‚úÖ Combined audio saved: {output_mp3_path}")

    return output_mp3_path

## Quick test run

In [8]:
# Ensure script.txt exists (generated by your ScriptWriter Agent)
# If you want to test quickly, write a tiny sample:
# with open("script.txt", "w") as f:
#     f.write("TITLE: The Curious Robot\n\nSCENE 1: A small robot learns about colors.\nNarrator: Today we explore red blocks!\nLEARNING CHECK 1: What color did we pick?\n")

# Settings for the demo run, gathered in one place so they are easy to tweak.
# Other voices to try: en-US-Neural2-D, en-GB-Neural2-A, pt-BR-Neural2-B, etc.
run_config = {
    "script_path": "script.txt",
    "output_mp3_path": "script_audio.mp3",
    "language_code": "en-US",
    "voice_name": "en-US-Journey-F",
    "speaking_rate": 0.94,
    "pitch": 0.0,
    "volume_gain_db": 0.0,
}

# Generate the audio, then embed a player for the resulting file in the notebook.
out_path = audio_writer_agent(**run_config)
display(Audio(filename=out_path))

üß© Chunking: 1 segment(s) prepared for TTS.
üéôÔ∏è Generated segment 1: segment_01.mp3
‚úÖ Combined audio saved: script_audio.mp3
