<a href="https://colab.research.google.com/github/arssite/GENAi-Assessment/blob/main/Dual_Speaker_Audio_Generation_Script_for_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# @title Step 1: Setup and Audio Generation
#
# This single cell handles everything: installation, setup, and processing.
# 1. Installs the required Google Cloud Text-to-Speech library.
# 2. Sets up the necessary functions to process your scripts.
# 3. Reads all .txt files from the /content/scripts/ folder.
# 4. Generates a dual-speaker MP3 for each script.
#
# ==> Just press the "Play" button on this cell to start.

import os
import re
from google.colab import files

In [39]:
# @title Step 1: Setup and Audio Generation with Gemini 2.5 Flash TTS
#
# This single cell handles everything: setup, parsing, and audio generation.
# 1. Sets up the necessary functions to process your scripts for the Gemini TTS API.
# 2. Reads all .txt files from the /content/scripts/ folder.
# 3. Generates a multi-speaker WAV audio file for each script using gemini-2.5-flash-preview-tts.
#
# ==> Just press the "Play" button on this cell to start.

import os
import re
import requests
import base64
import wave
import struct
from google.colab import files

# --- Configuration ---
# IMPORTANT: Replace the placeholder key with your actual Gemini API key.
api_key = "AIzaSyBvwzocmntpyrLburtWU_Vnz29hf4OBjIo" # <-- IMPORTANT: PASTE YOUR REAL API KEY HERE

# Define the voices for the characters using prebuilt Gemini TTS voices.
# We've chosen voices that have a clear Indian English accent.
VOICE_CONFIG = {
    "Dr. Science": "Kore",      # A firm, knowledgeable female-sounding voice for explaining.
    "Anmol": "Enceladus"      # A friendly, male-sounding voice for asking.
}

# Define the folder paths for input scripts and output audio
INPUT_SCRIPTS_FOLDER = "/content/scripts/"
OUTPUT_AUDIO_FOLDER = "/content/audio_output/"

# --- Main Setup ---
print("--- Gemini 2.5 Flash TTS Audio Generation ---")
print("\nCreating necessary folders...")
os.makedirs(INPUT_SCRIPTS_FOLDER, exist_ok=True)
os.makedirs(OUTPUT_AUDIO_FOLDER, exist_ok=True)
print(f"Input folder: {INPUT_SCRIPTS_FOLDER}")
print(f"Output folder: {OUTPUT_AUDIO_FOLDER}")
print("\nIMPORTANT: Please upload your .txt script files to the 'scripts' folder on the left.")


# --- Core Functions ---

def parse_script_to_dialogue(file_path):
    """
    Reads a script file and formats it into a single string for the Gemini API.
    The API will use the speaker names (e.g., "Dr. Chen:") to assign voices.
    """
    dialogue_lines = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                stripped_line = line.strip()
                # We only care about lines that start with a character's name.
                if stripped_line.startswith("Dr. Chen:") or stripped_line.startswith("Alex:"):
                    # Remove bracketed notes like "[...]" from the dialogue
                    clean_line = re.sub(r'\[.*?\]', '', stripped_line).strip()
                    dialogue_lines.append(clean_line)
    except FileNotFoundError:
        print(f"Error: Script file not found at {file_path}")
        return None
    except Exception as e:
        print(f"An error occurred while parsing {file_path}: {e}")
        return None

    # Join all dialogue lines into a single string, separated by newlines.
    return "\n".join(dialogue_lines) if dialogue_lines else None


def pcm_to_wav(pcm_data, sample_rate, output_filename):
    """
    Converts raw PCM audio data (16-bit) into a standard WAV file.
    The Gemini TTS API returns audio in this raw format.
    """
    with wave.open(output_filename, 'wb') as wf:
        wf.setnchannels(1)  # Mono audio
        wf.setsampwidth(2)  # 16-bit (2 bytes) per sample
        wf.setframerate(sample_rate)
        wf.writeframes(pcm_data)


def generate_audio_from_dialogue(dialogue_text, output_filename):
    """
    Calls the Gemini 2.5 Flash TTS API to generate multi-speaker audio.
    """
    api_url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent?key={api_key}"

    # Construct the multi-speaker voice configuration from our settings
    speaker_configs = [
        {"speaker": name, "voiceConfig": {"prebuiltVoiceConfig": {"voiceName": voice}}}
        for name, voice in VOICE_CONFIG.items()
    ]

    # This is the request payload for the API
    payload = {
        "contents": [{
            "parts": [{
                "text": f"TTS the following conversation:\n{dialogue_text}"
            }]
        }],
        "generationConfig": {
            "responseModalities": ["AUDIO"],
            "speechConfig": {
                "multiSpeakerVoiceConfig": {
                    "speakerVoiceConfigs": speaker_configs
                }
            }
        },
        "model": "gemini-2.5-flash-preview-tts"
    }

    try:
        print(f"  > Sending request to Gemini API for '{os.path.basename(output_filename)}'...")
        response = requests.post(api_url, json=payload, headers={'Content-Type': 'application/json'})
        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

        response_json = response.json()

        # Extract the audio data from the response
        part = response_json.get('candidates', [{}])[0].get('content', {}).get('parts', [{}])[0]
        audio_data_base64 = part.get('inlineData', {}).get('data')
        mime_type = part.get('inlineData', {}).get('mimeType')

        if not audio_data_base64 or not mime_type or not mime_type.startswith("audio/"):
            print(f"  > ERROR: Could not find valid audio data in API response.")
            print(f"    Response: {response_json}")
            return False

        # Decode the base64 string to raw bytes
        pcm_data = base64.b64decode(audio_data_base64)

        # The sample rate is included in the mimeType, e.g., "audio/L16;rate=24000"
        sample_rate_match = re.search(r'rate=(\d+)', mime_type)
        if not sample_rate_match:
            print(f"  > ERROR: Could not determine sample rate from mimeType: {mime_type}")
            return False
        sample_rate = int(sample_rate_match.group(1))

        # Convert the raw PCM data to a playable WAV file
        pcm_to_wav(pcm_data, sample_rate, output_filename)
        print(f"  > Successfully saved audio to '{output_filename}'")
        return True

    except requests.exceptions.HTTPError as http_err:
        print(f"  > HTTP ERROR: {http_err}")
        print(f"    Response Body: {response.text}")
        print("    Please check if your API key is correct and has access to the Gemini API.")
        return False
    except Exception as e:
        print(f"  > An unexpected error occurred: {e}")
        return False


# --- Main Execution Logic ---

def process_all_scripts():
    """
    Main function to find all scripts and process them one by one.
    """
    print("\n--- Starting Audio Generation Process ---")
    script_files = [f for f in os.listdir(INPUT_SCRIPTS_FOLDER) if f.endswith('.txt')]

    if not script_files:
        print("\nNo .txt script files found in the 'scripts' folder.")
        print("Please upload your files and run this cell again.")
        return

    print(f"Found {len(script_files)} script(s) to process.\n")

    for script_filename in script_files:
        input_path = os.path.join(INPUT_SCRIPTS_FOLDER, script_filename)
        print(f"Processing script: {script_filename}")

        # 1. Parse the script to create the dialogue string
        dialogue = parse_script_to_dialogue(input_path)
        if not dialogue:
            print(f"  > Could not parse dialogue from {script_filename}. Skipping.\n")
            continue

        # 2. Define the output filename (as .wav)
        base_name = os.path.splitext(script_filename)[0]
        output_path = os.path.join(OUTPUT_AUDIO_FOLDER, f"{base_name}.wav")

        # 3. Generate the audio
        generate_audio_from_dialogue(dialogue, output_path)
        print("-" * 20)

    print("\n--- All scripts processed! ---")
    print(f"You can find your generated WAV files in the '{OUTPUT_AUDIO_FOLDER}' folder.")

# Run the main process
process_all_scripts()



--- Gemini 2.5 Flash TTS Audio Generation ---

Creating necessary folders...
Input folder: /content/scripts/
Output folder: /content/audio_output/

IMPORTANT: Please upload your .txt script files to the 'scripts' folder on the left.

--- Starting Audio Generation Process ---
Found 1 script(s) to process.

Processing script: Educational Dialogue Script Unpacki.txt
  > Could not parse dialogue from Educational Dialogue Script Unpacki.txt. Skipping.


--- All scripts processed! ---
You can find your generated WAV files in the '/content/audio_output/' folder.


In [None]:
# @title Step 1: Setup and Audio Generation with Gemini 2.5 Flash TTS
#
# This single cell handles everything: setup, parsing, and audio generation.
# 1. Sets up the necessary functions to process your scripts for the Gemini TTS API.
# 2. Reads all .txt files from the /content/scripts/ folder.
# 3. Generates a multi-speaker WAV audio file for each script using gemini-2.5-flash-preview-tts.
#
# ==> Just press the "Play" button on this cell to start.

import os
import re
import requests
import base64
import wave
import struct
from google.colab import files

# --- Configuration ---
# IMPORTANT: Replace the placeholder key with your actual Gemini API key.
api_key = "AIzaSyBvwzocmntpyrLburtWU_Vnz29hf4OBjIo" # <-- IMPORTANT: PASTE YOUR REAL API KEY HERE

# Define the voices for the characters using prebuilt Gemini TTS voices.
# The names now match your script file ("Dr. Science" and "Anmol").
VOICE_CONFIG = {
    "Dr. Science": "Kore",      # A firm, knowledgeable female-sounding voice for explaining.
    "Anmol": "Enceladus"      # A friendly, male-sounding voice for asking.
}

# Define the folder paths for input scripts and output audio
INPUT_SCRIPTS_FOLDER = "/content/scripts/"
OUTPUT_AUDIO_FOLDER = "/content/audio_output/"

# --- Main Setup ---
print("--- Gemini 2.5 Flash TTS Audio Generation ---")
print("\nCreating necessary folders...")
os.makedirs(INPUT_SCRIPTS_FOLDER, exist_ok=True)
os.makedirs(OUTPUT_AUDIO_FOLDER, exist_ok=True)
print(f"Input folder: {INPUT_SCRIPTS_FOLDER}")
print(f"Output folder: {OUTPUT_AUDIO_FOLDER}")
print("\nIMPORTANT: Please upload your .txt script files to the 'scripts' folder on the left.")


# --- Core Functions ---

def parse_script_to_dialogue(file_path):
    """
    Reads a script file and formats it into a single string for the Gemini API.
    The API will use the speaker names (e.g., "Dr. Science:") to assign voices.
    """
    dialogue_lines = []
    # Define the speakers we are looking for in the script, matching your file.
    speakers = ["Dr. Science", "Anmol"]
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                stripped_line = line.strip()
                # Check if the line starts with one of our defined speaker names followed by a colon
                if any(stripped_line.startswith(f"{speaker}:") for speaker in speakers):
                    # Remove bracketed notes like "[...]" from the dialogue
                    clean_line = re.sub(r'\[.*?\]', '', stripped_line).strip()
                    dialogue_lines.append(clean_line)
    except FileNotFoundError:
        print(f"Error: Script file not found at {file_path}")
        return None
    except Exception as e:
        print(f"An error occurred while parsing {file_path}: {e}")
        return None

    # Join all dialogue lines into a single string, separated by newlines.
    return "\n".join(dialogue_lines) if dialogue_lines else None


def pcm_to_wav(pcm_data, sample_rate, output_filename):
    """
    Converts raw PCM audio data (16-bit) into a standard WAV file.
    The Gemini TTS API returns audio in this raw format.
    """
    with wave.open(output_filename, 'wb') as wf:
        wf.setnchannels(1)  # Mono audio
        wf.setsampwidth(2)  # 16-bit (2 bytes) per sample
        wf.setframerate(sample_rate)
        wf.writeframes(pcm_data)


def generate_audio_from_dialogue(dialogue_text, output_filename):
    """
    Calls the Gemini 2.5 Flash TTS API to generate multi-speaker audio.
    """
    api_url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent?key={api_key}"

    # Construct the multi-speaker voice configuration from our settings
    speaker_configs = [
        {"speaker": name, "voiceConfig": {"prebuiltVoiceConfig": {"voiceName": voice}}}
        for name, voice in VOICE_CONFIG.items()
    ]

    # This is the request payload for the API
    payload = {
        "contents": [{
            "parts": [{
                "text": f"TTS the following conversation:\n{dialogue_text}"
            }]
        }],
        "generationConfig": {
            "responseModalities": ["AUDIO"],
            "speechConfig": {
                "multiSpeakerVoiceConfig": {
                    "speakerVoiceConfigs": speaker_configs
                }
            }
        },
        "model": "gemini-2.5-flash-preview-tts"
    }

    try:
        print(f"  > Sending request to Gemini API for '{os.path.basename(output_filename)}'...")
        response = requests.post(api_url, json=payload, headers={'Content-Type': 'application/json'})
        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

        response_json = response.json()

        # Extract the audio data from the response
        part = response_json.get('candidates', [{}])[0].get('content', {}).get('parts', [{}])[0]
        audio_data_base64 = part.get('inlineData', {}).get('data')
        mime_type = part.get('inlineData', {}).get('mimeType')

        if not audio_data_base64 or not mime_type or not mime_type.startswith("audio/"):
            print(f"  > ERROR: Could not find valid audio data in API response.")
            print(f"    Response: {response_json}")
            return False

        # Decode the base64 string to raw bytes
        pcm_data = base64.b64decode(audio_data_base64)

        # The sample rate is included in the mimeType, e.g., "audio/L16;rate=24000"
        sample_rate_match = re.search(r'rate=(\d+)', mime_type)
        if not sample_rate_match:
            print(f"  > ERROR: Could not determine sample rate from mimeType: {mime_type}")
            return False
        sample_rate = int(sample_rate_match.group(1))

        # Convert the raw PCM data to a playable WAV file
        pcm_to_wav(pcm_data, sample_rate, output_filename)
        print(f"  > Successfully saved audio to '{output_filename}'")
        return True

    except requests.exceptions.HTTPError as http_err:
        print(f"  > HTTP ERROR: {http_err}")
        print(f"    Response Body: {response.text}")
        print("    Please check if your API key is correct and has access to the Gemini API.")
        return False
    except Exception as e:
        print(f"  > An unexpected error occurred: {e}")
        return False


# --- Main Execution Logic ---

def process_all_scripts():
    """
    Main function to find all scripts and process them one by one.
    """
    print("\n--- Starting Audio Generation Process ---")
    script_files = [f for f in os.listdir(INPUT_SCRIPTS_FOLDER) if f.endswith('.txt')]

    if not script_files:
        print("\nNo .txt script files found in the 'scripts' folder.")
        print("Please upload your files and run this cell again.")
        return

    print(f"Found {len(script_files)} script(s) to process.\n")

    for script_filename in script_files:
        input_path = os.path.join(INPUT_SCRIPTS_FOLDER, script_filename)
        print(f"Processing script: {script_filename}")

        # 1. Parse the script to create the dialogue string
        dialogue = parse_script_to_dialogue(input_path)
        if not dialogue:
            print(f"  > Could not parse dialogue from {script_filename}. Skipping.\n")
            continue

        # 2. Define the output filename (as .wav)
        base_name = os.path.splitext(script_filename)[0]
        output_path = os.path.join(OUTPUT_AUDIO_FOLDER, f"{base_name}.wav")

        # 3. Generate the audio
        generate_audio_from_dialogue(dialogue, output_path)
        print("-" * 20)

    print("\n--- All scripts processed! ---")
    print(f"You can find your generated WAV files in the '{OUTPUT_AUDIO_FOLDER}' folder.")

# Run the main process
process_all_scripts()


--- Gemini 2.5 Flash TTS Audio Generation ---

Creating necessary folders...
Input folder: /content/scripts/
Output folder: /content/audio_output/

IMPORTANT: Please upload your .txt script files to the 'scripts' folder on the left.

--- Starting Audio Generation Process ---
Found 1 script(s) to process.

Processing script: Educational Dialogue Script Unpacki.txt
  > Sending request to Gemini API for 'Educational Dialogue Script Unpacki.wav'...
