# Libraries

In [66]:
%%capture
!pip install pytube
!pip install assemblyai
!pip install --upgrade pytube
!pip install yt-dlp
!pip install openai
!pip install python-dotenv

In [67]:
from pytube import YouTube
import os
import assemblyai as aai
from assemblyai.types import TranscriptionConfig
import yt_dlp
from openai import OpenAI
import json
import re
import requests
from dotenv import load_dotenv

# Env Var

In [71]:
# URL of the video to illustrate
URL = "https://youtu.be/FZGHtLw4ZPY"

# Every intermediate artifact (audio, transcript, prompts) lives under dir_path,
# so the notebook does not depend on an absolute, machine-specific location.
dir_path = "./"
audio_file_name = "audio.webm"
audio_txt_path = os.path.join(dir_path, "audio.txt")    # transcript of the audio
audio_json_path = os.path.join(dir_path, "audio.json")  # {text portion: image prompt}
audio_file_path = os.path.join(dir_path, audio_file_name)

# img
IMG_MODEL_TYPE = "dall-e-3" # "dall-e-2"
IMG_SIZE = "1024x1024"
IMG_QUALITY = "standard"    # hd
IMG_DIR = "images"          # directory receiving the downloaded images
IMG_EXTENTION = "png"       # jpg
img_json_path = "img_url.json"  # JSON file listing the generated image URLs

# prompts and model config
# System prompt sent with the transcript: it asks for a flat JSON object that maps
# each illustratable portion of the French text to an English image prompt.
# The example code fence is closed so the model sees a well-formed block.
EXTRACT_PROMPT = """
You are an expert in analyzing text to identify sections that can be illustrated with images, especially for children.
Your task is to carefully process a piece of text written in French and provide concise, kid-friendly prompts for each image to be generated.

Here are the guidelines you must strictly follow:
* Analyze the provided French text and identify meaningful portions that can be illustrated.
* For each identified portion, generate a concise and descriptive prompt in English that describes a vivid and kid-friendly scene.
* Ensure the prompts are specifically designed to generate images that are visually appealing to children.
* The output must be a JSON object where:
  - Each key is a portion of the original French text.
  - Each value is the corresponding image-generation prompt in English.
* The response **must be strictly in JSON format**, with no additional text, explanations, or formatting outside the JSON structure.
* If the text has no identifiable portions suitable for illustration, return an empty JSON object: `{}`.
* The JSON format must adhere to this structure:
  ```json
  {
      "portion of the original text here": "the prompt to generate a nice illustration of the scene for kids",
      ...
  }
  ```
"""
# Chat model used to extract the image prompts from the transcript.
MODEL_TYPE = "gpt-4o"

# api keys
# Load KEY=value pairs from a local .env file into the process environment.
load_dotenv(".env")

# Keep a missing variable as None: wrapping os.getenv() in str() would turn it
# into the literal string "None", which looks like a real (but invalid) key.
AAI_API_KEY = os.getenv("AAI_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

_missing_api_keys = [
    name
    for name, value in (("AAI_API_KEY", AAI_API_KEY), ("OPENAI_API_KEY", OPENAI_API_KEY))
    if not value
]
if _missing_api_keys:
    # Warn instead of raising so the cells that need no API access still run.
    print(f"Warning: missing environment variable(s): {', '.join(_missing_api_keys)}")

#  1. Download the audio from the video

I modified the original code I received because I couldn't resolve some issues with it and didn't have enough time to focus on them, so I opted for another approach to downloading the audio: `yt-dlp`.

In [40]:
# Download audio
# Fetch the best available audio-only stream of the video into dir_path.
ydl_opts = {
    "format": "bestaudio/best",
    # os.path.join keeps the template valid even when dir_path has no trailing
    # separator; "%(ext)s" is filled in by yt-dlp with the downloaded file's extension.
    "outtmpl": os.path.join(dir_path, "audio.%(ext)s"),
}

# NOTE(review): audio_file_name assumes the chosen format is webm - confirm if the
# source video changes.
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download([URL])

[youtube] Extracting URL: https://youtu.be/FZGHtLw4ZPY
[youtube] FZGHtLw4ZPY: Downloading webpage
[youtube] FZGHtLw4ZPY: Downloading ios player API JSON
[youtube] FZGHtLw4ZPY: Downloading mweb player API JSON
[youtube] FZGHtLw4ZPY: Downloading m3u8 information
[info] FZGHtLw4ZPY: Downloading 1 format(s): 251
[download] ./audio.webm has already been downloaded
[download] 100% of    3.35MiB


# 2. Transcription de l'audio (Speech to text)

I commented out this code because I don't want to redo the transcription since I'm focusing on a single audio file 🙂, and I'm worried about running out of my API key credits.

In [47]:
# # API key
# aai.settings.api_key = AAI_API_KEY

# # init config
# config = TranscriptionConfig(
#     language_code="fr",   # French
#     punctuate=True,       # punctuation
#     format_text=True,     # Format text
# )

# # init Transcriber
# transcriber = aai.Transcriber(config=config)

# try:
#     # Transcribe
#     transcript = transcriber.transcribe(audio_file_path)

#     # Wait for the transcription to complete
#     transcript.wait_for_completion()

#     # Let's save the transcription (because we cannot afford re-doing it)
#     with open("audio.txt", "w", encoding="utf-8") as file:
#         file.write(transcript.text)

#     print("Transcription saved to audio.txt!")
# except Exception as e:
#     print(f"An error occurred: {e}")

# 3. Exploitation and Analysis of Transcription

In the following section, we will focus on crafting an effective prompt to identify various sections of the text that can be illustrated with images.

For now, our objective is to ask GPT-4o to write the most suitable prompt for each image. This helps ensure that the generated visuals align closely with the content and purpose of the text.

In [55]:
def _extract_json_object(raw_response):
    """Return the text of the JSON object found in raw_response, or None if there is none."""
    # Preferred form: an object wrapped in a ``` / ```json code fence.
    fenced = re.search(r"```(?:json)?\s*(\{[\s\S]*?\})\s*```", raw_response)
    if fenced:
        return fenced.group(1)

    # Fallback: everything from the first "{" to the last "}". Taking the last
    # brace (not the first) keeps values that themselves contain "}" intact.
    start = raw_response.find("{")
    end = raw_response.rfind("}")
    if start == -1 or end < start:
        return None
    return raw_response[start:end + 1]


def generate_image_prompts(
    text,
    client,
    model_type,
    extract_prompt,
    audio_json_path,
    temperature=1,
    max_tokens=1024,
    top_p=1,
    is_debug=False,
):
    """Ask a chat model for image prompts and save them as a JSON file.

    Args:
        text: Text to analyze; sent as the user message.
        client: Object exposing ``chat.completions.create`` (an OpenAI client).
        model_type: Name of the chat model to call.
        extract_prompt: System prompt describing the expected JSON output.
        audio_json_path: Path of the JSON file written on success.
        temperature, max_tokens, top_p: Sampling options forwarded to the API call.
        is_debug: When True, print the raw model response.

    Returns:
        The validated ``{text portion: image prompt}`` dict, or None when the
        response holds no JSON object or the object fails validation. Nothing
        is written to ``audio_json_path`` in the failure case.
    """
    # Call OpenAI API
    response = client.chat.completions.create(
        model=model_type,
        messages=[
            {
                "role": "system",
                "content": extract_prompt
            },
            {
                "role": "user",
                "content": f"Text: {text}"
            }
        ],
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p,
    )

    # raw response from gpt; treat a missing content as an empty answer
    raw_response = response.choices[0].message.content or ""

    if is_debug:
        print("Raw response from OpenAI API:")
        print(raw_response)

    # let's extract the json ourselves (we cannot blindly trust llms)
    raw_json_str = _extract_json_object(raw_response)
    if raw_json_str is None:
        # Also reached when the answer has no closing brace (e.g. it was cut off).
        print("No valid JSON code block found in the response.")
        return None

    try:
        # Attempt to parse the extracted string as JSON
        json_data = json.loads(raw_json_str)

        # Validate we got a dictionary
        if not isinstance(json_data, dict):
            raise ValueError("The response is not a valid JSON object.")

        # Validate that keys and values are strings
        for key, value in json_data.items():
            if not isinstance(key, str) or not isinstance(value, str):
                raise ValueError("JSON keys and values must be strings.")

        # Save the validated JSON to a file
        with open(audio_json_path, "w", encoding="utf-8") as file:
            json.dump(json_data, file, indent=4, ensure_ascii=False)

        print(f"Validated JSON saved to {audio_json_path}")
        return json_data

    except (json.JSONDecodeError, ValueError) as e:
        print(f"Error: The response could not be processed as valid JSON. Details: {e}")
        return None

I've commented out this section because it costs money to run it repeatedly, and I'm using a key with only $10, so I'm not sure how many times it allows me to experiment with different prompts and implementations of our helper function.

Therefore, I could not experiment with different prompts, models, and configurations either.

In [56]:
# client = OpenAI(api_key=OPENAI_API_KEY)

# # Read text from the file
# with open(audio_txt_path, "r", encoding="utf-8") as file:
#     text = file.read()

# # Generate image prompts
# generate_image_prompts(
#     text=text,
#     client=client,
#     model_type=MODEL_TYPE,
#     extract_prompt=EXTRACT_PROMPT,
#     audio_json_path=audio_json_path,
#     temperature=1,
#     max_tokens=1024,
#     top_p=1,
#     is_debug=True,
# )

Raw response from OpenAI API:
```json
{
    "La grenouille à grande bouche gobe des mouches avec sa grande bouche.": "A big-mouthed frog catching flies with its enormous mouth.",
    "Elle vit dans une mare sur un énuphar qui lui sert de plongeoir.": "A big-mouthed frog sitting on a lily pad in a pond, ready to jump like on a diving board.",
    "Au premier tournant, elle rencontre un ruban.": "The big-mouthed frog meeting an anteater with a long, sticky tongue like a ribbon.",
    "D'un bon guirée, elle traverse une forêt.": "The big-mouthed frog hopping energetically through a lush, green forest.",
    "T'es grande, toi ! T'es qui, toi ? Je suis une gira.": "The big-mouthed frog staring up at a tall giraffe.",
    "À l'aide d'une canne, elle escala d'une montagne de mille kilogrammes.": "The big-mouthed frog climbing a very steep mountain with the help of a vine or stick.",
    "T'es gros, toi ! T'es qui, toi ? Je suis le rhinocéros.": "The big-mouthed frog looking at a large, friend

{'La grenouille à grande bouche gobe des mouches avec sa grande bouche.': 'A big-mouthed frog catching flies with its enormous mouth.',
 'Elle vit dans une mare sur un énuphar qui lui sert de plongeoir.': 'A big-mouthed frog sitting on a lily pad in a pond, ready to jump like on a diving board.',
 'Au premier tournant, elle rencontre un ruban.': 'The big-mouthed frog meeting an anteater with a long, sticky tongue like a ribbon.',
 "D'un bon guirée, elle traverse une forêt.": 'The big-mouthed frog hopping energetically through a lush, green forest.',
 "T'es grande, toi ! T'es qui, toi ? Je suis une gira.": 'The big-mouthed frog staring up at a tall giraffe.',
 "À l'aide d'une canne, elle escala d'une montagne de mille kilogrammes.": 'The big-mouthed frog climbing a very steep mountain with the help of a vine or stick.',
 "T'es gros, toi ! T'es qui, toi ? Je suis le rhinocéros.": 'The big-mouthed frog looking at a large, friendly rhinoceros.',
 'Un peu plus tard, il se met à pleuvoir.': 

# 4. Génération des illustrations



Now let's generate images from the descriptions stored in the audio.json file, using a model that generates images from text.

In [63]:
# let's first create a directory to store images (no error if it already exists)
os.makedirs(IMG_DIR, exist_ok=True)

In [64]:
def _safe_image_stem(key, max_length=100):
    """Turn an arbitrary JSON key into a file-name stem without path or reserved characters."""
    # Path separators, characters reserved on common filesystems and control
    # characters are replaced so the key can never escape or break img_dir.
    stem = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", str(key))
    stem = stem.strip(" .")[:max_length].rstrip(" .")
    return stem or "image"


def generate_images_from_json(
    json_data,
    client,
    model_type,
    img_size,
    img_quality,
    img_dir,
    img_json_path,
    extention,
    is_debug=False,
    download_timeout=60,
):
    """Generate one image per prompt, download it, and record the image URLs.

    Args:
        json_data: Dict mapping a key (a portion of text) to an image prompt.
        client: Object exposing ``images.generate`` (an OpenAI client).
        model_type: Name of the image model.
        img_size: Image size passed to the model (e.g. "1024x1024").
        img_quality: Image quality passed to the model.
        img_dir: Directory where images are saved; created if missing.
        img_json_path: Path of the JSON file receiving the results.
        extention: File extension (without dot) used for the saved images.
        is_debug: When True, print the URL of each generated image.
        download_timeout: Seconds to wait for each image download.

    Returns:
        Dict ``{key: {"prompt": ..., "url": ...}}`` for every key whose image
        was generated; the same mapping is written to ``img_json_path``.

    Raises:
        ValueError: If ``json_data`` is not a dict.
    """
    # make sure img_dir exists
    os.makedirs(img_dir, exist_ok=True)

    if not isinstance(json_data, dict):
        raise ValueError("JSON file must contain an object (key-value pairs).")

    # generating image for each prompt
    json_result = {}
    for key, prompt in json_data.items():
        if not isinstance(prompt, str):
            print(f"Skipping {key}: Prompt is not a string.")
            continue

        print(f"Generating image for key: '{key}' with prompt: '{prompt}'...")
        try:
            # Call to OpenAI Image Model
            response = client.images.generate(
                model=model_type,
                prompt=prompt,
                size=img_size,
                quality=img_quality,
                n=1
            )

            # url of the image; recorded even if the download below fails
            image_url = response.data[0].url
            json_result[key] = {
                "prompt": prompt,
                "url": image_url
            }
            if is_debug:
                print(f"Image URL: {image_url}")

            # download and save image in img_dir, under a file-system-safe name
            image_filename = f"{_safe_image_stem(key)}.{extention}"
            image_path = os.path.join(img_dir, image_filename)
            download = requests.get(image_url, timeout=download_timeout)
            # Fail here rather than writing an HTTP error page to the image file.
            download.raise_for_status()

            with open(image_path, "wb") as handler:
                handler.write(download.content)

            print(f"Saved image to {image_path}")

        except Exception as e:
            # Best effort: report the failure and continue with the next prompt.
            print(f"Error generating image for key '{key}': {e}")

    # let's save {key: {"prompt": ..., "url": ...}}
    with open(img_json_path, "w", encoding="utf-8") as f_out:
        json.dump(json_result, f_out, indent=4, ensure_ascii=False)
    print(f"Saved all generated image URLs to {img_json_path}")

    return json_result

I have commented out this section because I can't afford to run it repeatedly 😊.

For the same reason, I couldn't experiment with different manipulations of the prompts to make them align more closely with the subject. The images should be kid-friendly, more like paintings rather than realistic images, and perhaps the scene should be unified across all images and more...

This is a very interesting project but it requires time and effort 🙂.

In [72]:
# # Init client
# client = OpenAI(api_key=OPENAI_API_KEY)

# # Load JSON
# with open(audio_json_path, "r", encoding="utf-8") as f:
#     json_data = json.load(f)

# # just for test
# json_data = dict(list(json_data.items())[:2])

# # finally :) let's generate images
# generate_images_from_json(
#     json_data=json_data,
#     client=client,
#     model_type=IMG_MODEL_TYPE,
#     img_size=IMG_SIZE,
#     img_quality=IMG_QUALITY,
#     img_dir=IMG_DIR,
#     img_json_path=img_json_path,
#     extention=IMG_EXTENTION,
#     is_debug=True,
# )

# 5. Tool Presentation with Perplexity

I have written this section in a separate LaTeX document because notebooks do not look appealing with too much text.