# Data Collection Pipeline: YouTube → Audio → Transcript
This notebook downloads a YouTube video, extracts audio, and transcribes it using OpenAI Whisper.

In [2]:
import sys
# Make the parent directory importable so later cells can import the local
# `utils` package. Guarded so that re-running this cell does not keep
# appending duplicate entries to sys.path.
if '../' not in sys.path:
    sys.path.append('../')

In [3]:
import yt_dlp
import openai
from pathlib import Path
import json



In [4]:
from dotenv import load_dotenv
import os

# Let python-dotenv populate the process environment, then look the OpenAI key
# up there so it never has to be hardcoded in the notebook.
# api_key is None when OPENAI_API_KEY is not set.
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")

In [5]:
import subprocess

# YouTube video that this notebook downloads and transcribes.
video_url = "https://www.youtube.com/watch?v=SN-vBnWj6e8"
# Destination of the extracted MP3 audio (a relative path, so it is resolved
# against the notebook's working directory).
output_path = "../data/eleo_audio.mp3"

def download_audio(video_url, output_path):
    """Download a video's audio track as MP3 by shelling out to ``yt-dlp``.

    This is a best-effort helper: failures are reported by printing an
    ``"Error during download:"`` message instead of raising, so the caller
    should verify afterwards that ``output_path`` actually exists.

    Args:
        video_url: URL of the video to fetch.
        output_path: Value handed to ``yt-dlp -o`` (the output file path).

    Returns:
        None. On success the captured stdout of yt-dlp is printed.
    """
    command = [
        "yt-dlp",
        "-x",  # extract audio only
        "--audio-format", "mp3",
        "-o", output_path,
        "--",  # end of options: a URL starting with "-" is never parsed as a flag
        video_url,
    ]
    try:
        result = subprocess.run(
            command,
            check=True,
            capture_output=True,
            text=True
        )
    except subprocess.CalledProcessError as e:
        # yt-dlp started but exited with a non-zero status; its stderr says why.
        print("Error during download:", e.stderr)
    except FileNotFoundError as e:
        # The yt-dlp executable could not be launched (e.g. not on PATH).
        # Report it the same way as any other download failure.
        print("Error during download:", e)
    else:
        print(result.stdout)

# Fetch the audio for the configured video into the configured output file.
download_audio(video_url=video_url, output_path=output_path)


[youtube] Extracting URL: https://www.youtube.com/watch?v=SN-vBnWj6e8
[youtube] SN-vBnWj6e8: Downloading webpage
[youtube] SN-vBnWj6e8: Downloading tv client config
[youtube] SN-vBnWj6e8: Downloading tv player API JSON
[youtube] SN-vBnWj6e8: Downloading ios player API JSON
[youtube] SN-vBnWj6e8: Downloading m3u8 information
[info] SN-vBnWj6e8: Downloading 1 format(s): 251
[download] ../data/eleo_audio.mp3 has already been downloaded
[ExtractAudio] Not converting audio ../data/eleo_audio.mp3; file is already in target format mp3



In [6]:
# Check that the download produced a real, non-empty file. A bare existence
# check would also accept a directory or a zero-byte file left behind by a
# failed run.
# NOTE: a file left over from an earlier run still passes this check.
if os.path.isfile(output_path) and os.path.getsize(output_path) > 0:
    print(f"Audio file downloaded successfully: {output_path}")
else:
    print("Failed to download the audio file.")

Audio file downloaded successfully: ../data/eleo_audio.mp3


In [7]:
# Install the dependencies if they are not already installed
# (uncomment and run once; %pip targets the kernel's own environment)
#%pip install openai
#%pip install python-dotenv
#%pip install langchain

### Transcribe with Whisper
At first it seemed smart to use youtube-transcript-api, but in this task we cannot rely on auto-generated captions when the content is not fully in English. Hence Whisper is the better choice.

In [8]:
from utils.whisper import transcribe_audio

# Transcribe the downloaded audio with the project helper.
# NOTE(review): save_path / save_json are handled inside utils.whisper, which
# is not visible here — presumably the helper writes its own output files;
# confirm it does not collide with the JSON file written below.
transcript_text = transcribe_audio(
    file_path="../data/eleo_audio.mp3",
    save_path="../data/eleo_transcript.txt",
    save_json=True
)

# ensure_ascii=False keeps non-ASCII characters (e.g. German umlauts) as real
# UTF-8 text instead of \uXXXX escapes, matching the utf-8 file encoding and
# keeping the saved transcript human-readable.
with open("../data/eleo_transcript.json", "w", encoding="utf-8") as f:
    json.dump({"transcript": transcript_text}, f, indent=2, ensure_ascii=False)


# Quick sanity check: preview the first 300 characters and report the size.
print(transcript_text[:300])

print("Transcript word count:", len(transcript_text.split()))



Hallo! Hi! Ich werde jetzt multitasken. Ein sehr deutsches Wort. Also, ich werde jetzt mehrere Dinge gleichzeitig machen. Ich werde jetzt eure Fragen beantworten. Ihr habt mir nämlich sehr viele Fragen auf Instagram und hier auf dem YouTube-Kanal gestellt. Danke für die ganzen Fragen. Und die werde 
Transcript word count: 1699


### Let's add metadata for the structure



In [9]:
from utils.metadata_extract import extract_youtube_metadata
import json
from pathlib import Path

# Define OUTPUT_DIR and audio_file
OUTPUT_DIR = Path("../data")
audio_file = Path("../data/eleo_audio.mp3")

# Extract metadata for the video and store it next to the audio file as
# "<audio file stem>_metadata.json".
metadata = extract_youtube_metadata(video_url)
metadata_output = OUTPUT_DIR / f"{audio_file.stem}_metadata.json"
# ensure_ascii=False writes non-ASCII characters as real UTF-8 text rather
# than \uXXXX escapes, consistent with the utf-8 encoding used for the file.
metadata_output.write_text(
    json.dumps(metadata, indent=2, ensure_ascii=False),
    encoding="utf-8",
)
print(f"Metadata saved to: {metadata_output}")


Metadata saved to: ../data/eleo_audio_metadata.json
