<a href="https://colab.research.google.com/github/adarshtomar333/YouTube-Parser/blob/main/YTP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install yt-dlp
!pip install -U openai-whisper


Collecting yt-dlp
  Downloading yt_dlp-2025.2.19-py3-none-any.whl.metadata (171 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/171.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.9/171.9 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading yt_dlp-2025.2.19-py3-none-any.whl (3.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.2/3.2 MB[0m [31m100.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m65.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: yt-dlp
Successfully installed yt-dlp-2025.2.19
Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m15.3 MB/s[

In [1]:
!apt-get install ffmpeg

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.


In [2]:
import os
import time
import yt_dlp
import subprocess
import whisper
import re

# Directory that receives the downloaded MP3, the converted WAV and the transcript.
# Defaults to the Colab location; set the YT_OUTPUT_DIR environment variable to
# run the notebook somewhere that has no /content directory.
OUTPUT_DIR = os.environ.get("YT_OUTPUT_DIR", "/content/YT")
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
# Function to sanitize filenames (remove invalid characters)
def sanitize_filename(title):
    """Return ``title`` with every reserved filename character replaced by ``_``.

    The reserved set is: backslash, forward slash, asterisk, question mark,
    colon, double quote, angle brackets and the pipe character. All other
    characters (including spaces) are left untouched.
    """
    reserved_chars = re.compile(r'[\\/*?:"<>|]')
    safe_title = reserved_chars.sub("_", title)
    return safe_title

# Function to get video title
def get_video_title(video_url):
    """Look up a video's title without downloading it and make it filename-safe.

    Args:
        video_url: URL handed to yt-dlp's ``extract_info``.

    Returns:
        The title passed through ``sanitize_filename``, or ``"video_audio"``
        when no usable title is available.
    """
    ydl_opts = {"quiet": True, "no_warnings": True, "skip_download": True, "extract_flat": True}
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(video_url, download=False)

    # Guard against a missing info dict and against a "title" key that exists
    # but holds None or an empty string; either would otherwise crash the
    # sanitizer or yield a file literally named ".mp3".
    raw_title = (info or {}).get("title")
    if not raw_title:
        return "video_audio"
    return sanitize_filename(str(raw_title))

# Function to download audio from YouTube
def download_audio(video_url):
    """Download the best audio stream of a video and extract it to MP3.

    The file is written to ``OUTPUT_DIR`` as ``<title>.mp3``, where the title
    comes from ``get_video_title``.

    Args:
        video_url: URL of the video to download.

    Returns:
        The sanitized title, which the later steps use to locate the files.

    Raises:
        SystemExit: if the download fails or the MP3 is missing afterwards
            (the underlying error is printed first and chained as the cause).
    """
    title = get_video_title(video_url)
    audio_path = os.path.join(OUTPUT_DIR, f"{title}.mp3")

    # The title is embedded literally in yt-dlp's output template, where "%"
    # introduces a field. Doubling it keeps titles such as "100% real" intact.
    template_stem = title.replace("%", "%%")

    ydl_opts = {
        "format": "bestaudio/best",
        "postprocessors": [{"key": "FFmpegExtractAudio", "preferredcodec": "mp3", "preferredquality": "192"}],
        "outtmpl": os.path.join(OUTPUT_DIR, f"{template_stem}.%(ext)s"),  # Store using title
    }

    try:
        print(f"\n📥 Downloading audio for '{title}'...")
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            # download() returns only after the post-processor has run, so no
            # artificial sleep is needed before checking for the file.
            ydl.download([video_url])

        # Check if the file exists
        if not os.path.exists(audio_path):
            raise FileNotFoundError("❌ MP3 file not found after download!")

        print(f"✅ Audio downloaded successfully at {audio_path}.")
        return title  # Return the title for further steps
    except Exception as e:
        print(f"❌ Error downloading audio: {e}")
        # Raise instead of calling exit(): inside a Jupyter/Colab kernel exit()
        # only requests a kernel shutdown, so the rest of the cell would keep
        # running with title=None. SystemExit stops the pipeline right here.
        raise SystemExit(1) from e

# Function to convert MP3 to WAV
def convert_to_wav(title):
    """Convert ``<title>.mp3`` in ``OUTPUT_DIR`` to a 16 kHz mono 16-bit PCM WAV.

    Args:
        title: Sanitized title used as the file stem for both input and output.

    Raises:
        SystemExit: if the MP3 is missing, ffmpeg cannot be started, or ffmpeg
            exits with a non-zero status (an error message is printed first).
    """
    mp3_path = os.path.join(OUTPUT_DIR, f"{title}.mp3")
    wav_path = os.path.join(OUTPUT_DIR, f"{title}.wav")

    if not os.path.exists(mp3_path):
        print("❌ Error: MP3 file not found!")
        # exit() does not reliably halt a notebook cell; raise so the caller stops.
        raise SystemExit(1)

    print(f"\n🎵 Converting '{title}.mp3' to WAV...")
    # -y (overwrite without asking) is a global option, so it goes before the
    # input rather than trailing the output file.
    command = [
        "ffmpeg", "-y", "-i", mp3_path, "-ar", "16000", "-ac", "1",
        "-c:a", "pcm_s16le", wav_path,
    ]

    try:
        subprocess.run(command, check=True)
        print(f"✅ Audio converted to WAV: {wav_path}")
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers the case where the ffmpeg executable is not installed.
        print(f"❌ Error converting MP3 to WAV: {e}")
        raise SystemExit(1) from e

# Function to transcribe audio using Whisper (language is auto-detected)
def transcribe_audio(title, model_name="large"):
    """Transcribe ``<title>.wav`` with Whisper and save the text as ``<title>.txt``.

    No language is passed to Whisper, so it detects the spoken language itself.

    Args:
        title: Sanitized title used as the file stem inside ``OUTPUT_DIR``.
        model_name: Name of the Whisper checkpoint handed to
            ``whisper.load_model``. Defaults to ``"large"``; a smaller
            checkpoint such as ``"small"`` is presumably faster but less
            accurate.

    Raises:
        SystemExit: if the WAV file does not exist (a message is printed first).
    """
    wav_path = os.path.join(OUTPUT_DIR, f"{title}.wav")
    transcript_file = os.path.join(OUTPUT_DIR, f"{title}.txt")

    if not os.path.exists(wav_path):
        print("❌ Error: WAV file not found!")
        # exit() does not reliably halt a notebook cell; raise so the caller stops.
        raise SystemExit(1)

    print(f"\n📝 Transcribing '{title}.wav' with automatic language detection...")
    model = whisper.load_model(model_name)
    result = model.transcribe(wav_path)  # No language specified for automatic detection

    transcript = result["text"]
    print("\n✅ Transcript Ready!\n")

    # Save transcript to a file
    with open(transcript_file, "w", encoding="utf-8") as f:
        f.write(transcript)

    print(f"📄 Transcript saved to: {transcript_file}")

# Function to clean up temporary files
def cleanup_files(title):
    """Remove the intermediate ``<title>.mp3`` and ``<title>.wav`` from ``OUTPUT_DIR``.

    Files that do not exist are skipped. Any exception raised while deleting
    is reported with a printed message instead of being propagated, and ends
    the clean-up at that point.
    """
    temp_paths = [
        os.path.join(OUTPUT_DIR, f"{title}{suffix}") for suffix in (".mp3", ".wav")
    ]

    try:
        for temp_path in temp_paths:
            if not os.path.exists(temp_path):
                continue
            os.remove(temp_path)
            print(f"🗑️ Deleted temporary file: {temp_path}")
    except Exception as e:
        print(f"❌ Error cleaning up files: {e}")

# Main execution: download -> convert to WAV -> transcribe -> clean up
video_url = input("\n🎥 Enter YouTube video URL: ").strip()
if not video_url:
    raise ValueError("No YouTube video URL was entered.")

title = download_audio(video_url)
if title is None:
    # Defensive: stop here rather than run the later steps on a file stem of
    # "None" if the download step handed back no title.
    raise RuntimeError("Audio download failed; nothing to convert or transcribe.")

try:
    convert_to_wav(title)
    transcribe_audio(title)
finally:
    # Remove the intermediate MP3/WAV even when conversion or transcription
    # fails, so failed runs do not leave large audio files behind.
    cleanup_files(title)  # Clean up temporary files


🎥 Enter YouTube video URL: https://www.youtube.com/watch?v=yqL6ss0uR5o&t=1s

📥 Downloading audio for 'The Geography of Uttarakhand'...
[youtube] Extracting URL: https://www.youtube.com/watch?v=yqL6ss0uR5o&t=1s
[youtube] yqL6ss0uR5o: Downloading webpage
[youtube] yqL6ss0uR5o: Downloading tv client config
[youtube] yqL6ss0uR5o: Downloading player 5ae7d525
[youtube] yqL6ss0uR5o: Downloading tv player API JSON
[youtube] yqL6ss0uR5o: Downloading ios player API JSON
[youtube] yqL6ss0uR5o: Downloading m3u8 information
[info] yqL6ss0uR5o: Downloading 1 format(s): 251
[download] Destination: /content/YT/The Geography of Uttarakhand.webm
[download] 100% of   18.34MiB in 00:00:00 at 25.97MiB/s  
[ExtractAudio] Destination: /content/YT/The Geography of Uttarakhand.mp3
Deleting original file /content/YT/The Geography of Uttarakhand.webm (pass -k to keep)
✅ Audio downloaded successfully at /content/YT/The Geography of Uttarakhand.mp3.

🎵 Converting 'The Geography of Uttarakhand.mp3' to WAV...
✅ Aud

  checkpoint = torch.load(fp, map_location=device)
