<a href="https://colab.research.google.com/github/apparel2020/my-second-repo/blob/main/YouTube%20downloader%20and%20summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# YouTube Audio Transcription and Summarization using Open Source Models
# For Google Colab Free Tier - With YouTube Bot Detection Fix

# Install required packages
!pip install yt-dlp transformers sentencepiece datasets accelerate torch bitsandbytes peft optimum
!pip install git+https://github.com/openai/whisper.git

# Import necessary libraries
import os
import re
import torch
import subprocess
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import whisper

# Function to download YouTube audio using yt-dlp (more reliable than pytubefix)
def youtube_audio_downloader(link):
    if not link or ('youtube.com' not in link and 'youtu.be' not in link):
        print('Invalid YouTube link!')
        return False

    print('Downloading the audio stream...')

    # Create output filename based on current timestamp
    import time
    output_filename = f"audio_{int(time.time())}.mp3"

    # Use yt-dlp which has better anti-bot-detection capabilities
    command = [
        'yt-dlp',
        '-x',  # Extract audio
        '--audio-format', 'mp3',  # Convert to mp3
        '--audio-quality', '0',  # Best quality
        '-o', output_filename,  # Output filename
        link  # YouTube URL
    ]

    try:
        subprocess.run(command, check=True)
        if os.path.exists(output_filename):
            print('Download completed successfully!')
            return output_filename
        else:
            print('Error: Download completed but file not found!')
            return False
    except subprocess.CalledProcessError as e:
        print(f'Error downloading the file: {e}')
        # Try alternate method if first method fails
        try:
            print('Trying alternate download method...')
            alt_command = [
                'yt-dlp',
                '-f', 'bestaudio',  # Best audio format available
                '--extract-audio',
                '--audio-format', 'mp3',
                '--audio-quality', '0',
                '-o', output_filename,
                '--no-check-certificates',  # Skip HTTPS certificate validation
                '--user-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
                link
            ]
            subprocess.run(alt_command, check=True)
            if os.path.exists(output_filename):
                print('Alternate download method succeeded!')
                return output_filename
            else:
                print('Error: Alternate download completed but file not found!')
                return False
        except subprocess.CalledProcessError as e2:
            print(f'Error with alternate download method: {e2}')
            return False

# Function to transcribe audio using Whisper (open source version)
def transcribe(audio_file, not_english=False):
    if not os.path.exists(audio_file):
        print('Audio file does not exist!')
        return False

    print('Loading Whisper model...')
    # Use a smaller model to fit within Colab's free tier memory constraints
    model = whisper.load_model("base")

    print('Starting transcription...')
    if not_english:
        # Translate to English
        result = model.transcribe(audio_file, task="translate")
    else:
        # Just transcribe
        result = model.transcribe(audio_file)
    print('Transcription completed!')

    name, extension = os.path.splitext(audio_file)
    transcript_filename = f'transcript-{name}.txt'
    with open(transcript_filename, 'w', encoding='utf-8') as f:
        f.write(result["text"])

    print(f'Transcript saved to {transcript_filename}')
    return transcript_filename

# Function to summarize text using an open source LLM (T5 version)
def summarize(transcript_filename):
    if not os.path.exists(transcript_filename):
        print('The transcript file does not exist!')
        return False

    with open(transcript_filename, 'r', encoding='utf-8') as f:
        transcript = f.read()

    print('Loading summarization model (FLAN-T5)...')
    # Use FLAN-T5 for summarization - efficient and works well on Colab's free tier
    device = 0 if torch.cuda.is_available() else -1
    print(f"Using device: {'CUDA' if device == 0 else 'CPU'}")

    summarizer = pipeline(
        "summarization",
        model="google/flan-t5-base",
        device=device
    )

    # Handle long transcripts by splitting into chunks
    max_input_length = 500  # T5 has limited context window

    if len(transcript) <= max_input_length:
        chunks = [transcript]
    else:
        # Split by sentences to preserve meaning
        import re
        sentences = re.split(r'(?<=[.!?])\s+', transcript)
        chunks = []
        current_chunk = ""

        for sentence in sentences:
            if len(current_chunk) + len(sentence) < max_input_length:
                current_chunk += sentence + " "
            else:
                chunks.append(current_chunk.strip())
                current_chunk = sentence + " "

        if current_chunk:
            chunks.append(current_chunk.strip())

    print(f'Processing transcript in {len(chunks)} chunks...')

    # Process each chunk
    summary_chunks = []
    for i, chunk in enumerate(chunks):
        print(f'Summarizing chunk {i+1}/{len(chunks)}...')

        # Add summarization prompt
        prompt = f"""Summarize this text: {chunk}"""

        summary_part = summarizer(prompt, max_length=150, min_length=30)
        summary_chunks.append(summary_part[0]['summary_text'])

    # Combine all summaries
    if len(summary_chunks) > 1:
        print('Generating final summary from all chunks...')
        combined_summary = " ".join(summary_chunks)
        final_prompt = f"""Create a coherent summary with a title, introduction,
        key points as bullet points, and a conclusion from this text: {combined_summary}"""

        final_summary = summarizer(final_prompt, max_length=300, min_length=100)[0]['summary_text']
    else:
        final_summary = summary_chunks[0]

    print('Summarization completed!')
    return final_summary

# Alternative summarization function using a different open source model
def summarize_with_llama(transcript_filename):
    """Use a Llama-based model for summarization. This is more powerful but may require
    more resources than the T5 model."""

    if not os.path.exists(transcript_filename):
        print('The transcript file does not exist!')
        return False

    print('Loading TinyLlama for summarization...')
    # Use TinyLlama which is smaller and can run on Colab's free tier
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

    # Load in 4-bit to save memory
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        load_in_4bit=True,
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    with open(transcript_filename, 'r', encoding='utf-8') as f:
        transcript = f.read()

    # Calculate available context window
    max_input_length = 2048  # Define maximum context length

    # Handle long transcripts
    if len(transcript) > max_input_length - 500:  # Reserve 500 tokens for the prompt
        print(f"Transcript too long ({len(transcript)} chars), truncating to fit context window")
        transcript = transcript[:max_input_length - 500]

    # Create a prompt with instructions
    prompt = f"""<|system|>
You are a helpful AI assistant that creates concise summaries.
<|user|>
Create a summary of the following text.
Text: {transcript}

Add a title to the summary.
Your summary should be informative and factual, covering the most important aspects of the topic.
Start your summary with an INTRODUCTION PARAGRAPH that gives an overview of the topic FOLLOWED by BULLET POINTS if possible AND end the summary with a CONCLUSION PHRASE.
<|assistant|>
"""

    print('Generating summary...')
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    outputs = model.generate(
        inputs["input_ids"],
        max_new_tokens=500,
        temperature=0.7,
        top_p=0.9,
        do_sample=True
    )

    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract only the assistant's response
    summary = summary.split("<|assistant|>")[-1].strip()

    print('Summary generation completed!')
    return summary

# Main execution
if __name__ == "__main__":
    print("="*50)
    print("YouTube Transcription and Summarization Tool")
    print("(Using Open Source Models on Google Colab Free Tier)")
    print("="*50)

    # Ask for YouTube link
    link = input('Enter the YouTube video URL: ')

    # Download audio
    mp3_file = youtube_audio_downloader(link)
    if not mp3_file:
        print("Failed to download audio. Please try a different video or check the URL.")
        exit()

    # Ask if the content is not in English
    not_english_input = input('Is the content not in English? (y/n): ').lower()
    not_english = not_english_input.startswith('y')

    # Transcribe
    transcript_file = transcribe(mp3_file, not_english=not_english)
    if not transcript_file:
        print("Failed to transcribe audio.")
        exit()

    # Choose summarization method based on available resources
    print("\nChoose summarization method:")
    print("1. FLAN-T5 (faster, less RAM usage)")
    print("2. TinyLlama (better quality, more RAM required)")

    model_choice = input('Enter your choice (1 or 2): ')

    try:
        if model_choice == '2':
            summary = summarize_with_llama(transcript_file)
        else:
            summary = summarize(transcript_file)

        # Save summary to file
        summary_file = f"summary_{os.path.basename(transcript_file)}"
        with open(summary_file, 'w', encoding='utf-8') as f:
            f.write(summary)

        print('\n\nSUMMARY:')
        print('='*50)
        print(summary)
        print('='*50)
        print(f"\nSummary saved to {summary_file}")

    except Exception as e:
        print(f"Error during summarization: {e}")
        print("If you selected TinyLlama and encountered an error, try FLAN-T5 instead (it requires less RAM).")

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-568hd2q6
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-568hd2q6
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
YouTube Transcription and Summarization Tool
(Using Open Source Models on Google Colab Free Tier)
Enter the YouTube video URL: https://www.youtube.com/watch?v=CBYhVcO4WgI&t=308s&pp=ygUSc3RhcnR1cCBzYW0gYWx0bWFu
Downloading the audio stream...
Download completed successfully!
Is the content not in English? (y/n): y
Loading Whisper model...
Starting transcription...
Transcription completed!
Transcript saved to transcript-audio_1746612046.txt

Choose summarization method:
1. 

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading TinyLlama for summarization...
Transcript too long (48608 chars), truncating to fit context window
Generating summary...
Summary generation completed!


SUMMARY:
Welcome to CS1-83B, the nine-year-old startup incubator that teaches 17 classes and offers advice to 720 companies. The guest speakers are experts in the creation of billion-dollar companies, and the advice is based on practical experience.

Summary saved to summary_transcript-audio_1746612046.txt
