# YouTube Summarizer

This notebook will show you an example of how to:
- Use local LLM API via Ollama and again via LangChain
- Use Llama 3-8B model
- Build UI with Gradio
- Use case = "Summarize YouTube video using Llama 3"

Specifically, we will first retrieve a transcript of the target YouTube video (specified via a URL), then we will ask Llama 3 to summarize it. We do it this way because Llama 3 only understands text at the moment.

In [14]:
# import pytube
import html
import json
import re
from datetime import datetime
from pathlib import Path
from urllib.request import urlopen

import gradio as gr
import requests
import tiktoken
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.llms import Ollama
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from youtube_transcript_api import YouTubeTranscriptApi

In [None]:
def _decode_json_string(raw: str) -> str:
    """Decode the JSON escape sequences found in the body of a JSON string literal."""
    try:
        # strict=False tolerates raw control characters inside the string.
        return json.loads(f'"{raw}"', strict=False)
    except ValueError:
        # The capture was not a valid JSON string body; keep it as-is.
        return raw


def get_youtube_info(url: str):
    """Get video title and description by scraping the YouTube watch page.

    Args:
        url (str): YouTube video URL.

    Returns:
        tuple[str, str]: ``(title, description)``. Falls back to
        "Unknown Title" / "No description available" when the corresponding
        field cannot be found in the page source.

    Raises:
        ValueError: If no video ID can be extracted from ``url``.
        OSError: If the page cannot be fetched (network error or timeout).
    """
    video_id = extract_video_id(url)
    if not video_id:
        raise ValueError("Invalid YouTube URL")

    # Get video page content; the context manager closes the connection and
    # the timeout keeps the UI from hanging forever on a stalled request.
    video_url = f"https://youtube.com/watch?v={video_id}"
    with urlopen(video_url, timeout=10) as response:
        content = response.read().decode('utf-8', errors='replace')

    # The fields are read from JSON embedded in the page. The capture group
    # accepts backslash escapes so an escaped quote does not cut the value short.
    json_string_body = r'((?:[^"\\]|\\.)+)'

    # Extract title
    title_match = re.search(r'"title":"' + json_string_body + r'"', content)
    if title_match:
        title = html.unescape(_decode_json_string(title_match.group(1)))
    else:
        title = "Unknown Title"

    # Extract description
    desc_match = re.search(
        r'"description":\{"simpleText":"' + json_string_body + r'"', content
    )
    if desc_match:
        description = html.unescape(_decode_json_string(desc_match.group(1)))
    else:
        description = "No description available"

    return title, description
    
    
def extract_video_id(url):
    """Extract the 11-character YouTube video ID from a URL.

    Explicit URL shapes are tried first (``?v=``/``&v=`` query parameter,
    ``youtu.be/`` short links, ``/embed/``, ``/shorts/``, ``/live/`` and
    ``/v/`` paths). A generic "``v=`` or ``/`` followed by 11 ID characters"
    pattern is kept as a last resort so that inputs accepted before are still
    accepted.

    Args:
        url (str): YouTube video URL.

    Returns:
        str | None: The video ID, or None when ``url`` is not a non-empty
        string or no ID-like token is found.
    """
    if not isinstance(url, str) or not url:
        return None

    video_id_chars = r'([0-9A-Za-z_-]{11})'
    # Ordered from most to least specific: the generic fallback would otherwise
    # grab any 11-character path segment (e.g. "watch_popup") before the real
    # ``v=`` parameter is ever examined.
    patterns = [
        r'[?&]v=' + video_id_chars,
        r'youtu\.be/' + video_id_chars,
        r'/(?:embed|shorts|live|v)/' + video_id_chars,
        r'(?:v=|/)' + video_id_chars,
    ]

    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None

def get_text_splitter(chunk_size: int, overlap_size: int):
    """Create the text splitter used to cut a transcript into chunks.

    Args:
        chunk_size (int): Forwarded as ``chunk_size`` to the splitter factory.
        overlap_size (int): Forwarded as ``chunk_overlap`` to the splitter factory.

    Returns:
        The splitter built by ``RecursiveCharacterTextSplitter.from_tiktoken_encoder``.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": overlap_size,
    }
    return RecursiveCharacterTextSplitter.from_tiktoken_encoder(**splitter_options)


def get_youtube_transcript(url):
    """
    Fetch the transcript of a YouTube video and estimate its token count.

    Args:
        url (str): YouTube video URL

    Returns:
        tuple[str, int]: The transcript entries joined by single spaces, and
        the number of tokens that text encodes to with tiktoken's encoding
        for "gpt-4". On any failure no exception is raised; the result is
        ("Error: <message>", 0) instead.
    """
    try:
        # Resolve the video ID first; without one there is nothing to fetch
        vid = extract_video_id(url)
        if not vid:
            raise ValueError("Invalid YouTube URL")

        # NOTE(review): newer youtube-transcript-api releases reportedly move
        # to an instance-based fetch() API - confirm against the installed version.
        segments = YouTubeTranscriptApi.get_transcript(vid)

        # Stitch the individual caption entries into one string
        pieces = [segment['text'] for segment in segments]
        joined_text = ' '.join(pieces)

        # Token count is an estimate based on the GPT-4 tokenizer
        token_ids = tiktoken.encoding_for_model("gpt-4").encode(joined_text)
    except Exception as exc:
        return f"Error: {str(exc)}", 0
    else:
        return joined_text, len(token_ids)


def get_transcription_summary(url: str, temperature: float, chunk_size: int, overlap_size: int,
                              model: str = "llama3.2", base_url: str = "http://localhost:11434"):
    """Summarize a YouTube video's transcript with a local Ollama model.

    The transcript is split into chunks, each chunk is summarized on its own
    (map step) and the partial summaries are merged into one final summary
    (reduce step).

    Args:
        url (str): YouTube video URL.
        temperature (float): Sampling temperature passed to the model.
        chunk_size (int): Chunk size handed to the text splitter.
        overlap_size (int): Chunk overlap handed to the text splitter.
        model (str): Name of the Ollama model to use.
        base_url (str): Base URL of the Ollama server.

    Returns:
        str: The final summary. If the transcript could not be retrieved, the
        "Error: ..." message from ``get_youtube_transcript`` is returned
        unchanged instead of being sent to the model.
    """
    transcript, tokencount = get_youtube_transcript(url)
    if tokencount == 0:
        # get_youtube_transcript reports failures as ("Error: ...", 0), and an
        # empty transcript also has zero tokens. In both cases there is nothing
        # to summarize, so do not hand the text to the LLM.
        return transcript or "Error: transcript is empty"

    docs = [Document(
        page_content=transcript,
        metadata={"source": url}
    )]

    # The UI's number fields may deliver floats; the splitter expects integers.
    text_splitter = get_text_splitter(chunk_size=int(chunk_size), overlap_size=int(overlap_size))
    split_docs = text_splitter.split_documents(docs)
    llm = Ollama(
        model=model,
        base_url=base_url,
        temperature=temperature,
    )

    # Prompt applied to every chunk individually (map step).
    map_template = """Write a concise summary of this text section in bullet points.:
    {text}

    CONCISE SUMMARY:"""

    map_prompt = PromptTemplate(
        template=map_template,
        input_variables=["text"]
    )

    # Prompt applied to the collected chunk summaries (reduce step).
    combine_template = """Combine these summaries into a final summary in bullet points.:
    {text}

    FINAL SUMMARY:"""

    combine_prompt = PromptTemplate(
        template=combine_template,
        input_variables=["text"]
    )

    chain = load_summarize_chain(llm, chain_type="map_reduce", map_prompt=map_prompt, combine_prompt=combine_prompt)
    output = chain.invoke(split_docs)
    return output['output_text']

In [8]:
# Close the app left over from a previous run of this cell so its port is freed.
try:
    demo.close()
except Exception:
    # First run on a fresh kernel (`demo` is not defined yet) or nothing to close.
    pass


with gr.Blocks() as demo:
    gr.Markdown("""# YouTube Summarizer with Llama 3
                """)

    # Row 0: URL input plus the info / clear buttons.
    with gr.Row(equal_height=True) as r0:
        with gr.Column(scale=4) as r0c1:
            url = gr.Textbox(label='YouTube URL', value="https://youtu.be/bvPDQ4-0LAQ")
        with gr.Column(scale=1) as r0c2:
            bttn_info_get = gr.Button('Get Info', variant='primary')
            bttn_clear = gr.ClearButton(interactive=True, variant='stop')

    # Row 1: video title and description.
    with gr.Row(variant='panel') as r1:
        with gr.Column(scale=2) as r1c1:
            title = gr.Textbox(label='Title', lines=2, max_lines=10, show_copy_button=True)
        with gr.Column(scale=3) as r1c2:
            desc = gr.Textbox(label='Description', max_lines=10, autoscroll=False, show_copy_button=True)

    # Row 2: transcript / summary actions and the summarization settings.
    with gr.Row(equal_height=True) as r2:
        with gr.Column() as r2c1:
            bttn_trns_get = gr.Button("Get Transcription", variant='primary')
            tkncount = gr.Number(label='Token Count (est)')
        with gr.Column() as r2c3:
            bttn_summ_get = gr.Button("Summarize", variant='primary')
            with gr.Row():
                with gr.Column(scale=1, min_width=100):
                    # precision is the number of decimal places to round to; a
                    # negative value rounds to tens/hundreds and would turn
                    # every typical temperature into 0.0.
                    temperature = gr.Number(label='Temperature', minimum=0.0, step=0.01, precision=2)
                with gr.Column(scale=1, min_width=100):
                    chunk = gr.Number(label='Chunk Size', minimum=200, step=100, value=4000)
                with gr.Column(scale=1, min_width=100):
                    overlap = gr.Number(label='Overlap Size', minimum=0, step=10, value=0)

    # Row 3: raw transcript next to its summary.
    with gr.Row() as r3:
        with gr.Column() as r3c1:
            trns_raw = gr.Textbox(label='Transcript', show_copy_button=True)
        with gr.Column() as r3c2:
            trns_sum = gr.Textbox(label="Summary", show_copy_button=True)

    # Event wiring.
    bttn_info_get.click(fn=get_youtube_info,
                        inputs=url,
                        outputs=[title, desc],
                        api_name="get_youtube_info")
    bttn_trns_get.click(fn=get_youtube_transcript,
                        inputs=url,
                        outputs=[trns_raw, tkncount])
    bttn_summ_get.click(fn=get_transcription_summary,
                        inputs=[url, temperature, chunk, overlap],
                        outputs=trns_sum)

    # Components reset by the clear button.
    bttn_clear.add([url, title, desc, trns_raw, trns_sum, tkncount])

demo.launch(share=False)

Closing server running on port: 7860
* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


