<a href="https://colab.research.google.com/github/ayushd204/tubeTalk/blob/side-branch/main_(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ---
# CELL 1: INSTALLING DEPENDENCIES (UPDATED)
# ---
!pip install --upgrade -q langchain langchain-community langchain-core langchain_google_genai google-generativeai==0.1.3 faiss-cpu youtube_transcript_api sentence-transformers gradio

In [None]:
# ---
# CELL 2: IMPORTING LIBRARIES
# ---
# We've added 'getpass' for secure API key entry, 're' for URL processing,
# 'os' for file system checks, and more from 'gradio' and 'langchain'.

import os
import re
import getpass
import gradio as gr

# For transcript loading
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled

# For LangChain components
from langchain_community.document_loaders import YoutubeLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAI
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_core.output_parsers import StrOutputParser

In [None]:
# CELL 3: SETTING UP THE GOOGLE API KEY
# Reuse a key that is already present in the environment; otherwise ask for it
# with getpass so the value is typed into a password-style prompt instead of
# being written into the notebook.
# (If a key was ever pasted in plain text, revoke it and generate a new one.)
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
if GOOGLE_API_KEY is None:
    GOOGLE_API_KEY = getpass.getpass("Enter your Google API Key: ")
    # Also store the entered key in os.environ for the rest of the session.
    os.environ['GOOGLE_API_KEY'] = GOOGLE_API_KEY

In [None]:
import google.generativeai as genai
import textwrap

# Diagnostic cell: configure the Google SDK with the key from CELL 3 and print
# every model that supports text generation and/or embeddings.

# Make sure you have run CELL 3 to define GOOGLE_API_KEY
try:
    genai.configure(api_key=GOOGLE_API_KEY)
except NameError:
    print("Error: GOOGLE_API_KEY is not defined. Please run Cell 3 first.")
except Exception as e:
    print(f"An error occurred during configuration: {e}")

print("--- Listing Available Google Models ---")

try:
    for m in genai.list_models():
        # Shorten once per model; fall back to "" so a model without a
        # description cannot make textwrap.shorten() fail.
        description = textwrap.shorten(m.description or "", width=100)

        # 'generateContent' is for text/chat (LLM)
        if 'generateContent' in m.supported_generation_methods:
            print(f"\n✅ Text Model: {m.name}")
            print(f"   Description: {description}")

        # 'embedContent' is for embeddings. A model that supports both
        # methods is intentionally listed under both headings.
        if 'embedContent' in m.supported_generation_methods:
            print(f"\n✅ Embedding Model: {m.name}")
            print(f"   Description: {description}")

except Exception as e:
    print(f"\nAn error occurred while listing models: {e}")
    print("Please ensure your API key is correct and has the 'Generative Language API' enabled in your Google Cloud project.")

print("\n-----------------------------------------")

In [None]:
# ---
# CELL 4: DEFINING CORE HELPER FUNCTIONS
# ---
# These are the reusable building blocks of our application.

# This is the same function you had for loading the embedding model.
def download_embeddings():
    """
    Instantiate the sentence-transformer embedding model.

    Prints a progress line before and after construction.

    Returns:
        A HuggingFaceEmbeddings object configured with "all-MiniLM-L6-v2".
    """
    print("Downloading embedding model...")
    model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    print("Embedding model downloaded.")
    return model

def get_video_id(url):
    """
    Extract the 11-character video ID from a YouTube URL.

    Recognised forms include watch?v=<id> (also with other query parameters
    before it), youtu.be/<id>, /embed/<id>, /e/<id>, /v/<id>, /shorts/<id>
    and /live/<id>. The scheme and the "www." prefix are optional.

    Args:
        url: The text pasted by the user. Anything that is not a string is
            treated as "no URL".

    Returns:
        The video ID as a string, or None when no ID can be found.
    """
    if not isinstance(url, str):
        return None

    # The (?:shorts|live) alternative is listed last on purpose: every URL the
    # earlier alternatives already matched keeps resolving to the same ID.
    regex = (
        r"(?:https?:\/\/)?(?:www\.)?"
        r"(?:youtube\.com\/(?:[^\/\n\s]+\/\S+\/|(?:v|e(?:mbed)?)\/|\S*?[?&]v=|"
        r"(?:shorts|live)\/)|"
        r"youtu\.be\/)([a-zA-Z0-9_-]{11})"
    )
    match = re.search(regex, url)
    return match.group(1) if match else None

def _format_docs(docs):
    """Join the text of the retrieved documents into one context string."""
    return "\n\n".join(doc.page_content for doc in docs)


def create_rag_chain(vector_store):
    """
    Create the full RAG (Retrieval-Augmented Generation) chain.

    Args:
        vector_store: The vector store holding the transcript chunks; it is
            turned into a similarity retriever.

    Returns:
        An LCEL chain. Invoke it with the user's question (a string); it
        returns the model's answer as a string.
    """
    # 1. Define the Retriever.
    # The number of chunks must be passed through `search_kwargs`; a bare
    # `search_k=3` keyword is not a retriever option, so the intended limit
    # of 3 chunks never took effect.
    retriever = vector_store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 3},
    )

    # 2. Define the Prompt Template
    template = """
    You are a helpful YouTube assistant. Use the following context to answer the user's question.
    If you don't know the answer, just say "I don't know." Do not try to make up an answer.
    Answer in a conversational and helpful tone.

    CONTEXT:
    {context}

    QUESTION:
    {question}

    ANSWER:
    """
    prompt = PromptTemplate.from_template(template)

    # 3. Define the LLM
    llm = GoogleGenerativeAI(model="gemini-2.5-flash", google_api_key=GOOGLE_API_KEY)

    # 4. Create the Chain using LangChain Expression Language (LCEL)
    # This part runs in parallel:
    # - "context": the question goes to the retriever, and the retrieved
    #   documents are flattened to plain text so the prompt receives the
    #   transcript text rather than the repr of a list of Document objects.
    # - "question": the question is passed through unchanged.
    setup_and_retrieval = RunnableParallel(
        {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    )

    # The output of the parallel step is fed into the prompt,
    # then to the LLM, and finally parsed as a string.
    chain = setup_and_retrieval | prompt | llm | StrOutputParser()

    return chain

In [None]:
# ---
# CELL 5: DEFINING GRADIO UI FUNCTIONS
# ---
# These are the functions that Gradio will call directly.

# Directory where the FAISS index of the most recently processed video is
# saved by process_video and loaded again by respond_to_chat.
INDEX_NAME = "faiss_video_index"

def _processing_failed(message, **textbox_kwargs):
    """Build the (status, chatbot, textbox) result that reports a failure.

    The chatbot is cleared and the question textbox is disabled; extra keyword
    arguments are forwarded to the textbox update.
    """
    return (
        message,
        gr.update(value=[]),
        gr.update(interactive=False, **textbox_kwargs),
    )


def process_video(url, embeddings_model):
    """
    Build and save the FAISS index for a YouTube video.

    This function is called when the user clicks "Process Video".

    Args:
        url: The YouTube URL typed by the user.
        embeddings_model: The embedding model used to vectorise the chunks.

    Returns:
        A 3-tuple (status_msg, chatbot_update, textbox_update), one value per
        output component. On success the chatbot is cleared and the textbox
        is enabled; on any failure the chatbot is cleared and the textbox is
        disabled.
    """
    # Treat a whitespace-only value like an empty one.
    if isinstance(url, str):
        url = url.strip()
    if not url:
        return _processing_failed("Please enter a YouTube URL.")

    print(f"Processing video: {url}")

    try:
        # Step 1: Get Video ID
        video_id = get_video_id(url)
        if not video_id:
            return _processing_failed("Error: Invalid YouTube URL.")

        print(f"Extracted Video ID: {video_id}")

        # Step 2: Load and Split Documents using YoutubeLoader
        print("Loading transcript with YoutubeLoader...")
        loader = YoutubeLoader(video_id=video_id, add_video_info=False)
        docs = loader.load()

        # An empty result is treated as "no transcript available".
        if not docs:
            print("Error: No transcript found or loader failed.")
            return _processing_failed("Error: Could not load transcript from this video.")

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        chunks = text_splitter.split_documents(docs)
        print(f"Transcript split into {len(chunks)} chunks.")

        # Without chunks there is nothing to index; report it clearly instead
        # of letting the vector store creation fail with a cryptic error.
        if not chunks:
            print("Error: Transcript produced no text chunks.")
            return _processing_failed("Error: The transcript for this video is empty.")

        # Step 3: Create and Save Vector Store
        if os.path.exists(INDEX_NAME):
            print(f"Removing old index: {INDEX_NAME}")
            import shutil
            shutil.rmtree(INDEX_NAME)

        print("Creating new vector store...")
        vector_store = FAISS.from_documents(documents=chunks, embedding=embeddings_model)

        vector_store.save_local(INDEX_NAME)
        print(f"Vector store saved to disk as '{INDEX_NAME}'.")

        # Return a success message and enable the chat textbox
        success_message = f"✅ Successfully processed video! ({len(chunks)} chunks created). You can now ask questions."

        # Returns: (status_msg, chatbot_update, textbox_update)
        return (
            success_message,
            gr.update(value=[]),  # Clears the chatbot window
            gr.update(interactive=True, placeholder="Ask your question here..."),  # Enables the textbox
        )

    except Exception as e:
        # Boundary handler for the UI: any failure is shown to the user as a
        # status message rather than surfacing as an unhandled exception.
        print(f"An unknown error occurred: {e}")
        return _processing_failed(
            f"An error occurred: {str(e)}",
            placeholder="Error. Please reload.",
        )

def respond_to_chat(question, history, embeddings_model):
    """
    Answer a question about the processed video (called when the user hits Enter).

    Args:
        question: The text typed into the question textbox.
        history: The chatbot's current list of (question, answer) pairs.
        embeddings_model: The embedding model needed to load the saved index.

    Returns:
        A 2-tuple ("", history): the empty string clears the textbox and
        `history` is the conversation including the new exchange. Errors are
        reported as the answer of the new exchange instead of being raised.
    """
    # Be tolerant of an unset chatbot value.
    if history is None:
        history = []

    # Nothing to ask: do not spend an LLM call on a blank message.
    if not question or not question.strip():
        return "", history

    if not os.path.exists(INDEX_NAME):
        history.append((question, "Error: Please process a video first."))
        return "", history

    try:
        print("Loading vector store from disk...")
        # NOTE: this flag allows pickle-based loading of the saved index.
        # That is acceptable here only because the index directory is written
        # by process_video in this same app; never load an index obtained
        # from an untrusted source this way.
        vector_store = FAISS.load_local(
            INDEX_NAME,
            embeddings_model,
            allow_dangerous_deserialization=True
        )

        rag_chain = create_rag_chain(vector_store)

        print(f"Invoking chain with question: {question}")
        response = rag_chain.invoke(question)
        print(f"Got response: {response}")

        history.append((question, response))

        return "", history

    except Exception as e:
        # Boundary handler for the UI: show the failure in the chat window.
        print(f"Error during chat: {e}")
        history.append((question, f"An error occurred: {str(e)}"))
        return "", history

In [None]:
# ---
# CELL 6: LAUNCHING THE GRADIO APPLICATION
# ---
# Builds the Gradio UI, wires the two event handlers and launches the app.
# The chatbot and the question textbox are top-level components so that
# process_video can update each of them directly.

print("Preparing to launch Gradio app...")

# Load the embedding model once when the app starts.
embeddings = download_embeddings()

# Define the Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# 🎬 TubeTalk: Ask Questions About Any YouTube Video")
    gr.Markdown(
        "**Instructions:**\n"
        "1. Paste a YouTube URL (with English transcripts) into the box below.\n"
        "2. Click the 'Process Video' button.\n"
        "3. Wait for the 'Successfully processed' message.\n"
        "4. Ask your questions in the chat window!"
    )

    # Store the embeddings model in a 'State' variable so it can be passed
    # to the event handlers as an input.
    embedding_state = gr.State(embeddings)

    # Section for Video Processing
    with gr.Group():
        with gr.Row():
            url_input = gr.Textbox(
                label="YouTube URL",
                placeholder="https://www.youtube.com/watch?v=..."
            )
            process_button = gr.Button("Process Video", variant="primary")

        status_output = gr.Markdown() # For success/error messages

    # Section for Chatting
    chatbot = gr.Chatbot(
        label="Video Q&A",
        height=400
    )
    msg_textbox = gr.Textbox(
        label="Ask your question:",
        placeholder="Process a video first...",
        interactive=False # Starts disabled; process_video enables it on success
    )

    # --- Event Wiring ---

    # 1. When the 'Process Video' button is clicked, process_video returns
    #    one value for each of the three output components.
    process_button.click(
        fn=process_video,
        inputs=[url_input, embedding_state],
        outputs=[status_output, chatbot, msg_textbox]
    )

    # 2. When the user hits 'Enter' in the textbox, respond_to_chat clears
    #    the textbox and returns the updated conversation.
    msg_textbox.submit(
        fn=respond_to_chat,
        inputs=[msg_textbox, chatbot, embedding_state],
        outputs=[msg_textbox, chatbot]
    )

print("Gradio app is ready. Launching...")
# debug=True keeps the cell running so errors and logs stay visible.
demo.launch(debug=True)

Preparing to launch Gradio app...
Downloading embedding model...
Embedding model downloaded.


  chatbot = gr.Chatbot(


Gradio app is ready. Launching...
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://ab67f40f6f95d1b01e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Processing video: https://www.youtube.com/watch?v=d0XKtUXgpOw
Extracted Video ID: d0XKtUXgpOw
Loading transcript with YoutubeLoader...
Transcript split into 16 chunks.
Removing old index: faiss_video_index
Creating new vector store...
Vector store saved to disk as 'faiss_video_index'.


  state[block._id] = block.__class__(**kwargs)


Loading vector store from disk...
Invoking chain with question: Are stocks discussed in the video?
Got response: Yes, stocks are discussed in the video! The speaker mentions the value of all the stocks in the United States and even talks about owning a single stock.
Loading vector store from disk...
Invoking chain with question: are there dinosauruss in the video
Got response: I don't know.
Processing video: https://www.youtube.com/watch?v=d0XKtUXgpOw
Extracted Video ID: d0XKtUXgpOw
Loading transcript with YoutubeLoader...
Transcript split into 16 chunks.
Removing old index: faiss_video_index
Creating new vector store...
Vector store saved to disk as 'faiss_video_index'.


  state[block._id] = block.__class__(**kwargs)


Loading vector store from disk...
Invoking chain with question: Explain the video. Tell me the three main points of the video
Got response: This video features Warren Buffett, who is the chairman, CEO, and largest shareholder of Berkshire Hathaway and considered one of the world's most successful investors. It seems to be part of a "Mentor Me" series, aiming to share insights from highly successful individuals.

Here are three main points from the video:

1.  **Cash is a bad investment:** Warren Buffett states that cash is always a bad investment because it doesn't produce anything and is sure to go down in value over time. He'd much rather have good businesses than surplus cash.
2.  **Gold is not a productive asset:** He illustrates this by comparing the value of all the gold in the world ($7 trillion) to the value of productive assets like a third of all US stocks, all the Farmland in the US, seven Exxon Mobils, and a trillion dollars in cash. He emphasizes that gold just "shines" an