## 📦 Step 1: Install Required Packages

In [None]:
# Install the notebook's dependencies with one pip shell command per package
# (`-q` is passed to every call).
!pip install -q youtube-transcript-api
!pip install -q langchain
!pip install -q langchain-openai
!pip install -q langchain-huggingface
!pip install -q langchain-community
!pip install -q langchain-text-splitters
!pip install -q langchain-chroma
!pip install -q chromadb
!pip install -q openai
!pip install -q gradio
!pip install -q huggingface_hub
!pip install -q sentence-transformers
!pip install -q torch
# NOTE(review): this message is printed unconditionally; nothing here checks
# whether the pip commands above actually succeeded.
print("‚úÖ All packages installed successfully!")

## 🔑 Step 2: Choose AI Provider & Set API Keys

### Option A: OpenAI (Paid - Best Quality)
- **Cost:** ~$0.0004 per 1K tokens (~$0.02 per video)
- **Models:** GPT-3.5-turbo, text-embedding-ada-002
- **Get key:** https://platform.openai.com/api-keys

### Option B: HuggingFace (FREE! 🎉)
- **Cost:** Completely free!
- **Models:** Mistral-7B-Instruct, all-MiniLM-L6-v2
- **Get token:** https://huggingface.co/settings/tokens

**Change `AI_PROVIDER` below to your choice:**

In [None]:
import getpass
import os

# ========================================
# CHOOSE YOUR AI PROVIDER HERE
# ========================================
AI_PROVIDER = "HuggingFace"  # Options: "OpenAI" or "HuggingFace"
# ========================================

print(f"ü§ñ Selected AI Provider: {AI_PROVIDER}\n")

# Colab Secrets are only importable inside Google Colab; anywhere else the
# import fails and the credential is requested interactively instead.
try:
    from google.colab import userdata
    use_secrets = True
except ImportError:
    use_secrets = False


def _load_credential(secret_name: str, prompt: str, label: str) -> str:
    """Return a credential from Colab Secrets, or prompt for it as a fallback.

    Args:
        secret_name: Name of the secret to look up via ``userdata.get``.
        prompt: Text shown when the user has to type the credential.
        label: Human-readable name used in the status messages.

    Returns:
        The non-empty credential string.

    Raises:
        ValueError: If the user submits an empty value at the prompt.
    """
    if use_secrets:
        try:
            stored = userdata.get(secret_name)
        except Exception:
            # Best effort: a missing or inaccessible secret falls back to the
            # prompt below (KeyboardInterrupt is no longer swallowed).
            stored = None
        if stored:
            print(f"‚úÖ {label} loaded from Colab Secrets")
            return stored

    # getpass keeps the secret out of the (saved) notebook output, unlike input().
    typed = getpass.getpass(prompt).strip()
    if not typed:
        raise ValueError(f"No {label} was provided.")
    print(f"‚úÖ {label} entered")
    return typed


if AI_PROVIDER == "OpenAI":
    print("üìù OpenAI Setup")
    print("Get your API key from: https://platform.openai.com/api-keys\n")

    OPENAI_API_KEY = _load_credential(
        'OPENAI_API_KEY', "Enter your OpenAI API key: ", "OpenAI API key"
    )
    os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

elif AI_PROVIDER == "HuggingFace":
    print("üìù HuggingFace Setup (FREE!)")
    print("Get your token from: https://huggingface.co/settings/tokens\n")

    HF_TOKEN = _load_credential(
        'HF_TOKEN', "Enter your HuggingFace token: ", "HuggingFace token"
    )
    os.environ['HUGGINGFACEHUB_API_TOKEN'] = HF_TOKEN

else:
    # Fail here instead of reporting success and breaking in a later cell.
    raise ValueError(
        f"Unsupported AI_PROVIDER {AI_PROVIDER!r}; expected \"OpenAI\" or \"HuggingFace\"."
    )

print(f"\n‚úÖ {AI_PROVIDER} configured successfully!")

## 📚 Step 3: Import Libraries

In [None]:
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain.chains.retrieval_qa.base import RetrievalQA
import json

# Import provider-specific libraries
if AI_PROVIDER == "OpenAI":
    from langchain_openai import OpenAIEmbeddings, ChatOpenAI
    print("‚úÖ OpenAI libraries imported")
elif AI_PROVIDER == "HuggingFace":
    from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
    from huggingface_hub import InferenceClient
    print("‚úÖ HuggingFace libraries imported")

print("‚úÖ All libraries loaded successfully!")

## 🎬 Step 4: YouTube Transcript Fetcher

In [None]:
from urllib.parse import parse_qs, urlparse


class YouTubeTranscriptFetcher:
    """Fetches YouTube video transcripts and flattens them to plain text."""

    # Path prefixes whose next path segment is the video ID.
    _PATH_ID_PREFIXES = ("shorts", "embed", "live", "v")

    @staticmethod
    def extract_video_id(url: str) -> str:
        """Extract the video ID from a YouTube URL.

        Handles ``watch?v=``, ``youtu.be/<id>`` and ``/shorts/``, ``/embed/``,
        ``/live/``, ``/v/`` style URLs, ignoring extra query parameters,
        fragments and trailing slashes.

        Args:
            url: A YouTube URL or a bare video ID.

        Returns:
            The video ID, or the stripped input when it is not a recognised
            YouTube URL (it is then assumed to already be a video ID).
        """
        candidate = url.strip()
        if "youtube.com" not in candidate and "youtu.be" not in candidate:
            return candidate  # Already a video ID

        # urlparse only fills in the host when a "//" authority marker is present.
        has_authority = "//" in candidate.split("?", 1)[0]
        parsed = urlparse(candidate if has_authority else f"https://{candidate}")
        host = (parsed.hostname or "").lower()
        segments = [part for part in parsed.path.split("/") if part]

        if host == "youtu.be" or host.endswith(".youtu.be"):
            if segments:
                return segments[0]
        else:
            v_values = parse_qs(parsed.query).get("v")
            if v_values and v_values[0]:
                return v_values[0]
            if (
                len(segments) >= 2
                and segments[0] in YouTubeTranscriptFetcher._PATH_ID_PREFIXES
            ):
                return segments[1]

        return candidate

    def fetch_transcript(self, video_id: str) -> dict:
        """Fetch the transcript for a single video.

        Args:
            video_id: A video ID or YouTube URL.

        Returns:
            A dict with ``video_id``, ``transcript`` (all segment texts joined
            by spaces), ``segments`` (the raw segment list) and ``length``
            (character count of ``transcript``).

        Raises:
            Exception: If transcripts are disabled, none is found, or any other
                error occurs; the original error is attached as ``__cause__``.
        """
        video_id = self.extract_video_id(video_id)

        try:
            if hasattr(YouTubeTranscriptApi, "get_transcript"):
                transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
            else:
                # Newer youtube-transcript-api releases drop the static helper in
                # favour of an instance API — TODO confirm against the installed version.
                transcript_list = YouTubeTranscriptApi().fetch(video_id).to_raw_data()

            # Combine all text
            full_text = " ".join(entry['text'] for entry in transcript_list)

            return {
                'video_id': video_id,
                'transcript': full_text,
                'segments': transcript_list,
                'length': len(full_text)
            }
        except TranscriptsDisabled as exc:
            raise Exception(f"‚ùå Transcripts are disabled for video: {video_id}") from exc
        except NoTranscriptFound as exc:
            raise Exception(f"‚ùå No transcript found for video: {video_id}") from exc
        except Exception as exc:
            raise Exception(f"‚ùå Error: {str(exc)}") from exc

    def fetch_multiple(self, video_ids: list) -> list:
        """Fetch transcripts for several videos, skipping the ones that fail.

        Args:
            video_ids: Video IDs or YouTube URLs.

        Returns:
            The transcript dicts of the videos that were fetched successfully,
            in input order. Failures are printed, not raised.
        """
        transcripts = []
        print(f"\nüì• Fetching {len(video_ids)} video(s)...\n")

        for i, video_id in enumerate(video_ids, 1):
            print(f"[{i}/{len(video_ids)}] Processing: {video_id}")
            try:
                transcript = self.fetch_transcript(video_id)
            except Exception as e:
                # Best effort by design: report the failure and keep going.
                print(f"  {str(e)}\n")
            else:
                transcripts.append(transcript)
                chars = transcript['length']
                print(f"  ‚úÖ Success! Got {chars:,} characters\n")

        return transcripts

print("‚úÖ Transcript fetcher ready")

## 🎯 Step 5: Add Your YouTube Videos

Enter video IDs or full URLs (comma-separated)

**Examples:**
- `dQw4w9WgXcQ`
- `https://www.youtube.com/watch?v=dQw4w9WgXcQ`
- `jNQXAC9IVRw, 9bZkp7q19f0`

In [None]:
# Enter your video IDs here (or leave blank to input manually)
VIDEO_IDS = [
    # Add video IDs here, for example:
    # "dQw4w9WgXcQ",
    # "jNQXAC9IVRw",
]

# Defined up front so later cells can test `if not transcripts` even when no
# video ID is provided (otherwise the name would not exist at all).
transcripts = []

# Manual input if list is empty
if not VIDEO_IDS:
    manual_input = input("Enter YouTube video IDs (comma-separated): ").strip()
    # Drop the empty entries that stray or trailing commas would produce.
    VIDEO_IDS = [v.strip() for v in manual_input.split(',') if v.strip()]

if not VIDEO_IDS:
    print("‚ùå No video IDs provided. Please add videos in the cell above.")
else:
    # Fetch transcripts
    fetcher = YouTubeTranscriptFetcher()
    transcripts = fetcher.fetch_multiple(VIDEO_IDS)

    if transcripts:
        total_chars = sum(t['length'] for t in transcripts)
        print(f"\n‚úÖ Successfully fetched {len(transcripts)} transcript(s)!")
        print(f"üìä Total: {total_chars:,} characters")
    else:
        print("\n‚ùå No transcripts were fetched successfully.")
        print("üí° Tip: Make sure videos have captions enabled!")

## ✂️ Step 6: Create Text Chunks

In [None]:
# Defined up front so the next cell's `if not chunks` check works even when
# there is nothing to split.
chunks = []

if not transcripts:
    print("‚ùå No transcripts available. Please run Step 5 again.")
else:
    # Create LangChain documents: one per transcript, tagged with its source video.
    documents = [
        Document(
            page_content=transcript['transcript'],
            metadata={
                'video_id': transcript['video_id'],
                'url': f"https://www.youtube.com/watch?v={transcript['video_id']}"
            }
        )
        for transcript in transcripts
    ]

    # Split into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )

    chunks = text_splitter.split_documents(documents)

    print(f"‚úÖ Created {len(chunks)} text chunks")
    # Guard the average: an empty chunk list would otherwise divide by zero.
    if chunks:
        print(f"üìä Average chunk size: {sum(len(c.page_content) for c in chunks) // len(chunks)} characters")

## 🗄️ Step 7: Create Vector Database with Embeddings

This creates embeddings for semantic search.

In [None]:
# Defined up front so the next cell's `if not vectorstore` check works even
# when this cell bails out before building the store.
vectorstore = None

if not chunks:
    print("‚ùå No chunks available. Please run Step 6 again.")
else:
    print(f"üîÑ Creating embeddings using {AI_PROVIDER}...")
    print("‚è≥ This may take 1-3 minutes...\n")

    # Create embeddings based on provider
    if AI_PROVIDER == "OpenAI":
        embeddings = OpenAIEmbeddings(
            model="text-embedding-ada-002"
        )
        print("Using OpenAI text-embedding-ada-002")

    elif AI_PROVIDER == "HuggingFace":
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        print("Using HuggingFace all-MiniLM-L6-v2 (free!)")

    else:
        # Without this branch an unknown provider surfaces as a NameError below.
        raise ValueError(
            f"Unsupported AI_PROVIDER {AI_PROVIDER!r}; expected \"OpenAI\" or \"HuggingFace\"."
        )

    # Create vector store
    # NOTE(review): the same persist_directory is reused on every run, so
    # re-running this cell may add to an already persisted store — confirm.
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory="./chroma_db"
    )

    print(f"\n‚úÖ Vector database created!")
    print(f"üìä {len(chunks)} chunks embedded and indexed")

## 🤖 Step 8: Create RAG Chatbot

In [None]:
# Defined up front so the chat helpers' `qa_chain` check works even when this
# cell bails out before building the chain.
qa_chain = None

if not vectorstore:
    print("‚ùå Vector database not created. Please run Step 7 again.")
else:
    print(f"üîÑ Setting up {AI_PROVIDER} chat model...\n")

    # Create LLM based on provider
    if AI_PROVIDER == "OpenAI":
        llm = ChatOpenAI(
            model="gpt-3.5-turbo",
            temperature=0.7
        )
        print("Using GPT-3.5-turbo")

    elif AI_PROVIDER == "HuggingFace":
        llm = HuggingFaceEndpoint(
            repo_id="mistralai/Mistral-7B-Instruct-v0.2",
            temperature=0.7,
            max_new_tokens=512,
            huggingfacehub_api_token=os.environ.get('HUGGINGFACEHUB_API_TOKEN')
        )
        print("Using Mistral-7B-Instruct-v0.2 (free!)")

    else:
        # Without this branch an unknown provider surfaces as a NameError below.
        raise ValueError(
            f"Unsupported AI_PROVIDER {AI_PROVIDER!r}; expected \"OpenAI\" or \"HuggingFace\"."
        )

    # Create retrieval QA chain: the 4 most similar chunks are retrieved and
    # handed to the LLM; source documents are returned so answers can cite videos.
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(
            search_kwargs={"k": 4}
        ),
        return_source_documents=True
    )

    print(f"\n‚úÖ RAG Chatbot ready!")
    print(f"üí¨ You can now ask questions about your {len(transcripts)} video(s)")

## 💬 Step 9: Chat Function

Use `chat("your question")` to ask questions

In [None]:
def chat(question: str):
    """Ask a question about your videos and print the answer with its sources.

    Args:
        question: The question to send to the retrieval QA chain.

    Returns:
        The answer text, or ``None`` when the chain is not initialized or the
        query fails (the problem is printed in both cases).
    """
    # Looked up dynamically so a notebook where Step 8 has not run yet gets the
    # friendly message below instead of a NameError.
    chain = globals().get("qa_chain")
    if not chain:
        print("‚ùå Chatbot not initialized. Please run Step 8.")
        return

    print(f"\n‚ùì Question: {question}\n")
    print("ü§î Thinking...\n")

    try:
        # invoke() replaces the deprecated direct call on the chain object.
        result = chain.invoke({"query": question})

        print(f"üí¨ Answer:\n{result['result']}\n")

        # Show sources, listing each video once even if several chunks matched.
        if result.get('source_documents'):
            print("\nüìö Sources:")
            seen_videos = set()
            for doc in result['source_documents']:
                video_id = doc.metadata.get('video_id', 'Unknown')
                if video_id not in seen_videos:
                    seen_videos.add(video_id)
                    print(f"  ‚Ä¢ Video: {video_id}")
                    print(f"    URL: https://www.youtube.com/watch?v={video_id}")

        return result['result']

    except Exception as e:
        print(f"‚ùå Error: {str(e)}")
        return None

print("‚úÖ Chat function ready!")
print("\nüí° Usage: chat('What is this video about?')")

## 🎯 Step 10: Test Chat (Examples)

Try asking questions!

In [None]:
# Example 1: General question
# Sends a broad question through chat(); the return value is not stored here.
chat("What is this video about?")

In [None]:
# Example 2: Summarization
# Sends a summarization request through chat(); the return value is not stored here.
chat("Summarize the main points in 3 bullet points")

In [None]:
# Ask your own question
# Strip the input so a whitespace-only entry is skipped rather than sent to the chain.
question = input("Your question: ").strip()
if question:
    chat(question)

## 🎨 Step 11: Interactive UI with Gradio (Optional)

Launch a beautiful chat interface!

In [None]:
import gradio as gr

def gradio_chat(message, history):
    """Answer a chat message for the Gradio interface.

    Args:
        message: The user's question.
        history: Previous conversation turns supplied by Gradio; not used here.

    Returns:
        The answer as markdown, followed by links to the distinct videos among
        the first three source documents, or an error message string.
    """
    # Looked up dynamically so the UI reports a missing chain instead of
    # raising a NameError when Step 8 has not run.
    chain = globals().get("qa_chain")
    if not chain:
        return "‚ùå Chatbot not initialized. Please run all previous steps."

    try:
        # invoke() replaces the deprecated direct call on the chain object.
        result = chain.invoke({"query": message})

        # Build response with sources
        response = result['result']

        if result.get('source_documents'):
            response += "\n\n---\n**üìö Sources:**\n"
            seen_videos = set()
            for doc in result['source_documents'][:3]:
                video_id = doc.metadata.get('video_id', 'Unknown')
                if video_id not in seen_videos:
                    seen_videos.add(video_id)
                    response += f"- [‚ñ∂Ô∏è {video_id}](https://www.youtube.com/watch?v={video_id})\n"

        return response

    except Exception as e:
        return f"‚ùå Error: {str(e)}"

# Create Gradio interface
# gradio_chat handles each message; the title and description interpolate the
# selected provider and the number of transcripts held in `transcripts`.
demo = gr.ChatInterface(
    fn=gradio_chat,
    title=f"üé• YouTube RAG Chatbot ({AI_PROVIDER})",
    description=f"Ask questions about {len(transcripts)} YouTube video(s) ‚Ä¢ Powered by {AI_PROVIDER}",
    # Example questions handed to the interface.
    examples=[
        "What is the main topic of the video?",
        "Summarize the key points",
        "What are the most important details?",
        "Explain this in simple terms"
    ],
    theme=gr.themes.Soft()
)

# Launch with public link
# NOTE(review): share=True requests a public link, so anyone holding it can
# query the chatbot with the configured credentials — confirm this is intended.
print("üöÄ Launching Gradio interface...\n")
demo.launch(share=True, debug=False)

## 🎉 Congratulations!

Your YouTube RAG Chatbot is now running!

### ✅ What You Can Do:
- **Chat in cells:** Use `chat("your question")` in any code cell
- **Use Gradio UI:** Click the public link above for a web interface
- **Add more videos:** Go back to Step 5 and add new video IDs
- **Switch providers:** Change `AI_PROVIDER` in Step 2 and re-run

### 💡 Tips:
- Videos must have captions/transcripts enabled
- HuggingFace is free but slower than OpenAI
- The more videos you add, the more knowledge the bot has
- Try educational content, tutorials, or lectures for best results

### 🔄 To Add More Videos:
1. Go to **Step 5**
2. Add new video IDs
3. Re-run Steps 5-11

### 📊 Performance:
- **OpenAI:** Fast responses (~2-5 seconds), costs ~$0.02 per video
- **HuggingFace:** Free, slower responses (~10-30 seconds)

---

**Enjoy chatting with your YouTube videos! 🎬💬**