In [None]:
import os
import re
import logging
import time
from urllib.parse import urlparse, parse_qs
from youtube_transcript_api import YouTubeTranscriptApi
from langchain_groq import ChatGroq
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
import uuid
from typing import Dict, Optional, List
import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configuration
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
EDUCATION_LEVEL = "college"
COLLECTION_NAME = "youtube_videos"

# Validate environment
if not GROQ_API_KEY:
    raise ValueError("GROQ_API_KEY environment variable is not set.")

# Initialize Groq LLM
try:
    llm = ChatGroq(
        model_name="llama-3.3-70b-versatile",
        api_key=GROQ_API_KEY,
        temperature=0.7,
        max_tokens=200
    )
except Exception as e:
    logger.error(f"Failed to initialize Groq LLM: {e}")
    raise

# Initialize ChromaDB and embeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = Chroma(
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings,
    persist_directory="./chroma_db"
)

def extract_video_id(url: str) -> Optional[str]:
    """Extract YouTube video ID from URL."""
    try:
        parsed_url = urlparse(url)
        if parsed_url.hostname in ["www.youtube.com", "youtube.com"]:
            query = parse_qs(parsed_url.query)
            return query.get("v", [None])[0]
        elif parsed_url.hostname in ["youtu.be"]:
            return parsed_url.path.lstrip("/")
        logger.warning(f"Invalid YouTube URL: {url}")
        return None
    except Exception as e:
        logger.error(f"Error extracting video ID from {url}: {e}")
        return None

def get_video_transcript(video_id: str) -> Optional[str]:
    """Fetch transcript for a YouTube video with retry."""
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        return " ".join([entry["text"] for entry in transcript])
    except Exception as e:
        logger.warning(f"Transcript unavailable for video {video_id}: {e}")
        return None

def scrape_video_description(url: str) -> Optional[str]:
    """Scrape video description using requests and BeautifulSoup."""
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        
        # Find description in YouTube's metadata
        meta_description = soup.find("meta", {"name": "description"})
        if meta_description and meta_description.get("content"):
            return meta_description["content"]
        
        # Fallback to description in page content
        description_div = soup.find("div", {"id": "description"})
        if description_div:
            return description_div.get_text(strip=True)
        
        return "No description available."
    except Exception as e:
        logger.error(f"Failed to scrape description for {url}: {e}")
        return None

def summarize_content(content: str, video_url: str) -> str:
    """Summarize transcript or description using Groq LLM."""
    if not content or content in ["No transcript available.", "No description available."]:
        return f"No content available for summarization of video {video_url}."
    
    prompt = f"""
    Summarize the following content in 150-200 words. Focus on key concepts, examples, and explanations for a {EDUCATION_LEVEL} student. Keep it concise and clear.

    Content: {content[:4000]}  # Truncate to avoid token limits
    """
    try:
        response = llm.invoke(prompt)
        return response.content.strip()
    except Exception as e:
        logger.error(f"Failed to summarize content: {e}")
        return f"Failed to summarize video due to error: {e}"

def store_in_vector_db(
    video_url: str, transcript: str, summary: str, metadata: Dict
) -> None:
    """Store video data in ChromaDB."""
    try:
        doc_id = str(uuid.uuid4())
        document = Document(
            page_content=f"Video URL: {video_url}\nTranscript: {transcript}\nSummary: {summary}",
            metadata=metadata,
            id=doc_id
        )
        vector_store.add_documents([document])
        logger.info(f"Stored data for video {video_url} in vector database.")
    except Exception as e:
        logger.error(f"Failed to store data in vector database: {e}")
        raise

def query_vector_db(query: str, top_k: int = 1) -> List[Dict]:
    """Retrieve relevant information from the vector database."""
    try:
        results = vector_store.similarity_search_with_score(query, k=top_k)
        responses = []
        for doc, score in results:
            responses.append({
                "video_url": doc.metadata.get("video_url", "N/A"),
                "title": doc.metadata.get("title", "Unknown"),
                "transcript": doc.page_content.split("Transcript: ")[1].split("\nSummary: ")[0],
                "summary": doc.page_content.split("Summary: ")[1],
                "score": score
            })
        return responses
    except Exception as e:
        logger.error(f"Vector database query failed: {e}")
        return []

def process_video(video_url: str, title: str = "Unknown") -> Dict:
    """Process a YouTube video: generate transcript, summarize, and store."""
    try:
        # Extract video ID
        video_id = extract_video_id(video_url)
        if not video_id:
            return {
                "video_url": video_url,
                "transcript": "Invalid YouTube URL",
                "summary": "No summary generated due to invalid URL",
                "stored": False
            }
        
        # Try fetching transcript
        transcript = get_video_transcript(video_id)
        content = transcript
        
        # Fallback to description if transcript is unavailable
        if not transcript:
            logger.info(f"Falling back to video description for {video_url}")
            content = scrape_video_description(video_url) or "No description available."
        
        # Summarize content
        summary = summarize_content(content, video_url)
        
        # Prepare metadata
        metadata = {
            "video_url": video_url,
            "title": title,
            "education_level": EDUCATION_LEVEL
        }
        
        # Store in vector database
        store_in_vector_db(video_url, content, summary, metadata)
        
        return {
            "video_url": video_url,
            "transcript": content,
            "summary": summary,
            "stored": True
        }
    except Exception as e:
        logger.error(f"Error processing video {video_url}: {e}")
        return {
            "video_url": video_url,
            "transcript": f"Error generating transcript: {e}",
            "summary": f"Error generating summary: {e}",
            "stored": False
        }

def main():
    """Main function to run the YouTube video summarizer."""
    while True:
        video_url = input("Enter a YouTube video URL (or 'quit' to exit): ").strip()
        if video_url.lower() == "quit":
            break
        
        title = input("Enter a title for the video (optional, press Enter to skip): ").strip() or "Unknown"
        
        # Process the video
        result = process_video(video_url, title)
        print(f"""
        ▶️ Video URL: {result['video_url']}

        📜 Transcript:
        {result['transcript'][:500] + '...' if len(result['transcript']) > 500 else result['transcript']}

        📝 Summary:
        {result['summary']}

        💾 Stored in vector DB: {'✅' if result['stored'] else '❌'}
        """)
        
        # Query the database
        query = input("Ask a question about the video (or 'skip' to continue): ").strip()
        if query.lower() != "skip":
            responses = query_vector_db(query)
            if responses:
                print("\n🔍 Query Results:")
                for resp in responses:
                    print(f"""
                    Video URL: {resp['video_url']}
                    Title: {resp['title']}
                    Transcript (excerpt): {resp['transcript'][:500] + '...' if len(resp['transcript']) > 500 else resp['transcript']}
                    Summary: {resp['summary']}
                    Relevance Score: {resp['score']:.2f}
                    """)
            else:
                print("No relevant information found in the database.")
        
        # Rate limiting: Wait to avoid IP blocks
        time.sleep(2)

if __name__ == "__main__":
    main()

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
  vector_store = Chroma(
INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=ZhAz268Hdpw! This is most likely caused by:

YouTube is blocking requests from your IP. This usually is due to one of the following reasons:
- You have done too many requests and your IP has been blocked by YouTube
- You are doing requests from an IP belonging to a cloud provider (like AWS, Google Cloud Platform, Azure, etc.). Unfortunately, most IPs from cloud providers are blocked by YouTube.

There 


        ▶️ Video URL: https://www.youtube.com/watch?v=ZhAz268Hdpw&t=1s

        📜 Transcript:
        Enjoy the videos and music that you love, upload original content and share it all with friends, family and the world on YouTube.

        📝 Summary:
        YouTube is a platform where users can access and share various forms of content, including videos and music. The key concept is that it allows users to upload their original content, making it a hub for user-generated media. This can include vlogs, educational videos, music covers, and more. Users can share their uploaded content with friends, family, and a global audience, making it a powerful tool for self-expression and communication.

For example, a college student can create and upload a video about their academic experiences, sharing tips and advice with peers. They can also share their favorite music videos or create their own music content, such as covers or original songs. The platform's vast reach and accessibility make




        ▶️ Video URL: 

        📜 Transcript:
        Invalid YouTube URL

        📝 Summary:
        No summary generated due to invalid URL

        💾 Stored in vector DB: ❌
        

🔍 Query Results:

                    Video URL: https://www.youtube.com/watch?v=ZhAz268Hdpw&t=1s
                    Title: Unknown
                    Transcript (excerpt): Enjoy the videos and music that you love, upload original content and share it all with friends, family and the world on YouTube.
                    Summary: YouTube is a platform where users can access and share various forms of content, including videos and music. The key concept is that it allows users to upload their original content, making it a hub for user-generated media. This can include vlogs, educational videos, music covers, and more. Users can share their uploaded content with friends, family, and a global audience, making it a powerful tool for self-expression and communication.

For example, a college student can c




        ▶️ Video URL: 

        📜 Transcript:
        Invalid YouTube URL

        📝 Summary:
        No summary generated due to invalid URL

        💾 Stored in vector DB: ❌
        

🔍 Query Results:

                    Video URL: https://www.youtube.com/watch?v=ZhAz268Hdpw&t=1s
                    Title: Unknown
                    Transcript (excerpt): Enjoy the videos and music that you love, upload original content and share it all with friends, family and the world on YouTube.
                    Summary: YouTube is a platform where users can access and share various forms of content, including videos and music. The key concept is that it allows users to upload their original content, making it a hub for user-generated media. This can include vlogs, educational videos, music covers, and more. Users can share their uploaded content with friends, family, and a global audience, making it a powerful tool for self-expression and communication.

For example, a college student can c




        ▶️ Video URL: 

        📜 Transcript:
        Invalid YouTube URL

        📝 Summary:
        No summary generated due to invalid URL

        💾 Stored in vector DB: ❌
        

🔍 Query Results:

                    Video URL: https://www.youtube.com/watch?v=ZhAz268Hdpw&t=1s
                    Title: Unknown
                    Transcript (excerpt): Enjoy the videos and music that you love, upload original content and share it all with friends, family and the world on YouTube.
                    Summary: YouTube is a platform where users can access and share various forms of content, including videos and music. The key concept is that it allows users to upload their original content, making it a hub for user-generated media. This can include vlogs, educational videos, music covers, and more. Users can share their uploaded content with friends, family, and a global audience, making it a powerful tool for self-expression and communication.

For example, a college student can c

In [1]:
pip install google-generativeai requests

Collecting google-generativeai
  Downloading google_generativeai-0.8.5-py3-none-any.whl.metadata (3.9 kB)
Collecting google-ai-generativelanguage==0.6.15 (from google-generativeai)
  Downloading google_ai_generativelanguage-0.6.15-py3-none-any.whl.metadata (5.7 kB)
Collecting google-api-core (from google-generativeai)
  Downloading google_api_core-2.25.1-py3-none-any.whl.metadata (3.0 kB)
Collecting google-api-python-client (from google-generativeai)
  Downloading google_api_python_client-2.175.0-py3-none-any.whl.metadata (7.0 kB)
Collecting proto-plus<2.0.0dev,>=1.22.3 (from google-ai-generativelanguage==0.6.15->google-generativeai)
  Downloading proto_plus-1.26.1-py3-none-any.whl.metadata (2.2 kB)
Collecting grpcio-status<2.0.0,>=1.33.2 (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0dev,>=1.34.1->google-ai-generativelanguage==0.6.15->google-generativeai)
  Downloading grpcio_status-1.73.1-py3-none-any.whl.meta

In [None]:
from langchain_groq import ChatGroq
GROQ_API_KEY=os.getenv("GROQ_API_KEY")
llm=ChatGroq(model='gemma')

  from .autonotebook import tqdm as notebook_tqdm


Attempting to retrieve transcript...
Error fetching video details from YouTube API: 400 Client Error: Bad Request for url: https://www.googleapis.com/youtube/v3/videos?part=snippet%2CcontentDetails&id=wvANnvfOKV4&key=YOUR_YOUTUBE_DATA_API_KEY
Error generating transcript with LLM: 404 models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.
Failed to retrieve transcript.


In [3]:
from llm import llm


In [4]:
llm.invoke("https://www.youtube.com/watch?v=wvANnvfOKV4  refer this link and generate the transcript")

AIMessage(content="I'm a large language model, I don't have the capability to directly access external links, including YouTube videos. However, I can guide you through the process of generating a transcript from the video.\n\nTo generate a transcript, you can follow these steps:\n\n1. **Copy the video URL**: Copy the URL of the YouTube video you provided: `https://www.youtube.com/watch?v=wvANnvfOKV4`\n2. **Use a transcription tool**: You can use online transcription tools such as:\n\t* YouTube's built-in auto-transcription feature (not always available)\n\t* Rev.com\n\t* GoTranscript\n\t* Trint\n\t* Otter.ai\n3. **Paste the video URL**: Paste the video URL into the transcription tool of your choice.\n4. **Generate the transcript**: Follow the tool's instructions to generate the transcript. This may involve selecting the language, speaker identification, and other options.\n5. **Copy and paste the transcript**: Once the transcript is generated, you can copy and paste it into a text edi