In [None]:
import bs4, tiktoken, numpy as np, os, datetime
from typing import Literal, Optional, Tuple
from youtube_transcript_api import YouTubeTranscriptApi

from pydantic import BaseModel, Field
from langchain import hub
from langchain_groq import ChatGroq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import ChatPromptTemplate
from langchain_huggingface import HuggingFaceEmbeddings

from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import WebBaseLoader, YoutubeLoader

from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

from dotenv import load_dotenv
# Load variables from a local .env file (python-dotenv) before they are read below.
load_dotenv()

# Subscript access is used on purpose: a missing variable raises KeyError here,
# at the top of the notebook, rather than surfacing later as a None key.
learn_api_key = os.environ["LANGSMITH_API_KEY"]
openai_api_key = os.environ["OPENAI_API_KEY"]
groq_api_key = os.environ["GROQ_API_KEY"]

In [None]:
from youtube_transcript_api import YouTubeTranscriptApi

# Fetch the English transcript of one video and flatten it into a single string.
video_id = "a8FTr2qMutA"
transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])

# Each transcript entry is indexed by "text"; the pieces are joined with spaces.
docs = " ".join(segment["text"] for segment in transcript)

# Preview only the first 500 characters instead of the whole transcript.
print(docs[:500])

## Part 1: Overview

In [None]:
### INDEXING ###

# Load Documents
# Fetch the transcript for one video and flatten it into a single string.
video_id = "Ks-_Mh1QhMc"
transcript = YouTubeTranscriptApi.get_transcript(video_id)

# Each transcript entry is indexed by "text"; the pieces are joined with spaces.
docs = " ".join(segment["text"] for segment in transcript)

# Bare last expression: the cell displays the first 500 characters as its output.
docs[:500]

In [None]:
# NOTE(review): the class docstring and the Field descriptions below are part of
# the schema handed to `llm.with_structured_output(TutorialSearch)`, so they
# presumably act as prompt text for the model -- edit their wording with care.
class TutorialSearch(BaseModel):
    """Search over a database of tutorial videos about a software library."""

    # Required free-text queries.
    content_search: str = Field(
        ...,
        description="Similarity search query applied to video transcripts."
    )
    title_search: str = Field(
        ...,
        description=(
            "Alternate version of the content search query to apply to video titles. "
            "Should be succint and only include key words that could be in a video title."
        ),
    )

    # Optional range filters; None means "no constraint requested".
    # Lower bounds are described as inclusive, upper bounds as exclusive.
    min_view_count: Optional[int] = Field(
        None,
        description="Minimum view count filter, inclusive. Only use if explicitly specified."
    )
    max_view_count: Optional[int] = Field(
        None,
        description="Maximum view count filter, exclusive. Only use if explicitly specified."
    )
    # NOTE(review): the two publish-date filters are typed as int although the
    # descriptions speak of dates -- the intended encoding (year? YYYYMMDD?
    # timestamp?) is not stated here; confirm before filtering on them.
    earliest_publish_date: Optional[int] = Field(
        None,
        description="Earliest publish date filter, inclusive. Only use if explicitly specified."
    )
    latest_publish_date: Optional[int] = Field(
        None,
        description="Latest publish date filter, exclusive. Only use if explicitly specified."
    )
    min_length_sec: Optional[int] = Field(
        None,
        description="Minimum video length in seconds, inclusive. Only use if explicitly specified."
    )
    max_length_sec: Optional[int] = Field(
        None,
        description="Maximum video length in seconds, exclusive. Only use if explicitly specified."
    )

    def pretty_print(self) -> None:
        """Print each populated field as a ``name: value`` line.

        A field is skipped when its value is ``None`` or equal to the default
        declared for it, so only the parts of the query that were actually
        filled in are shown.
        """
        # Read the field table from the instance's own class rather than from
        # TutorialSearch by name, so a subclass that adds fields has them
        # printed as well.
        for name, field_info in type(self).model_fields.items():
            value = getattr(self, name)  # looked up once per field
            if value is not None and value != getattr(field_info, "default", None):
                print(f"{name}: {value}")

In [None]:
# System prompt for the query-analysis step. A trailing backslash inside the
# literal joins that line with the next one; the leading spaces of the
# continuation lines remain part of the prompt text.
system = """You are an expert at converting user questions into database queries. \
    You have access to a database of tutorial videos about a software library for building LLM-powered applications. \
    Given a question, return a database query optimized to retrieve the most relevant results.
    
    If there are acronyms or words you are not familiar with, do not try to rephrase them."""

# Two-message template: the fixed system prompt, then the user's question.
prompt = ChatPromptTemplate.from_messages([("system", system), ("human", "{question}")])

# Groq-hosted chat model, called with temperature 0.
llm = ChatGroq(
    model="gemma2-9b-it",
    api_key=groq_api_key,
    temperature=0,
)

# Bind the TutorialSearch schema to the model, then pipe the prompt into it.
structured_llm = llm.with_structured_output(TutorialSearch)
query_analyzer = prompt | structured_llm

In [None]:
# Example 1: a bare topic with no filters stated in the question.
query_analyzer.invoke(
    {"question": "rag from scratch"}
).pretty_print()

In [None]:
# Example 2: the question names a publish year.
query_analyzer.invoke({"question": "videos on chat langchain published in 2023"}).pretty_print()

In [None]:
# Example 3: the question puts an upper bound on the publish date ("before 2024").
query_analyzer.invoke(
    dict(question="videos that are focused on the topic of chat langchain that are published before 2024")
).pretty_print()

In [None]:
# Example 4: the question states a maximum video length ("under 5 minutes").
# "multi-modal" was previously misspelled "mutli-modal"; because the system
# prompt tells the model not to rephrase unfamiliar words, the misspelling
# would likely be carried into the generated search query.
query_analyzer.invoke(
    {"question": "how to use multi-modal models in an agent, only videos under 5 minutes"}
).pretty_print()