In [None]:
# An OpenAI API key is required: set the OPENAI_API_KEY environment variable before running this notebook.

In [None]:
!pip install -q youtube-transcript-api langchain-community langchain-openai faiss-cpu tiktoken python-dotenv

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled

In [None]:
# Step 1a - Indexing (Document ingestion)
video_id = "Gfr50f6ZBvo"
try:
    # Newer youtube-transcript-api releases expose an instance-level `fetch()`
    # and have deprecated/removed the static `get_transcript()`; older releases
    # only offer `get_transcript()`. Support both so the cell works regardless
    # of which version the unpinned install pulled in.
    if hasattr(YouTubeTranscriptApi, "fetch"):
        # `to_raw_data()` yields the same list-of-dicts shape as the legacy call.
        transcript_list = (
            YouTubeTranscriptApi().fetch(video_id, languages=["en"]).to_raw_data()
        )
    else:
        transcript_list = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])
except TranscriptsDisabled:
    print("No caption available")
    # Every later cell needs `transcript`; stop here with the real cause instead
    # of letting the notebook fail further down with a confusing NameError.
    raise
else:
    # Flatten the timed snippets into one plain-text string.
    transcript = " ".join(chunk["text"] for chunk in transcript_list)
    # Show only a short preview; the full transcript would flood the cell output.
    print(f"Transcript length: {len(transcript)} characters")
    print(transcript[:500])


In [None]:
# Preview the first few raw transcript entries instead of dumping the whole list.
transcript_list[:5]

In [None]:
# Step 1b - Indexing (Text splitting)
# Break the transcript into chunks of size 1000 that overlap their neighbours
# by 200, so text cut at a chunk boundary still appears whole in one chunk.
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
chunks = splitter.create_documents(texts=[transcript])

In [None]:
# Inspect the first chunk to sanity-check the splitter output.
chunks[0]

In [None]:
# Step 1c & 1d - Indexing (Embedding Generation and Storing in a Vector Store)
# Embed every chunk with the OpenAI embedding model and index the vectors in FAISS.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
)
vector_store = FAISS.from_documents(
    documents=chunks,
    embedding=embeddings,
)

In [None]:
# Display the store's `index_to_docstore_id` attribute
# (presumably FAISS index position -> docstore id, one entry per chunk - verify).
vector_store.index_to_docstore_id

In [None]:
# Step 2 - Retrieval
# The option is named `search_type` (e.g. "similarity" or "mmr"); an
# unrecognised keyword would not select the search strategy.
# k=4 asks for the four closest chunks per query.
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 4})

In [None]:
# Display the retriever object to check how it is configured.
retriever

In [None]:
# Smoke-test retrieval with a sample query; the cell output is the retrieved documents.
retriever.invoke("What is deepmind")

In [None]:
# Step 3 - Augmentation
# Chat model that will answer from the retrieved context in the generation step.
llm = ChatOpenAI(
    temperature=0.2,
    model="gpt-4o-mini",
)

In [None]:
# Prompt that restricts the model to the retrieved transcript context and tells
# it to admit when that context is not enough. `{context}` and `{question}`
# are filled in at invoke time.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
    You are a helpful assistant.
      Answer ONLY from the provided transcript context.
      If the context is insufficient, just say you don't know.

      {context}
      Question: {question}
    """,
)

In [None]:
# Sample user question, then fetch the transcript chunks the retriever matches to it.
question = "is the topic of nuclear fussion discussed in video? if yes then what was discussed"
retrieved_docs = retriever.invoke(question)

In [None]:
# Inspect the documents retrieved for `question`.
retrieved_docs

In [None]:
# Concatenate the text of the retrieved documents, separated by blank lines,
# to form the context that goes into the prompt.
context_text = "\n\n".join([document.page_content for document in retrieved_docs])

In [None]:
# Fill the prompt template with the retrieved context and the user's question.
final_prompt = prompt.invoke(
    {
        "question": question,
        "context": context_text,
    }
)

In [None]:
# Step 4 - Generation
# Send the filled-in prompt to the chat model and print the text of its reply.
answer = llm.invoke(final_prompt)
print(answer.content)

In [None]:
# Building a chain
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

In [None]:
def format_docs(retrieved_docs):
    """Join the text of retrieved documents into a single context string.

    Args:
        retrieved_docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values separated by blank lines; an empty string
        when no documents are given.
    """
    # The return must live inside the function body so the joined text is
    # actually handed back to the caller.
    return "\n\n".join(doc.page_content for doc in retrieved_docs)

In [None]:
# Build both prompt inputs from the single question string passed to the chain:
#   'context'  -> retrieve matching chunks, then flatten them to plain text
#   'question' -> forward the input unchanged
parallel_chain = RunnableParallel({
    'context': retriever | RunnableLambda(format_docs),
    # Must be an instance: the bare class would be treated as a callable and
    # invoked with the question instead of passing it through.
    'question': RunnablePassthrough(),
})

In [None]:
# Run the parallel step on its own with a sample question to inspect its output.
parallel_chain.invoke('who is Demis')

In [None]:
# Output parser used as the last step of the chain
# (presumably reduces the model's message to its plain string content - verify).
parser = StrOutputParser()

In [None]:
# Full RAG pipeline: build the prompt inputs -> fill the prompt -> call the LLM -> parse the output.
main_chain = parallel_chain | prompt | llm | parser

In [None]:
# Run the whole chain end to end on a sample request.
main_chain.invoke('Can you summarize the video')