<a href="https://colab.research.google.com/github/beruscoder/gen-AI/blob/main/Copy_of_rag_using_langchain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
from getpass import getpass

# SECURITY: never commit an API key to a notebook. Any key that was previously
# stored in this cell must be treated as leaked and revoked in the OpenAI dashboard.
# Reuse a key already present in the environment; otherwise prompt for it so the
# secret is never written into the notebook source or its outputs.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

## Install libraries

In [None]:
!pip install -q youtube-transcript-api langchain-community langchain-openai \
               faiss-cpu tiktoken python-dotenv

In [None]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate

## Step 1a - Indexing (Document Ingestion)

In [None]:
video_id = "Gfr50f6ZBvo"  # only the ID, not full URL

try:
    # Request the English transcript; each returned entry is read via its "text" key below.
    # NOTE(review): newer youtube-transcript-api releases may replace this static call
    # with an instance-based fetch API - confirm against the installed version.
    transcript_list = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])
except TranscriptsDisabled as exc:
    # Fail fast: every later cell needs `transcript`, so continuing after only
    # printing a message would surface as a confusing NameError further down.
    # Any other failure (network, parsing, ...) propagates unchanged.
    raise RuntimeError("No captions available for this video.") from exc

# Flatten the entries to plain text
transcript = " ".join(chunk["text"] for chunk in transcript_list)

# Show a short preview instead of dumping the whole transcript into the output.
print(f"Transcript length: {len(transcript):,} characters")
print(transcript[:500])

ParseError: no element found: line 1, column 0 (<string>)

In [None]:
# Peek at the first few raw transcript entries rather than rendering the whole list.
transcript_list[:5]

## Step 1b - Indexing (Text Splitting)

In [None]:
# Split the flattened transcript into overlapping chunks for embedding.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
# create_documents takes a list of texts; here it is the single transcript string.
chunks = splitter.create_documents([transcript])

In [None]:
# Number of chunk documents produced by the splitter.
len(chunks)

In [None]:
# Inspect one sample chunk. Index 100 is used when it exists; for shorter
# transcripts the index is clamped to the last chunk so the cell does not raise IndexError.
chunks[min(100, len(chunks) - 1)]

## Step 1c & 1d - Indexing (Embedding Generation and Storing in Vector Store)

In [None]:
# Embedding model used to vectorise every chunk.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Embed the chunks and index them in a FAISS vector store.
vector_store = FAISS.from_documents(
    documents=chunks,
    embedding=embeddings,
)

In [None]:
# Show the store's index -> docstore-id mapping (one entry per indexed chunk, presumably).
vector_store.index_to_docstore_id

In [None]:
# No ids are passed when the store is built, so they are generated per run; a literal
# id copied from an earlier session will not exist after a kernel restart.
# Look one up from the live mapping instead.
first_doc_id = vector_store.index_to_docstore_id[0]
vector_store.get_by_ids([first_doc_id])

## Step 2 - Retrieval

In [None]:
# Wrap the vector store as a retriever: plain similarity search, top 4 results.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4},
)

In [None]:
# Display the retriever object to check its configuration.
retriever

In [None]:
# Smoke-test retrieval with a sample query; the retriever is configured with k=4.
retriever.invoke('What is deepmind')

## Step 3 - Augmentation

In [None]:
# Chat model used for answer generation; a low temperature keeps answers close to the context.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.2,
)

In [None]:
# Prompt that restricts the model to the retrieved transcript context.
# Placeholders: {context} receives the retrieved text, {question} the user query.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
      You are a helpful assistant.
      Answer ONLY from the provided transcript context.
      If the context is insufficient, just say you don't know.

      {context}
      Question: {question}
    """,
)

In [None]:
# Example user question for the manual, step-by-step walk-through.
question = "is the topic of nuclear fusion discussed in this video? if yes then what was discussed"

# Fetch the chunks most similar to the question.
retrieved_docs = retriever.invoke(question)

In [None]:
# Inspect the documents returned for the question.
retrieved_docs

In [None]:
# Merge the retrieved chunks into one context string, separated by blank lines.
context_text = "\n\n".join(
    [doc.page_content for doc in retrieved_docs]
)
context_text

In [None]:
# Fill the prompt template with the retrieved context and the user question.
final_prompt = prompt.invoke({"context": context_text, "question": question})

In [None]:
# Inspect the fully rendered prompt before sending it to the model.
final_prompt

## Step 4 - Generation

In [None]:
# Send the rendered prompt to the chat model.
answer = llm.invoke(final_prompt)
# Print only the text content of the model's reply.
print(answer.content)

## Building a Chain

In [None]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

In [None]:
def format_docs(retrieved_docs):
    """Join the ``page_content`` of each retrieved document into one string.

    Args:
        retrieved_docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The page contents concatenated in order, separated by a blank line
        (``"\\n\\n"``); an empty string when no documents are given.
    """
    return "\n\n".join(doc.page_content for doc in retrieved_docs)

In [None]:
# Fan the incoming question out to two branches that run on the same input:
#   context  - retrieve matching chunks and join them into one string
#   question - pass the original question through untouched
parallel_chain = RunnableParallel(
    context=retriever | RunnableLambda(format_docs),
    question=RunnablePassthrough(),
)

In [None]:
# Try the parallel step on its own; it is built with 'context' and 'question' branches.
parallel_chain.invoke('who is Demis')

In [None]:
# Output parser used as the last stage of the chain, after the model.
parser = StrOutputParser()

In [None]:
# Full RAG pipeline: build {context, question} -> render prompt -> call the model -> parse output.
main_chain = parallel_chain | prompt | llm | parser

In [None]:
# Run the end-to-end chain on a sample question.
main_chain.invoke('Can you summarize the video')