# RAG application (YouTube Transcript)


In [None]:
import os
from dotenv import load_dotenv

# Load environment variables (e.g. from a local .env file) before reading any keys.
load_dotenv()

# Using OpenAI: uncomment to read the API key from the environment.
# OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Using the local model: name of the model served through Ollama.
MODEL = "gemma3"

# This is the YouTube video we're going to use.
# NOTE(review): in the cells shown here it is only referenced by the
# commented-out transcript-download code; the active cells read Transcript.txt.
YOUTUBE_VIDEO = "https://www.youtube.com/watch?v=9vM4p9NN0Ts&t=1s"

## Setting up the model

If using the OpenAI model

In [None]:
# from langchain_openai import ChatOpenAI

# model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model="gpt-3.5-turbo", temperature=0)

If using a local model

In [2]:
from langchain_community.llms import Ollama

# Use the MODEL constant from the setup cell (run that cell first) so the
# model name is configured in a single place. temperature=0 keeps the answers
# as repeatable as possible.
model = Ollama(model=MODEL, temperature=0)

Testing

In [3]:
# Smoke test: send a plain-string prompt straight to the model.
model.invoke("What MLB team won the World Series during the COVID-19 pandemic?")

'The **Los Angeles Dodgers** won the World Series during the 2020 MLB season, which was significantly impacted by the COVID-19 pandemic. \n\nThe 2020 season was played in a bubble environment at Spring Training facilities in Arizona.'

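With a chat model such as `ChatOpenAI`, the result is an `AIMessage` instance containing the answer; the local `Ollama` model used here already returns a plain string, as the output above shows. In both cases we can get the answer as a string by chaining the model with an [output parser](https://python.langchain.com/docs/how_to/#output-parsers).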

For this example, we'll use a simple `StrOutputParser` to extract the answer as a string.

In [4]:
from langchain_core.output_parsers import StrOutputParser

# Parser that returns the model output as a plain string.
parser = StrOutputParser()

# `|` pipes the output of the model into the parser.
chain = model | parser
chain.invoke("What MLB team won the World Series during the COVID-19 pandemic?")

'The **Los Angeles Dodgers** won the World Series during the 2020 MLB season, which was significantly impacted by the COVID-19 pandemic. \n\nThe 2020 season was played in a bubble environment at Spring Training facilities in Arizona.'

## Using prompt templates

In [5]:
# Import from langchain_core (already used for the output parser) rather than
# the legacy `langchain.prompts` re-export.
from langchain_core.prompts import ChatPromptTemplate

# {context} and {question} are placeholders filled in when the prompt is
# formatted or invoked. The "I don't know" instruction keeps the model from
# answering questions the context does not cover.
template = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# Render the prompt once to see exactly what would be sent to the model.
prompt.format(context="Mary's sister is Susana", question="Who is Mary's sister?")

'Human: \nAnswer the question based on the context below. If you can\'t \nanswer the question, reply "I don\'t know".\n\nContext: Mary\'s sister is Susana\n\nQuestion: Who is Mary\'s sister?\n'

In [6]:
# Rebuild the chain so the prompt template runs first. The keys of the input
# dict fill the {context} and {question} placeholders of the template.
chain = prompt | model | parser
chain.invoke({
    "context": "Mary's sister is Susana",
    "question": "Who is Mary's sister?"
})

'Susana.'

## Combining chains

We can combine different chains to create more complex workflows. Here we create a second chain that translates the answer from the first chain into a different language.

In [7]:
# NOTE: the template must be a plain string, not an f-string. An f-string
# would try to evaluate {answer} and {language} right here (neither name is
# defined yet) instead of leaving them as placeholders for the prompt
# template to fill in when the chain is invoked.
translation_prompt = ChatPromptTemplate.from_template(
    "Translate {answer} to {language}"
)

We can now create a new translation chain that combines the result from the first chain with the translation prompt.

In [8]:
from operator import itemgetter

# The leading dict builds the inputs of the translation prompt:
#   "answer"   - produced by running the first chain on the incoming dict
#   "language" - picked out of the incoming dict with itemgetter
translation_chain = (
    {"answer": chain, "language": itemgetter("language")} | translation_prompt | model | parser
)

# The input carries the keys for the first chain ("context", "question")
# plus the target "language" for the translation step.
translation_chain.invoke(
    {
        "context": "Mary's sister is Susana. She doesn't have any more siblings.",
        "question": "Count the number of sisters does Mary have?",
        "language": "Spanish",
    }
)

'The translation of "One" to Spanish depends on the context. Here are the most common translations:\n\n*   **Uno:** This is the most common and general translation for "one" as a number.\n*   **Un:** This is the masculine singular indefinite article ("a" or "an") and is often used when referring to "one person."  For example, "Un hombre" (One man).\n\n**Therefore, the best translation is usually "Uno".**\n\nCould you provide more context if you\'d like a more specific translation?'

## Transcribing the YouTube Video

The context we want to send the model comes from a YouTube video. We can obtain its transcript in several ways: download the audio and transcribe it with [OpenAI's Whisper](https://openai.com/research/whisper), fetch the captions with the `youtube_transcript_api` package, or use LangChain's `YoutubeLoader`.

Using OpenAI's Whisper

In [14]:
# import tempfile
# import whisper
# from pytube import YouTube


# # Let's do this only if we haven't created the transcription file yet.
# if not os.path.exists("transcription.txt"):
#     youtube = YouTube(YOUTUBE_VIDEO)
#     audio = youtube.streams.filter(only_audio=True).first()

#     # Let's load the base model. This is not the most accurate
#     # model but it's fast.
#     whisper_model = whisper.load_model("base")

#     with tempfile.TemporaryDirectory() as tmpdir:
#         file = audio.download(output_path=tmpdir)
#         transcription = whisper_model.transcribe(file, fp16=False)["text"].strip()

#         with open("transcription.txt", "w") as file:
#             file.write(transcription)

Using the `youtube_transcript_api` package

In [None]:
# from youtube_transcript_api import YouTubeTranscriptApi

# def extract_video_id(url):
#     if "watch?v=" in url:
#         return url.split("watch?v=")[1].split("&")[0]
#     if "youtu.be/" in url:
#         return url.split("youtu.be/")[1].split("?")[0]
#     return None

# video_id = extract_video_id(YOUTUBE_VIDEO)

# print("Video ID:", video_id)

# try:
#     transcript_data = YouTubeTranscriptApi.get_transcript(video_id)
#     full_text = " ".join([seg["text"] for seg in transcript_data])

#     with open("transcription.txt", "w", encoding="utf-8") as f:
#         f.write(full_text)

#     print("Transcript saved successfully!")

# except Exception as e:
#     print("Error:", e)

# loader = YoutubeLoader.from_youtube_url(YOUTUBE_VIDEO, add_video_info=False)
# docs = loader.load()

# # Save to transcription.txt
# transcription = "\n".join([doc.page_content for doc in docs])

# with open("transcription.txt", "w", encoding="utf-8") as f:
#     f.write(transcription)

# print("Transcript saved to transcription.txt")

Using LangChain's `YoutubeLoader`

In [None]:
# from langchain_community.document_loaders import YoutubeLoader

# loader = YoutubeLoader.from_youtube_url(
#     YOUTUBE_VIDEO,
#     add_video_info=False,
#     language=["en"],   # request English subtitles
# )

# docs = loader.load()

# transcription = "\n".join([doc.page_content for doc in docs])

# with open("transcription.txt", "w", encoding="utf-8") as f:
#     f.write(transcription)

# print("Transcript saved successfully!")


Due to a compatibility issue, the YouTube transcript was generated outside this notebook.

In [None]:
from youtube_transcript_api import YouTubeTranscriptApi

# Inspect which attributes the installed version of the class exposes.
print(dir(YouTubeTranscriptApi))

# The listing contains no 'get_transcript', 'get_transcripts' or
# 'list_transcripts'; the only public names are 'fetch' and 'list'.

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'fetch', 'list']


Accessing the Transcript.txt file directly

In [25]:
# Read the externally generated transcript. The encoding is passed explicitly
# so the text decodes the same way on every platform instead of depending on
# the locale default.
# NOTE(review): assumes Transcript.txt was saved as UTF-8 - confirm.
with open("Transcript.txt", encoding="utf-8") as transcript_file:
    transcription = transcript_file.read()

# Preview the first 100 characters.
transcription[:100]

"So, let's get started. So I'll be talking about building LLMs today. So I think a lot of you have he"

## Using the entire transcription as context

If we try to invoke the chain using the entire transcription as context, the model may return an error because the context is too long.

Large Language Models support limited context sizes. The video we are using may be too long for the model to handle, so we need to find a different solution.

In [26]:
# Send the whole transcription as context. A model with a small context
# window may reject it; report either outcome so the cell always shows what
# happened instead of silently discarding a successful answer.
try:
    answer = chain.invoke({
        "context": transcription,
        "question": "Is reading papers a good idea?"
    })
except Exception as e:
    # Deliberately broad: the exception type depends on the model backend.
    print(e)
else:
    print(answer)

## Splitting the transcription

Since we can't use the entire transcription as the context when it is too large for the model (as can happen with OpenAI models), a potential solution is to split the transcription into smaller chunks. We can then invoke the model using only the relevant chunks to answer a particular question:

In [27]:
from langchain_community.document_loaders import TextLoader

# Load the transcript as LangChain documents. The encoding is passed
# explicitly so loading does not depend on the platform's default encoding.
# NOTE(review): assumes Transcript.txt was saved as UTF-8 - confirm.
loader = TextLoader("Transcript.txt", encoding="utf-8")
text_documents = loader.load()
text_documents

[Document(metadata={'source': 'Transcript.txt'}, page_content='So, let\'s get started. So I\'ll be talking about building LLMs today. So I think a lot of you have heard of LLMs before, but just as a quick recap. LLMs standing for large language models are basically all the chat bots that you\'ve been hearing about recently. So, ChatGPT, from OpenAI, Claude, from Anthropic, Gemini and Llama, and other types of models like this. And today we\'ll be talking about how do they actually work. So it\'s going to be an overview because it\'s only one lecture and it\'s hard to compress everything. But hopefully, I\'ll touch a little bit about all the components that are needed to train some of these LLMs. Also, if you have questions, please interrupt me and ask if you have a question. Most likely other people in the room or on Zoom have other. Have the same questions. So, please ask. Great. So what matters when training LLMs. So there are a few key components that matter. One is the architecture

There are many different ways to split a document. For this example, we'll use a simple splitter that splits the document into chunks of a fixed size.

For this example, let's split the transcription into chunks of 1000 characters with an overlap of 20 characters and display the resulting chunks:

In [29]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split into chunks of at most 1000 with an overlap of 20 between consecutive
# chunks (sizes presumably measured in characters - the splitter's default).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)

# `documents` is what gets embedded and indexed into the transcript vector
# store later in the notebook.
documents = text_splitter.split_documents(text_documents)
documents

[Document(metadata={'source': 'Transcript.txt'}, page_content="So, let's get started. So I'll be talking about building LLMs today. So I think a lot of you have heard of LLMs before, but just as a quick recap. LLMs standing for large language models are basically all the chat bots that you've been hearing about recently. So, ChatGPT, from OpenAI, Claude, from Anthropic, Gemini and Llama, and other types of models like this. And today we'll be talking about how do they actually work. So it's going to be an overview because it's only one lecture and it's hard to compress everything. But hopefully, I'll touch a little bit about all the components that are needed to train some of these LLMs. Also, if you have questions, please interrupt me and ask if you have a question. Most likely other people in the room or on Zoom have other. Have the same questions. So, please ask. Great. So what matters when training LLMs. So there are a few key components that matter. One is the architecture. So as 

## Finding the relevant chunks

Given a particular question, we need to find the relevant chunks from the transcription to send to the model. Here is where the idea of **embeddings** comes into play.

To provide the model with the most relevant chunks, we can use the embeddings of the question and the chunks of the transcription to compute the similarity between them. We can then select the chunks with the highest similarity to the question and use them as the context for the model:

In [33]:
# from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import OllamaEmbeddings


# Using the OpenAI model embeddings
# embeddings = OpenAIEmbeddings()
# embedded_query = embeddings.embed_query("Who is Mary's sister?")

# Using the local model embeddings
embeddings = OllamaEmbeddings(model="nomic-embed-text")

# Embed a sample question; it is compared against other sentences below.
embedded_query = embeddings.embed_query("Who is Mary's sister?")

# Show the size of the embedding vector and its first 10 values.
print(f"Embedding length: {len(embedded_query)}")
print(embedded_query[:10])

Embedding length: 768
[1.8528720140457153, -0.004568893928080797, -3.836364507675171, -0.2580825090408325, -1.0327272415161133, 0.5219928026199341, -1.4719003438949585, -0.8365224003791809, 0.1617519110441208, -0.0577859990298748]


Just as an example, let's first generate the embeddings for two different sentences:

In [34]:
# Embed two candidate sentences with the same embedding model as the query:
# one that answers the sample question and one that is unrelated to it.
sentence1 = embeddings.embed_query("Mary's sister is Susana")
sentence2 = embeddings.embed_query("Pedro's mother is a teacher")

We can now compute the similarity between the query and each of the two sentences using the cosine similarity. The closer the embeddings are, the more similar the sentences will be.

In [35]:
from sklearn.metrics.pairwise import cosine_similarity

# Score each candidate sentence against the query embedding. The vectors are
# wrapped in single-item lists to form the 2-D inputs passed to
# cosine_similarity, and [0][0] pulls the lone score out of the result.
query_sentence1_similarity, query_sentence2_similarity = [
    cosine_similarity([embedded_query], [candidate])[0][0]
    for candidate in (sentence1, sentence2)
]

# Display both scores as the cell result.
query_sentence1_similarity, query_sentence2_similarity

(0.8915645388794463, 0.5288407885148655)

## Setting up a Vector Store

We need an efficient way to store document chunks, their embeddings, and perform similarity searches at scale. To do this, we'll use a **vector store**.

In [37]:
from langchain_community.vectorstores import DocArrayInMemorySearch

# Small in-memory vector store over a handful of toy sentences; each text is
# embedded with the `embeddings` model created earlier.
vectorstore1 = DocArrayInMemorySearch.from_texts(
    [
        "Mary's sister is Susana",
        "John and Tommy are brothers",
        "Patricia likes white cars",
        "Pedro's mother is a teacher",
        "Lucia drives an Audi",
        "Mary has two siblings",
    ],
    embedding=embeddings,
)

We can now query the vector store to find the most similar embeddings to a given query:

In [38]:
# Ask for the 3 texts closest to the query, each paired with its score.
vectorstore1.similarity_search_with_score(query="Who is Mary's sister?", k=3)

[(Document(page_content="Mary's sister is Susana"), 0.7888652762156009),
 (Document(page_content='Mary has two siblings'), 0.7856958112152003),
 (Document(page_content='John and Tommy are brothers'), 0.5587133198898103)]

## Connecting the vector store to the chain

We can use the vector store to find the most relevant chunks from the transcription to send to the model.

We need to configure a [Retriever](https://python.langchain.com/docs/how_to/#retrievers). The retriever will run a similarity search in the vector store and return the most similar documents back to the next step in the chain.

In [39]:
# Expose the vector store as a retriever so it can be used as a chain step.
retriever1 = vectorstore1.as_retriever()

# Invoking it with a query returns the matching documents (without scores).
retriever1.invoke("Who is Mary's sister?")

[Document(page_content="Mary's sister is Susana"),
 Document(page_content='Mary has two siblings'),
 Document(page_content='John and Tommy are brothers'),
 Document(page_content="Pedro's mother is a teacher")]

Our prompt expects two parameters, "context" and "question." We can use the retriever to find the chunks we'll use as the context to answer the question.

We can create a map with the two inputs by using the [`RunnableParallel`](https://python.langchain.com/docs/how_to/parallel/) and [`RunnablePassthrough`](https://python.langchain.com/docs/how_to/passthrough/) classes. This will allow us to pass the context and question to the prompt as a map with the keys "context" and "question."

In [40]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

# Build the dict the prompt expects from a single input string:
#   context  - documents returned by the retriever for that string
#   question - the input string itself, passed through unchanged
setup = RunnableParallel(context=retriever1, question=RunnablePassthrough())
setup.invoke("What color is Patricia's car?")

{'context': [Document(page_content='Patricia likes white cars'),
  Document(page_content='Lucia drives an Audi'),
  Document(page_content="Pedro's mother is a teacher"),
  Document(page_content="Mary's sister is Susana")],
 'question': "What color is Patricia's car?"}

Let's now add the setup map to the chain and run it:



In [41]:
# Full retrieval pipeline: build {context, question}, fill the prompt,
# call the model, then parse the output into a string.
chain = setup | prompt | model | parser
chain.invoke("What color is Patricia's car?")

'White.'

Another example:

In [42]:
# The chain now takes just the question; the context is retrieved for it.
chain.invoke("What car does Lucia drive?")

'Lucia drives an Audi.'

## Loading transcription into the vector store

In [43]:
# Index the transcript chunks in `documents` in a second in-memory store,
# embedding them with the same `embeddings` model.
vectorstore2 = DocArrayInMemorySearch.from_documents(documents, embeddings)

Let's set up a new chain using the correct vector store.

In [None]:
# Same pipeline as the toy example, but the context is now retrieved from the
# transcript vector store. The input map is written as an explicit
# RunnableParallel: the retriever supplies "context" and the raw input string
# is passed through as "question".
chain = (
    RunnableParallel(
        context=vectorstore2.as_retriever(),
        question=RunnablePassthrough(),
    )
    | prompt
    | model
    | parser
)
chain.invoke("What is LLM?")

"LLMs standing for large language models are basically all the chat bots that you've been hearing about recently. So, ChatGPT, from OpenAI, Claude, from Anthropic, Gemini and Llama, and other types of models like this."

Another example:

In [46]:
# A question unrelated to the lecture: the prompt tells the model to reply
# "I don't know" when the context does not contain the answer.
chain.invoke("What is photosynthesis?")

"I don't know."

In [47]:
# A specific question that has to be answered from the retrieved transcript chunks.
chain.invoke("In Byte Pair Encoding (BPE), what is the key step taken when training a tokenizer?")

'In Byte Pair Encoding (BPE), the key step taken when training a tokenizer is to merge the most common pairs of tokens. Specifically, the process involves repeatedly identifying and merging frequently occurring pairs of tokens within a large corpus of text.'

In [48]:
# The answer depends entirely on which chunks the retriever returns; if they
# do not cover the question, the prompt makes the model reply "I don't know".
chain.invoke("What is Gpt2 or Gpt3?")

"I don't know."