In [2]:
!pip install langchain chromadb faiss-cpu  tiktoken  langchain-community  langchain_huggingface



In [3]:
!pip install -q youtube-transcript-api

In [4]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint, ChatHuggingFace
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate

# S1) Indexing
1) Document Ingestion
2) Text Chunking
3) Embeddings generation
4) Storing Embeddings in Vector Stores

# Step 1a - Indexing (Document Ingestion)

In [6]:
from youtube_transcript_api._errors import TranscriptsDisabled

video_id = "84ZLMbHefJI"

try:
    fetched_transcript = YouTubeTranscriptApi().fetch(video_id, languages=['en'])
    transcript_list = fetched_transcript.to_raw_data()
    #print(transcript_list)
    transcript = " ".join(chunk["text"] for chunk in transcript_list)
    #print(transcript)
    print("Transcripts fetched successfully")

except TranscriptsDisabled:
    print("No captions available for this video.")
except Exception as e:
    print(f"An error occurred: {e}")

Transcripts fetched successfully


# Step 1b) Indexing (Text Splitting)

In [7]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size = 400,
    chunk_overlap = 120
)
chunks = splitter.create_documents([transcript])

In [8]:
print(len(transcript))
print(len(chunks))

19234
69


In [9]:
chunks[0]

Document(metadata={}, page_content="what is going on everyone in this video we are going to be talking about the difference between latency and throughput in the context of system design now latency and throughput are two very important concepts in terms of measuring the performance of a system so in this video we're going to kind of go over what latency and throughput are and then contrast the differences between the two now first")

# Step 1c & 1d Indexing (Embedding generation and Storing Embeddigs in vector stores)

In [18]:
!pip install sentence-transformers



In [19]:
!pip install --upgrade --force-reinstall numpy

Collecting numpy
  Downloading numpy-2.3.4-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/62.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-2.3.4-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.6/16.6 MB[0m [31m121.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=

In [10]:
from langchain_huggingface import HuggingFaceEmbeddings
embeddingModel = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [11]:
vectorStore = FAISS.from_documents(chunks, embeddingModel)

In [12]:
vectorStore.index_to_docstore_id

{0: 'fc266e95-4c9f-448f-a340-b5dbdbcce688',
 1: '3e434225-b7cc-41c0-b269-670ab9e09e0b',
 2: 'ab0c6121-3c32-441b-a114-a5f86c727e9e',
 3: '9633a7a3-2290-405a-ac46-2f7ce327245a',
 4: '22894569-cba2-4284-87a3-4af76597b7fd',
 5: 'da1b3d00-38e7-4080-9ed1-eee9cb988dbe',
 6: '760c9c98-428c-4382-b06b-855715c436fc',
 7: '63d98e28-e6b6-4886-9e90-56f90ac4c804',
 8: 'd954beaa-c563-4c00-97c8-9ede8de0d491',
 9: '14e4d0f4-0fd8-4c51-bcab-3df1c42c0e3f',
 10: '7825834a-ae00-4c91-84bb-b1e5d86a3152',
 11: '19cc8bef-63f6-4b9e-b312-fab324bd1589',
 12: '142ba02c-6995-4a40-95b7-a132a46480a1',
 13: '531d44a2-a77d-4233-b871-e3b9c865df95',
 14: 'f137ef16-09e2-4d82-84bf-aa6652382c29',
 15: 'e354ba2a-6628-4beb-875d-93916a745b28',
 16: 'fbba07b4-5490-4d70-b594-51d3435347cd',
 17: 'ea7a1f6d-3947-44f6-b7bb-40982ee83c55',
 18: '16b20015-1870-41b3-a083-1bc8e4a18c1f',
 19: '4bc7cbda-1678-463a-95eb-f4da9b838a28',
 20: 'a0cd4561-3552-4883-8b6f-f8c92882b0cb',
 21: '77f331c0-129c-4ccb-944b-dcf0ec4c7bf7',
 22: 'd13e68c0-8467-

In [13]:
vectorStore.get_by_ids(['a544c5ca-bc2c-4865-8b3d-cdfed166df9f',])

[]

# Step 2) Retrieval

In [14]:
retriever = vectorStore.as_retriever(
    search_type = "mmr",
    search_kwargs = {"k": 4}
)

In [15]:
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7d1eaca41610>, search_type='mmr', search_kwargs={'k': 4})

In [16]:
retriever.invoke("What is Latency")

[Document(id='22894569-cba2-4284-87a3-4af76597b7fd', metadata={}, page_content="in the context of system design that helps you really grapple the concept so let's say this is our definition let me grab a different color here and okay so let's formalize the definition of latency to be the amount of time amounts of time of time for a packet a packet to be transferred to be transferred yes ford across a network across a network so there's a couple key concepts here and i want"),
 Document(id='a0cd4561-3552-4883-8b6f-f8c92882b0cb', metadata={}, page_content="and finally it'll return all that information back to the client to say either return some data or return nothing just to acknowledge that it's complete so all of these different things say this took 25 milliseconds 25 milliseconds maybe this took 100 milliseconds and then 50 milliseconds to finally go back i consider latency to be the sum of all of these things so 50 plus 25 plus 25 that's 100"),
 Document(id='fbba07b4-5490-4d70-b594-

# Step 3) Augmentation (Context + Query)

In [27]:
llm = HuggingFaceEndpoint(
    repo_id= "Qwen/Qwen3-4B-Instruct-2507",
    task="text-generation",
    temperature=0.3
)

model = ChatHuggingFace(llm=llm)
#"HuggingFaceH4/zephyr-7b-beta",

In [18]:
prompt = PromptTemplate(
    template="""
      You are a helpful assistant.
      Answer ONLY from the provided transcript context.
      If the context is insufficient, just say you don't know.

      {context}
      Question: {question}
    """,
    input_variables = ['context', 'question']
)

In [19]:
ques = "Is the topic of latency discussed in this video? If yes then what was discussed?"
retrievedDocs = retriever.invoke(ques)

In [20]:
retrievedDocs

[Document(id='fc266e95-4c9f-448f-a340-b5dbdbcce688', metadata={}, page_content="what is going on everyone in this video we are going to be talking about the difference between latency and throughput in the context of system design now latency and throughput are two very important concepts in terms of measuring the performance of a system so in this video we're going to kind of go over what latency and throughput are and then contrast the differences between the two now first"),
 Document(id='8c7f2c9f-2a54-4c01-a570-da4a74d8e6f6', metadata={}, page_content="99th percentile so it would be something over here so p99 and there's also a third one which not very useful but it's it's for completion sake which is p100 which is basically whatever the highest uh latency is for your set of data points that's what your p100 would be now these are much more useful terms um when talking about latency because they really give you a better idea of worst case of"),
 Document(id='f137ef16-09e2-4d82-84bf

In [21]:
contextText = "\n\n".join(doc.page_content for doc in retrievedDocs)
contextText

"what is going on everyone in this video we are going to be talking about the difference between latency and throughput in the context of system design now latency and throughput are two very important concepts in terms of measuring the performance of a system so in this video we're going to kind of go over what latency and throughput are and then contrast the differences between the two now first\n\n99th percentile so it would be something over here so p99 and there's also a third one which not very useful but it's it's for completion sake which is p100 which is basically whatever the highest uh latency is for your set of data points that's what your p100 would be now these are much more useful terms um when talking about latency because they really give you a better idea of worst case of\n\nthere is network latency so there's network latency now work and network latency is just things like forming an https handshake it can also be if there's multiple hops so for instance if there was

In [22]:
mainPrompt = prompt.format(
    context=contextText,
    question=ques
)

In [23]:
mainPrompt

"\n      You are a helpful assistant.\n      Answer ONLY from the provided transcript context.\n      If the context is insufficient, just say you don't know.\n\n      what is going on everyone in this video we are going to be talking about the difference between latency and throughput in the context of system design now latency and throughput are two very important concepts in terms of measuring the performance of a system so in this video we're going to kind of go over what latency and throughput are and then contrast the differences between the two now first\n\n99th percentile so it would be something over here so p99 and there's also a third one which not very useful but it's it's for completion sake which is p100 which is basically whatever the highest uh latency is for your set of data points that's what your p100 would be now these are much more useful terms um when talking about latency because they really give you a better idea of worst case of\n\nthere is network latency so t

# Step 4) Generation

In [28]:
response = model.invoke(mainPrompt)
print(response.content)


Yes, the topic of latency is discussed in this video. 

What was discussed:
- Latency and throughput are two important concepts in measuring system performance.
- Latency refers to the time delay between when a request is made and when a response is received.
- It includes network latency, such as forming an HTTPS handshake, multiple hops in a network path, or delays due to peripheral nodes.
- Various terms related to latency are mentioned, including ping, lag (commonly in gaming), connection quality, delay, and network delay.
- Percentile measures like p99 and p100 are discussed as useful metrics for understanding worst-case latency scenarios.


# Chain Formation

In [34]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

In [30]:
def format(retrievedDocs):
  contextTextt = "\n\n".join(doc.page_content for doc in retrievedDocs)
  return contextTextt

In [31]:
parallelChain = RunnableParallel({
    'context' : retriever | RunnableLambda(format),
    'question' : RunnablePassthrough()
})

In [32]:
parallelChain.invoke("what is differnce between latency and throughput")

{'context': "going to kind of go over what latency and throughput are and then contrast the differences between the two now first i want to talk about the concept of latency and what it refers to now when people are talking about latency sometimes you hear other synonymous terms or terms that are used to describe latency sometimes you hear people say the speed of an application or the speed of a connection\n\nnetwork latency is often considered to be overhead processing is something that you have more finite control over and with throughput throughput just corresponds to how much so a lot of people like to think of it as a pipe how much information you can fit through a pipe uh you can build a bigger pipe and in the context of this kind of um world of client server model that pipe corresponds to the\n\nper unit time which is second in this case and this is usually what people measure their throughput as now typically in the client server model it's kind of represented through a term ca

In [36]:
parser = StrOutputParser()

In [37]:
chain = parallelChain | prompt | model | parser

In [38]:
chain.invoke("Can You Summarize the video")

'The video discusses the difference between latency and throughput in the context of system design. It explains that latency refers to the time it takes for a system to respond to a request, while throughput measures the amount of work a system can process over time. The video also introduces p99, a statistical measure that represents the 99th percentile of response times, highlighting the fastest 99% of responses. Additionally, it briefly touches on the client-server model, where a client requests information and a server provides it. The video aims to help viewers understand these key performance metrics in system design and encourages viewers to check out other related videos.'