# RAG from YT Notes

## Goal
The primary goal of this project is to create a Retrieval-Augmented Generation (RAG) system. This system will utilize notes generated from YouTube video transcripts as its data source for retrieval, enabling effective question answering.

## Process
The RAG system follows a multi-stage process:
1. **Tokenization/Embedding**: Transcripts are first tokenized and embedded to convert text into numerical representations.
2. **Chunking**: The text is divided into smaller, manageable chunks (in the code below, chunking is applied before the chunks are embedded).
3. **Query Expansion/Re-query**: User queries are expanded or re-queried to improve retrieval effectiveness.
4. **Reranking**: Retrieved documents are reranked to prioritize the most relevant information.
5. **Generation**: Finally, a language model generates answers based on the reranked, retrieved documents.

## Stack
The project leverages the following technologies:
- **LangChain**: For orchestrating the RAG pipeline components.
- **ChromaDB**: As the vector database for storing and retrieving embedded document chunks.
- **Hugging Face (HF)**: For various NLP tasks, including models for transcription, tokenization, and potentially embeddings and text generation.
- **Cohere**: Utilized for its powerful reranking capabilities to improve the relevance of retrieved documents.
- **OpenAI**: For language model capabilities, potentially for summarization and query rewriting.

## RAG with Shorter Videos

In [None]:
pip install langchain_text_splitters langchain_community faiss-cpu langchain-huggingface cohere langchain-openai chromadb

In [None]:
from huggingface_hub import notebook_login

# Authenticate this notebook session with the Hugging Face Hub (prompts for an access token).
notebook_login()

Getting the transcriptions of all the audio files

In [None]:
import torch
from transformers import pipeline
from datasets import load_dataset
import librosa

# Run on the first GPU when one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
  device = "cuda:0"
else:
  device = "cpu"

# Speech-recognition pipeline; the audio is processed in 30-second windows.
whisper_model = pipeline(
  task="automatic-speech-recognition",
  model="openai/whisper-small",
  device=device,
  chunk_length_s=30,
)

# Local audio files to transcribe.
rl_audio_1 = "YouTube_Agent-Reinforcement-Fine-Tuning-Will-Han_Media.mp3"
rl_audio_2 = "YouTube_Efficient-Reinforcement-Learning-Rhythm_Media.mp3"
rl_audio_3 = "YouTube_RL-Environments-at-Scale-Will-Brown-Prim_Media.mp3"
combined_audio = [rl_audio_1, rl_audio_2, rl_audio_3]

# Transcribe the files in order: transcription_all[i] is the text of combined_audio[i].
transcription_all = []
for audio_path in combined_audio:
  # The audio is resampled to 16 kHz on load, so the returned rate is not used afterwards.
  waveform, loaded_rate = librosa.load(audio_path, sr=16000)
  asr_output = whisper_model(waveform.copy(), batch_size=8)
  transcription_all.append(asr_output["text"])

In [None]:
from transformers import AutoTokenizer
# Sanity check: preview the start of every transcript and report its length in Qwen tokens
qwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
for source_number, transcript_text in enumerate(transcription_all, start=1):
  print(f'Transcript for source {source_number}: ', transcript_text[:300])
  token_ids = qwen_tokenizer.encode(transcript_text)
  print(f"The token count of the 'transcript' is: {len(token_ids)} tokens")

In [None]:
# To free up RAM space: `del` only removes the name, so also run the garbage collector
# and hand PyTorch's cached GPU blocks back to the driver.
import gc

del whisper_model
gc.collect()
if torch.cuda.is_available():
  torch.cuda.empty_cache()

In [None]:
from transformers import pipeline
from tqdm import tqdm

# Instruction-tuned model that turns every raw transcript into structured study notes.
qwen_model = pipeline("text-generation", model="Qwen/Qwen2.5-1.5B-Instruct", max_new_tokens=1500, device=device)

notes = []
# Pair every transcript with its audio file directly instead of indexing by position, and
# give tqdm the total explicitly: a zip/enumerate object has no length, so without it the
# progress bar cannot show how far along the loop is.
for current_audio, transcript in tqdm(
  zip(combined_audio, transcription_all), total=len(transcription_all)
):
  messages = [
    {"role": "system", "content": "You are an expert at converting transcriptions into detailed analysis that can be used to learn the content of the transcription. Analyse the transcription and create a well-structured (according to current educational best practices) analysis about all the key points discussed. Start with an executive summary in a couple of sentence, not other special intro needed."},
    {"role": "user", "content": f"Discuss all the key points in detail of the provided transcript {transcript}. Make sure to structure your answer to facilitate learning from it, hence following the educational best practices. Avoid using bullet points, use full sentences and make it very detailed"},
  ]
  # The generated text is the whole chat; its last message holds the model's reply.
  note_content = qwen_model(messages)[0]['generated_text'][-1]['content']
  notes.append({
      "summary": note_content,
      "metadata": {"source": current_audio}
  })


In [None]:
# Inspect the notes generated for the first audio file
notes[0]['summary']

In [None]:
# Report how many Qwen tokens each generated summary note contains
for summary_number, note in enumerate(notes, start=1):
  summary_tokens = qwen_tokenizer.encode(note['summary'])
  print(f"The token count of the Summary #{summary_number} is: {len(summary_tokens)} tokens")

In [None]:
# Release the summarisation model: `del` only removes the name, so also run the garbage
# collector and hand PyTorch's cached GPU blocks back to the driver.
import gc

del qwen_model
gc.collect()
if torch.cuda.is_available():
  torch.cuda.empty_cache()

In [None]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model once to read the longest input sequence it accepts
max_sequence_length = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2').max_seq_length
print(f"Model's maximum sequence length: {max_sequence_length}")

In [None]:
from langchain_core.documents import Document as LangchainDocument

# Wrap every generated note in a LangChain document, carrying its source metadata along
YT_KNOWLEDGE_BASE = []
for entry in tqdm(notes):
    YT_KNOWLEDGE_BASE.append(
        LangchainDocument(page_content=entry["summary"], metadata=entry["metadata"])
    )


In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing import Optional, List

# Hub id of the sentence-embedding model; its tokenizer is also the default used to size chunks.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
) -> List[LangchainDocument]:
    """
    Chunk every document of `knowledge_base` into pieces of at most `chunk_size` tokens.

    Lengths are measured with the Hugging Face tokenizer named by `tokenizer_name`, and
    consecutive chunks overlap by a tenth of `chunk_size`. The chunks of all documents are
    returned as one flat list, in input order.
    """
    hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        hf_tokenizer,
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=False,
        strip_whitespace=True,
    )

    # Split one document at a time and flatten the per-document results.
    return [
        chunk
        for document in knowledge_base
        for chunk in splitter.split_documents([document])
    ]


# Chunk the notes; 256 is a chunk size adapted to our embedding model
notes_processed = split_documents(
    chunk_size=256,
    knowledge_base=YT_KNOWLEDGE_BASE,
    tokenizer_name=EMBEDDING_MODEL_NAME,
)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Count the tokens of every chunk with the embedding model's tokenizer and plot the distribution
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
lengths = [len(tokenizer.encode(chunk.page_content)) for chunk in tqdm(notes_processed)]
length_series = pd.Series(lengths)
fig = length_series.hist()
plt.title("Distribution of document lengths in the knowledge base (in count of tokens)")
plt.show()

In [None]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy

# Embedding model running on the CPU; embeddings are requested in normalised form.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
    multi_process=True,
)

# Embed every chunk and index the vectors in FAISS using the cosine distance strategy.
KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
    documents=notes_processed,
    embedding=embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)

In [None]:
# Sample question used to exercise retrieval, reranking and generation in the cells below
USER_QUERY = "What are the best practices of training an RL model or Agent?"

In [None]:
# Embed the query once and search with that vector. Passing the raw text to
# `similarity_search` would embed the same query a second time and leave
# `query_vector` unused.
query_vector = embedding_model.embed_query(USER_QUERY)

print(f"\nStarting retrieval for {USER_QUERY=}...")
retrieved_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search_by_vector(query_vector, k=6)
print(
    "\n==================================Top document=================================="
)
print(retrieved_docs[0].page_content)
print("==================================Metadata==================================")
print(retrieved_docs[0].metadata)

In [None]:
from transformers import pipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Despite the "retriever" naming, this is the model that writes the final answer:
# it is the `llm` handed to `answer_with_rag` further down.
retriever_model_name = "LiquidAI/LFM2.5-1.2B-Instruct"

lqd_tokenizer = AutoTokenizer.from_pretrained(retriever_model_name)
lqd_model = AutoModelForCausalLM.from_pretrained(retriever_model_name)
lqd_model = lqd_model.to(device)

# Sampling text-generation pipeline that returns only the newly generated text.
RETRIEVER_LLM = pipeline(
    task="text-generation",
    model=lqd_model,
    tokenizer=lqd_tokenizer,
    max_new_tokens=800,
    return_full_text=False,
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
)

In [None]:
# System turn: how the model must answer and what to do when the context is insufficient.
system_turn = {
    "role": "system",
    "content": """Using the information contained in the context,
give a comprehensive answer to the question.
Your responses should be relevant to the question and at the end, ask the user for any follow up question related to the response.
If the answer cannot be deduced from the context, state that to the user.""",
}
# User turn: {context} and {question} are filled in later with str.format.
user_turn = {
    "role": "user",
    "content": """Context:
{context}
---
Question: {question}""",
}
prompt_in_chat_format = [system_turn, user_turn]

# Render the chat as a plain string that ends with the generation prompt.
YT_RAG_PROMPT_TEMPLATE = lqd_tokenizer.apply_chat_template(
    prompt_in_chat_format,
    add_generation_prompt=True,
    tokenize=False,
)

print(YT_RAG_PROMPT_TEMPLATE)

In [None]:
import cohere as Cohere
from google.colab import userdata
COHERE_API_KEY = 'YOUR_COHERE_API_KEY'

# Cohere client; it is also the default reranker of `answer_with_rag` below
co = Cohere.ClientV2(api_key=COHERE_API_KEY)

# Ask the reranker for the 3 retrieved chunks that best match the query
candidate_texts = [doc.page_content for doc in retrieved_docs]
results = co.rerank(
    model="rerank-v3.5",
    query=USER_QUERY,
    documents=candidate_texts,
    top_n=3,
)

# Display the reranking results while collecting the documents in their new order
reranked_documents = []
for rank, result in enumerate(results.results, start=1):
    reranked_doc = retrieved_docs[result.index]
    print(f"Rank: {rank}")
    print(f"Score: {result.relevance_score}")
    print(f"Document: {reranked_doc}\n")
    reranked_documents.append(reranked_doc)

In [None]:
# Raw rerank entries; each one is read above through its .index and .relevance_score
results.results

In [None]:
from typing import Tuple

def answer_with_rag(
    question: str,
    llm: pipeline,
    knowledge_index: FAISS,
    reranker = co,
    num_retrieved_docs: int = 6,
    num_docs_final: int = 4,
) -> Tuple[str, List[str]]:
    """
    Answer `question` with retrieval-augmented generation.

    Args:
        question: The user question.
        llm: Text-generation callable. It is called with the final prompt and must return
            a list whose first item has a "generated_text" entry.
        knowledge_index: Vector store searched for chunks similar to the question.
        reranker: Client exposing `rerank(...)`, used to reorder the retrieved chunks.
            Pass a falsy value to skip reranking.
        num_retrieved_docs: Number of chunks fetched from the vector store.
        num_docs_final: Maximum number of chunks placed in the prompt.

    Returns:
        A tuple of the generated answer and the chunk texts that were used as context.
    """
    # Gather documents with retriever
    print("=> Retrieving documents...")
    retrieved = knowledge_index.similarity_search(
        query=question, k=num_retrieved_docs
    )
    relevant_docs = [doc.page_content for doc in retrieved]  # Keep only the text

    # With no candidates there is nothing to rerank, so the reranker call is skipped.
    if reranker and relevant_docs:
        print("=> Reranking documents...")
        reranked = reranker.rerank(
            model="rerank-v3.5",
            query=question,
            documents=relevant_docs,
            top_n=num_docs_final,
        )
        relevant_docs = [relevant_docs[hit.index] for hit in reranked.results]

    relevant_docs = relevant_docs[:num_docs_final]

    # Build the final prompt. The chunks are joined into plain text: formatting the list
    # itself would inject its Python repr (brackets, quotes, escaped newlines) into the prompt.
    context = "\nExtracted documents:\n" + "".join(
        f"Document {position}:::\n{text}\n"
        for position, text in enumerate(relevant_docs)
    )

    final_prompt = YT_RAG_PROMPT_TEMPLATE.format(question=question, context=context)

    # Generate the answer
    print("=> Generating answer...")
    answer = llm(final_prompt)[0]["generated_text"]

    return answer, relevant_docs

In [None]:
# Run the whole pipeline on the sample query and show the answer with its supporting chunks
rag_answer, relevant_docs = answer_with_rag(
    question=USER_QUERY,
    llm=RETRIEVER_LLM,
    knowledge_index=KNOWLEDGE_VECTOR_DATABASE,
)
print('Returned answer: ', rag_answer)
print('Relevant docs for the returned answer: ', relevant_docs)

## Using Longer Video Input

In [None]:
import torch
from pydantic_settings import BaseSettings
from pydantic import BaseModel
from google.colab import userdata

# Run on the first GPU when one is visible to PyTorch, otherwise on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"


# Central configuration for the long-video pipeline: input file, model ids and API keys.
class Settings(BaseSettings):
    # --- Audio input ---
    AUDIO_PATH: str = "Building-Agents-with-Model-Context-Protocol.mp3"
    AUDIO_SOURCE: str = "Building-Agents-with-Model-Context-Protocol"

    # --- Speech-to-text model ---
    AUDIO_MODEL: str = "openai/whisper-large-v3-turbo"

    # --- Text generation model and credentials (placeholders, to be replaced with real values) ---
    TEXT_GENERATION: str = "HuggingFaceTB/SmolLM3-3B"
    OPENAI_KEY: str = "YOUR_OPENAI_API_KEY"
    HF_TOKEN: str = "YOUR_HF_WRITE_ACCESS"
    COHERE_API_KEY: str = "YOUR_COHERE_API_KEY"

    # --- RAG: embedding model, cross-encoder id and device ---
    TEXT_EMBEDDING_MODEL_ID: str = "ibm-granite/granite-embedding-small-english-r2"
    RERANKING_CROSS_ENCODER_MODEL_ID: str = "cross-encoder/ms-marco-MiniLM-L6-v2"
    RAG_MODEL_DEVICE: str = "cpu"


settings = Settings()

In [None]:
from langchain_core.prompts import ChatPromptTemplate


class QueryRewritingTemplate(BaseModel):
    # Prompt with three slots: {break_down_to_n} and {separator} are pre-filled by
    # `create_template`, {question} is supplied when the prompt is invoked.
    prompt: str = """You are an expert at paraphrasing user queries and breaking them down into smaller subqueries.
    Your task is to first paraphrase/make it more concrete, clear and then break this down {break_down_to_n}
    different sub-queries of the given user question to retrieve relevant documents from a vector
    database. By paraphrasing the user question, your aim is to add further more concrete details
    that could help the user to get the full or better picture. Then, by breaking down such paraphrased query,
    your goal is to find and retrieve the little nuances of the query. Provide these sub-queries seperated by '{separator}'.
    Original question: {question}
    """

    @property
    def separator(self) -> str:
        """Marker the model is asked to put between two sub-queries."""
        return "#next-subquery#"

    def create_template(self, break_down_to_n: int) -> ChatPromptTemplate:
        """Build the chat prompt with the separator and sub-query count already filled in."""
        human_message = ("human", self.prompt)
        chat_prompt = ChatPromptTemplate.from_messages([human_message])
        prefilled = {
            "separator": self.separator,
            "break_down_to_n": break_down_to_n,
        }
        return chat_prompt.partial(**prefilled)


In [None]:
import librosa
from transformers import pipeline


def transcribe(audio_path:str, model_name:str, device:str, chunk_length=30, batch_size:int=8) -> str:
  """
  Transcribe one audio file and return the recognised text.

  Args:
    audio_path: Path of the audio file to load.
    model_name: Id of the speech-recognition model handed to the pipeline.
    device: Device the pipeline runs on (e.g. "cuda:0" or "cpu").
    chunk_length: Window length in seconds; consecutive windows overlap by a third of it.
    batch_size: Number of windows sent through the model at once.

  Returns:
    The "text" entry of the pipeline output.
  """
  import gc

  speech_rec_model = pipeline(
    "automatic-speech-recognition",
    model=model_name,
    chunk_length_s=chunk_length,
    stride_length_s=chunk_length / 3,
    device=device,
  )
  # The raw array carries no rate information, so the audio is resampled to 16 kHz on load.
  # NOTE(review): assumes the chosen model expects 16 kHz input — confirm for non-Whisper models.
  raw_audio, _ = librosa.load(audio_path, sr=16000)
  transcription = speech_rec_model(raw_audio.copy(), batch_size=batch_size)["text"]

  # `del` alone only drops a local name that would vanish on return anyway; collect
  # garbage and empty PyTorch's CUDA cache so the memory is actually available again.
  del speech_rec_model
  gc.collect()
  if torch.cuda.is_available():
    torch.cuda.empty_cache()

  return transcription

# Transcribe the long video with the model and audio file configured in `settings`
transcription = transcribe(
  audio_path=settings.AUDIO_PATH,
  model_name=settings.AUDIO_MODEL,
  device=device,
)

In [None]:
from sentence_transformers import SentenceTransformer

# Load the configured embedding model once to read the longest input sequence it accepts
max_sequence_length = SentenceTransformer(settings.TEXT_EMBEDDING_MODEL_ID).max_seq_length
print(f"Model's maximum sequence length: {max_sequence_length}")

In [None]:
from langchain_openai import ChatOpenAI
from typing import Optional, List, Tuple
from langchain_core.documents import Document as LangchainDocument
from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_documents(
    chunk_size: int,
    transcription: str,
) -> List[LangchainDocument]:
    """
    Split the transcript into overlapping chunks and return them as documents.

    `chunk_size` is measured in characters, not tokens: the splitter is created without a
    custom length function, so it counts characters. Consecutive chunks overlap by a tenth
    of `chunk_size`, and every chunk records its start offset in the `start_index` metadata.

    Each chunk's metadata carries the audio source name under both 'source' and 'summary'.
    'summary' is kept for backward compatibility (it is overwritten later by the generated
    summaries), while 'source' preserves the provenance.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
    )

    knowledge_base = LangchainDocument(
        page_content=transcription,
        metadata={'summary': settings.AUDIO_SOURCE, 'source': settings.AUDIO_SOURCE},
    )

    return text_splitter.split_documents([knowledge_base])

# Using a not too big chunk size (2500, counted in characters by this splitter) to retain some granularity
chunked_transcript = split_documents(chunk_size=2500, transcription=transcription)

In [None]:
def add_summ_metadata(chunked_transcript: List[LangchainDocument]):
  """Store a one-sentence LLM summary in each chunk's 'summary' metadata (in place) and return the list."""
  summarizer = ChatOpenAI(model="gpt-5-nano", api_key=settings.OPENAI_KEY)

  for chunk in chunked_transcript:
    # Ask the model for a single-sentence summary of this chunk
    summary_prompt = f"Write a one sentence summary of what the key topic/points of the passage is. This will be used for filtering purposes to get what the content is about more quickly. It should strictly be just one sentence. Chunk: {chunk.page_content}"

    # Overwrite the 'summary' metadata with the model's reply
    chunk.metadata['summary'] = summarizer.invoke(summary_prompt).content

  return chunked_transcript

# Replace each chunk's 'summary' metadata with an LLM-written one-sentence summary
chunked_transcript = add_summ_metadata(chunked_transcript)

In [None]:
# Check the metadata now attached to the first chunk
chunked_transcript[0].metadata

In [None]:
# Custom embedding function for the Hugging Face inference router
import requests
import numpy as np
from chromadb.api.types import EmbeddingFunction

class HuggingFaceRouterEmbeddingFunction(EmbeddingFunction):
    """
    Embedding function that posts texts to the Hugging Face router's feature-extraction
    endpoint for `model_name` and returns the resulting vectors.
    """

    def __init__(self, api_key, model_name, timeout=120):
        """
        Args:
            api_key: Hugging Face token sent as a Bearer token.
            model_name: Model id inserted into the endpoint URL.
            timeout: Seconds to wait for the HTTP response. Without a timeout a stalled
                connection would block the caller indefinitely.
        """
        self.api_key = api_key
        self.model_name = model_name
        self.timeout = timeout

        self.url = (
            "https://router.huggingface.co/hf-inference/models/"
            f"{model_name}/pipeline/feature-extraction"
        )

        self.headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }

    def __call__(self, texts):
        """
        Embed `texts` with one HTTP request.

        Returns:
            One float32 numpy array per entry of the JSON response.

        Raises:
            RuntimeError: If the endpoint answers with a status code other than 200.
        """
        payload = {"inputs": texts}

        r = requests.post(
            self.url, headers=self.headers, json=payload, timeout=self.timeout
        )

        if r.status_code != 200:
            raise RuntimeError(
                f"HuggingFace API error {r.status_code}: {r.text}"
            )

        data = r.json()

        # Return as float32 numpy arrays (what Chroma expects)
        return [np.array(vec, dtype=np.float32) for vec in data]

In [None]:
from langchain_core.documents import Document as LangchainDocument
import chromadb
import chromadb.utils.embedding_functions as embedding_functions

chroma_client = chromadb.PersistentClient(path="./chroma_transcripts_db")
# this is where the indexed embeddings, transcripts and metadata will be stored.
# The client persists to disk, so the collection may already exist from an earlier run:
# `get_or_create_collection` reuses it, whereas `create_collection` would raise.
transcript_collection = chroma_client.get_or_create_collection(
    name="yt_long_transcript",
    embedding_function=HuggingFaceRouterEmbeddingFunction(
      api_key=settings.HF_TOKEN,
      model_name=settings.TEXT_EMBEDDING_MODEL_ID
    ),
    configuration={
        "hnsw": {
            "space": "cosine",
            "ef_construction": 250 # size of the candidate list used to select neighbors during index creation: high number means better accuracy, but more compute cost
        }
    }
)


In [None]:
# One record per chunk: a positional id, the chunk text and its one-sentence summary
chunk_ids = [f"mcp_transcript_{position}" for position, _ in enumerate(chunked_transcript)]
chunk_texts = [chunk.page_content for chunk in chunked_transcript]
chunk_metadatas = [{'summary': chunk.metadata['summary']} for chunk in chunked_transcript]

transcript_collection.add(
    ids=chunk_ids,
    documents=chunk_texts,
    metadatas=chunk_metadatas,
)

In [None]:
# Fetch the first stored record to confirm the chunks were written to the collection
transcript_collection.get(ids=['mcp_transcript_0'])

In [None]:
def query_database(query_text, n_results=10):
    """Query the transcript collection and return up to `n_results` documents with their metadata."""
    return transcript_collection.query(
        query_texts=query_text,
        n_results=n_results,
        include=['documents', 'metadatas'],
    )

Loading the model and creating the prompt

In [None]:
from torch import cuda, torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Tokenizer and weights of the answer-generating model; the weights are loaded in
# bfloat16 and placed on `device`.
tokenizer = AutoTokenizer.from_pretrained(settings.TEXT_GENERATION)
model = AutoModelForCausalLM.from_pretrained(
    settings.TEXT_GENERATION,
    torch_dtype=torch.bfloat16,
    device_map=device,
)


In [None]:
class QueryRewriting():
    """Expands a user question into several retrieval queries with an OpenAI chat model."""

    @staticmethod
    def _parse_sub_queries(raw_response: str, separator: str) -> List[str]:
        """Split the model output on `separator`, trimming whitespace and dropping blank parts."""
        return [part.strip() for part in raw_response.split(separator) if part.strip()]

    def generate(self, query: str, break_down_to_n: int) -> List[str]:
        """
        Return the queries to run against the vector store for `query`.

        The original question always comes first and counts as one of the
        `break_down_to_n` queries, so the model is asked for `break_down_to_n - 1`
        sub-queries. Fragments that are empty after stripping (for example when the reply
        ends with the separator) are discarded.

        Args:
            query: The user question.
            break_down_to_n: Total number of queries wanted, including the original one.

        Returns:
            The original question followed by the cleaned sub-queries.

        Raises:
            AssertionError: If `break_down_to_n` is not greater than 0.
        """
        assert break_down_to_n > 0, f"'break_down_to_n' should be greater than 0. Got {break_down_to_n}."

        query_rewriting_template = QueryRewritingTemplate()
        prompt = query_rewriting_template.create_template(break_down_to_n - 1)
        model = ChatOpenAI(model='gpt-5-mini', api_key=settings.OPENAI_KEY, temperature=0.2)

        chain = prompt | model

        response = chain.invoke({"question": query})
        sub_queries = self._parse_sub_queries(
            response.content, query_rewriting_template.separator
        )

        return [query, *sub_queries]

In [None]:
# System turn: answering rules, ending with the /no_think marker.
system_turn = {
    "role": "system",
    "content": """Using the information contained in the context,
give a comprehensive answer to the questions.
Your responses should be relevant to the question and at the end, ask the user for any follow up question related to the response.
If the answer cannot be deduced from the context, state that to the user.
/no_think""",
}
# User turn: {context} and {question} are filled in later with str.format.
user_turn = {
    "role": "user",
    "content": """Context:
{context}
---
Question: {question}""",
}
final_llm_prompt = [system_turn, user_turn]

# Render the chat as a plain string that ends with the generation prompt.
YT_LONG_RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
    final_llm_prompt,
    add_generation_prompt=True,
    tokenize=False,
)

In [None]:
# Show the rendered prompt, including the {context} and {question} placeholders
print(YT_LONG_RAG_PROMPT_TEMPLATE)

In [None]:
import cohere
# NOTE(review): not read by answer_from_transcript below, which takes its key from settings.COHERE_API_KEY
COHERE_API_KEY = 'YOUR_COHERE_API_KEY'

def answer_from_transcript(
  user_query: str,
  break_down_to_n: int = 5,
  n_results: int = 7,
  top_n: int = 3,
  max_new_tokens: int = 3000,
):
  """
  Answer `user_query` from the indexed transcript.

  The question is expanded into several queries. Each query retrieves `n_results` chunks
  from the collection, the chunks are reranked against the original question, and the
  `top_n` best are kept. The kept chunks are de-duplicated, joined into plain text and
  given to the local model as context.

  Args:
    user_query: The user question.
    break_down_to_n: Number of queries requested from the query rewriter.
    n_results: Chunks retrieved from the collection per query.
    top_n: Chunks kept per query after reranking.
    max_new_tokens: Generation budget for the answer.

  Returns:
    The generated answer, decoded without the prompt and without special tokens.
  """
  co = cohere.ClientV2(api_key=settings.COHERE_API_KEY)
  query_rewriting = QueryRewriting()
  rewrote_query = query_rewriting.generate(user_query, break_down_to_n=break_down_to_n)

  contexts = []
  seen_passages = set()
  for sub_query in rewrote_query:
    sub_query = sub_query.strip()
    if not sub_query:
      # Splitting the model reply on the separator can leave blank fragments
      continue

    db_finding = query_database(sub_query, n_results=n_results)

    # Flatten the list of lists into a single list of strings for Cohere reranking
    relevant_docs_flat = [item for sublist in db_finding['documents'] for item in sublist]
    if not relevant_docs_flat:
      # Nothing retrieved for this query, so there is nothing to rerank
      continue

    rerank_results = co.rerank(
      model="rerank-v3.5", query=user_query, documents=relevant_docs_flat, top_n=top_n
    )
    for result in rerank_results.results:
      passage = relevant_docs_flat[result.index]
      # Several queries often surface the same chunk; keep only its first occurrence
      if passage not in seen_passages:
        seen_passages.add(passage)
        contexts.append(passage)

  # Join the passages into readable text: formatting the list itself would put its
  # Python repr (brackets, quotes, escaped newlines) into the prompt.
  context_text = "\n\n".join(
    f"Document {position}:\n{passage}"
    for position, passage in enumerate(contexts, start=1)
  )
  final_prompt = YT_LONG_RAG_PROMPT_TEMPLATE.format(question=user_query, context=context_text)

  # Pass the whole encoding so that generate() also receives the attention mask
  inputs = tokenizer(final_prompt, return_tensors="pt").to(model.device)
  prompt_length = inputs["input_ids"].shape[-1]

  outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
  # Decode only the newly generated tokens and leave out special tokens
  return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

In [None]:
# Sample question about the long video, answered with the full pipeline
USER_QUERY = "What are the core concepts of MCP?"
response_rag = answer_from_transcript(user_query=USER_QUERY)

In [None]:
# Display the generated answer
response_rag