# L32 - Retrieval Augmented Generation (RAG)

![image_description](Retrieval%20Augmented%20Generation%20Design%20Pattern.webp)

# Naive implementation

In [None]:
import os
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()

openai_client = OpenAI(api_key='')

In [3]:
import numpy as np

from typing import List, Dict
import pickle

In [69]:
documents = [
    "OpenAI was founded in 2015. Its mission is to ensure artificial intelligence benefits all of humanity.",
    "GPT-4 was released in 2023. It shows improved capabilities in reasoning and creative tasks.",
    "Claude is an AI assistant created by Anthropic, known for its helpful and honest responses and amazing performance for coding tasks.",
    "Witcher 3 is a game about a monster hunter named Geralt of Rivia. It was released in 2015.",
    "Witcher 4 is new game in the series where Ciri is the main character. It is expected to be released in 2025.",
    "Real Madrid is the best football club in the world. The club won 14 Champions League titles and main stadium is called Santiago Bernabeu, located Avenida de Concha Espina, 1. You can get there by metro line 10, station is Santiago Bernabeu.",
    "Astana is the capital of Kazakhstan. It is located in the north-central part of the country, on the Ishim River. Other names of the city are Nur-Sultan and Akmola.",
    "Adilet is the mentor at Outpeer, he is born in March 30, 1993 in Astana. He currently lives in Madrid, Spain.",
    "Alibek is a student at SDU university and recently finished his bachelors first year study. He currently lives in almaty",
    "Kairat arena is a stadium located in Almaty",
    "Alibek likes to play games"
]

In [70]:
def get_embedding(text: str) -> List[float]:
    """Get embedding for a text using OpenAI's embedding model."""
    response = openai_client.embeddings.create(
        model="text-embedding-3-small", # 1536 dimensions
        input=text
    )
    return response.data[0].embedding

def cosine_similarity(a: List[float], b: List[float]) -> float:
    """Calculate cosine similarity between two vectors."""
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


In [71]:
doc_embeddings = {}
for i, doc in enumerate(documents):
    doc_embeddings[i] = get_embedding(doc)

# Save embeddings to avoid regenerating them
with open('doc_embeddings.pkl', 'wb') as f:
    pickle.dump(doc_embeddings, f)

In [72]:
len(doc_embeddings)

11

In [76]:
def find_relevant_docs(query: str, doc_embeddings: Dict, top_k: int = 8) -> List[str]:
    """Find most relevant documents for a query."""
    query_embedding = get_embedding(query)

    # Calculate similarities - O(n)
    similarities = {}
    for idx, doc_embedding in doc_embeddings.items():
        similarity = cosine_similarity(query_embedding, doc_embedding)
        if similarity > 0:
            similarities[idx] = similarity

    # Get top k relevant documents
    relevant_doc_indices = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:top_k] # O(nlog n)

    # time complexity: O(n log n) + O(n)


    return [documents[idx] for idx, _ in relevant_doc_indices]

In [77]:
find_relevant_docs("Give everything related to Alibek", doc_embeddings, top_k=11)

['Alibek likes to play games',
 'Alibek is a student at SDU university and recently finished his bachelors first year study. He currently lives in almaty',
 'Kairat arena is a stadium located in Almaty',
 'Astana is the capital of Kazakhstan. It is located in the north-central part of the country, on the Ishim River. Other names of the city are Nur-Sultan and Akmola.',
 'Adilet is the mentor at Outpeer, he is born in March 30, 1993 in Astana. He currently lives in Madrid, Spain.',
 'GPT-4 was released in 2023. It shows improved capabilities in reasoning and creative tasks.',
 'Claude is an AI assistant created by Anthropic, known for its helpful and honest responses and amazing performance for coding tasks.',
 'OpenAI was founded in 2015. Its mission is to ensure artificial intelligence benefits all of humanity.',
 'Witcher 4 is new game in the series where Ciri is the main character. It is expected to be released in 2025.',
 'Real Madrid is the best football club in the world. The clu

In [78]:
prompt_template = """
Use the following context to answer the question. If the answer cannot be found in the context, say `I don't have enough information to answer that.`

Context:
{context}

Question: {query}
Answer:

"""

def get_rag_response(query: str, relevant_docs: List[str]) -> str:
    """Get response using RAG approach."""
    # Create prompt with context
    context = "\n".join(relevant_docs)
    prompt = prompt_template.format(context=context, query=query)

    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": prompt}
        ],
        temperature=0,
    )

    return response.choices[0].message.content


In [79]:
user_query = "What stadium Alibek can visit?"

relevant_docs = find_relevant_docs(user_query, doc_embeddings)
print("Relevant documents:", relevant_docs)

answer = get_rag_response(user_query, relevant_docs)
print("\nAnswer:", answer)


Relevant documents: ['Kairat arena is a stadium located in Almaty', 'Alibek likes to play games', 'Alibek is a student at SDU university and recently finished his bachelors first year study. He currently lives in almaty', 'Astana is the capital of Kazakhstan. It is located in the north-central part of the country, on the Ishim River. Other names of the city are Nur-Sultan and Akmola.', 'Adilet is the mentor at Outpeer, he is born in March 30, 1993 in Astana. He currently lives in Madrid, Spain.', 'Real Madrid is the best football club in the world. The club won 14 Champions League titles and main stadium is called Santiago Bernabeu, located Avenida de Concha Espina, 1. You can get there by metro line 10, station is Santiago Bernabeu.', 'GPT-4 was released in 2023. It shows improved capabilities in reasoning and creative tasks.', 'OpenAI was founded in 2015. Its mission is to ensure artificial intelligence benefits all of humanity.']

Answer: Alibek can visit Kairat arena, which is loca

### [Discussion]: let's criticize this design

1. What specifications we need here?
2. What is important? What metrics?
3. What parts of design could be improved?
4. What is missing?

### Improvement #1: lets introduce vector database

Pinecone -> service

PostrgreSQL + PGVector

Weaviate


sqlite analog in vector db is chromadb


In [80]:
import chromadb
from chromadb.utils import embedding_functions

In [81]:
chroma_client = chromadb.Client()

In [82]:
if "documents" in [collection.name for collection in chroma_client.list_collections()]:
    print("Collection already exists. Deleting it...")
    chroma_client.delete_collection("documents")
    print("Collection deleted")
else:
    print("No collection found")

Collection already exists. Deleting it...
Collection deleted


In [83]:
openai_embedding_function = embedding_functions.OpenAIEmbeddingFunction(
    api_key=os.getenv("OPENAI_API_KEY"),
    model_name="text-embedding-3-small"
)

collection = chroma_client.create_collection(
    name="documents",
    embedding_function=openai_embedding_function,
    metadata={"hnsw:space": "cosine"}
)

In [84]:
collection.add(
    documents=documents,
    ids=[str(i) for i in range(len(documents))],
    metadatas=[{"year": 2024, "source": "manual"} for _ in range(len(documents))]
)

In [85]:
collection.peek()

{'ids': ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'],
 'embeddings': array([[-0.00794725, -0.01631136,  0.03839477, ...,  0.00548344,
         -0.00082038, -0.01822469],
        [-0.00891934,  0.01599939,  0.06424951, ...,  0.00273375,
          0.01734737,  0.01989215],
        [ 0.01125212,  0.01812918,  0.01349555, ..., -0.02542559,
         -0.00389631,  0.00995917],
        ...,
        [-0.02045739, -0.04141772,  0.0388024 , ..., -0.01632064,
          0.00251788, -0.01975326],
        [-0.0255367 , -0.06149566, -0.01169801, ..., -0.02878014,
         -0.04004569,  0.00813563],
        [-0.05765361, -0.03480198,  0.00484058, ...,  0.00058298,
         -0.02378667, -0.01312486]], shape=(10, 1536)),
 'documents': ['OpenAI was founded in 2015. Its mission is to ensure artificial intelligence benefits all of humanity.',
  'GPT-4 was released in 2023. It shows improved capabilities in reasoning and creative tasks.',
  'Claude is an AI assistant created by Anthropic, known for it

In [86]:
results = collection.query(
    query_texts=["When OpenAI was founded?"],
    n_results=3,
    where={"year": 2024},
    include=["documents", "metadatas", "distances"]
)
results

{'ids': [['0', '2', '1']],
 'embeddings': None,
 'documents': [['OpenAI was founded in 2015. Its mission is to ensure artificial intelligence benefits all of humanity.',
   'Claude is an AI assistant created by Anthropic, known for its helpful and honest responses and amazing performance for coding tasks.',
   'GPT-4 was released in 2023. It shows improved capabilities in reasoning and creative tasks.']],
 'uris': None,
 'included': ['documents', 'metadatas', 'distances'],
 'data': None,
 'metadatas': [[{'year': 2024, 'source': 'manual'},
   {'year': 2024, 'source': 'manual'},
   {'source': 'manual', 'year': 2024}]],
 'distances': [[0.24819272756576538, 0.6726889610290527, 0.737695574760437]]}

In [87]:
def query_documents(query: str, n_results: int = 2) -> List[str]:
    results = collection.query(
        query_texts=[query],
        n_results=n_results
    )
    return results["documents"][0]

query_documents("When OpenAI was founded?")

['OpenAI was founded in 2015. Its mission is to ensure artificial intelligence benefits all of humanity.',
 'Claude is an AI assistant created by Anthropic, known for its helpful and honest responses and amazing performance for coding tasks.']

In [90]:
user_query = "Where Alibek studies?"

relevant_docs = query_documents(user_query, n_results=3)
print("Relevant documents:", relevant_docs)

answer = get_rag_response(user_query, relevant_docs)
print("\nAnswer:", answer)


Relevant documents: ['Alibek is a student at SDU university and recently finished his bachelors first year study. He currently lives in almaty', 'Alibek likes to play games', 'Kairat arena is a stadium located in Almaty']

Answer: Alibek studies at SDU university.


### Improvement #2: LangChain

In [92]:
from langchain_openai import ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OpenAIEmbeddings
from langchain.docstore.document import Document
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain_core.prompts import ChatPromptTemplate

from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [101]:
try:
    chroma_client.delete_collection("my_documents")
except Exception as e:
    print(e)
    pass

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = Chroma(
    embedding_function=embeddings,
    collection_name="my_documents",
)
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5}
)
retriever


VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x75956e7f9a10>, search_kwargs={'k': 5})

In [102]:
retriever.get_relevant_documents("When OpenAI was founded?")

[]

In [103]:
docs = [
    Document(
        page_content=text,
        metadata={"source": "manual", "year": 2024}
    )
    for text in documents
]
vectorstore.add_documents(docs)

['f4296045-82c0-4aa4-8e73-daa87ccfc03f',
 'aac6cf0a-0ca4-4f7e-b964-5c3ed0c1759e',
 'f05ddfce-6717-4c11-9c0c-1ea933ed5c22',
 'a6658153-8267-4f84-bea1-c3c46cd8de2a',
 'd3cedad4-e2cc-448f-9f2e-35c2acb92afa',
 '8ce3aa8d-e985-4363-8640-c28e81a74021',
 '1dce928d-c326-42b4-972a-d578f4e09349',
 '159bccb2-2955-4f47-8a93-3bda18562ea7',
 '0b5b9780-c5e9-492b-9496-95317c94382e',
 '4b3a0862-fc52-4444-862e-27f37501287b',
 'dbddaa5b-2c9f-4259-851a-b715ef95a164']

In [104]:
retriever.get_relevant_documents("What stadium can Adilet visit?")

[Document(metadata={'source': 'manual', 'year': 2024}, page_content='Kairat arena is a stadium located in Almaty'),
 Document(metadata={'year': 2024, 'source': 'manual'}, page_content='Adilet is the mentor at Outpeer, he is born in March 30, 1993 in Astana. He currently lives in Madrid, Spain.'),
 Document(metadata={'source': 'manual', 'year': 2024}, page_content='Real Madrid is the best football club in the world. The club won 14 Champions League titles and main stadium is called Santiago Bernabeu, located Avenida de Concha Espina, 1. You can get there by metro line 10, station is Santiago Bernabeu.'),
 Document(metadata={'source': 'manual', 'year': 2024}, page_content='Alibek likes to play games'),
 Document(metadata={'year': 2024, 'source': 'manual'}, page_content='Alibek is a student at SDU university and recently finished his bachelors first year study. He currently lives in almaty')]

In [105]:

PROMPT_TEMPLATE = """
Use the following context to answer the question.
If the answer cannot be found in the context, say `Why do you ask me that?`
Add "Bro"

Context:
{context}

Question: {question}
Answer:
"""

PROMPT = PromptTemplate(
    template=PROMPT_TEMPLATE,
    input_variables=["context", "question"]
)

qa_chain = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model="gpt-4o-mini", temperature=0),
    retriever=retriever,
    chain_type="stuff", # stuff, map_reduce, refine
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT}
)

In [111]:
qa_chain({"query": "Can Adilet visit Santiago Bernabeu?"})

{'query': 'Can Adilet visit Santiago Bernabeu?',
 'result': 'Why do you ask me that? Bro',
 'source_documents': [Document(metadata={'year': 2024, 'source': 'manual'}, page_content='Real Madrid is the best football club in the world. The club won 14 Champions League titles and main stadium is called Santiago Bernabeu, located Avenida de Concha Espina, 1. You can get there by metro line 10, station is Santiago Bernabeu.'),
  Document(metadata={'year': 2024, 'source': 'manual'}, page_content='Adilet is the mentor at Outpeer, he is born in March 30, 1993 in Astana. He currently lives in Madrid, Spain.'),
  Document(metadata={'year': 2024, 'source': 'manual'}, page_content='Kairat arena is a stadium located in Almaty'),
  Document(metadata={'source': 'manual', 'year': 2024}, page_content='Alibek likes to play games'),
  Document(metadata={'year': 2024, 'source': 'manual'}, page_content='Alibek is a student at SDU university and recently finished his bachelors first year study. He currently 

In [113]:
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

qa_chain_with_memory = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model="gpt-4o-mini", temperature=0.2),
    chain_type="stuff",
    retriever=retriever,
    memory=memory
)

In [116]:
qa_chain_with_memory({"query": "Can Adilet vist Santiago Bernabeu"})

{'query': 'Can Adilet vist Santiago Bernabeu',
 'chat_history': [HumanMessage(content='Can Adilet vist Santiago Bernabeu', additional_kwargs={}, response_metadata={}),
  AIMessage(content='Yes, Adilet can visit Santiago Bernabeu since he currently lives in Madrid, Spain.', additional_kwargs={}, response_metadata={}),
  HumanMessage(content='Can Adilet vist Santiago Bernabeu', additional_kwargs={}, response_metadata={}),
  AIMessage(content='Yes, Adilet can visit Santiago Bernabeu since he currently lives in Madrid, Spain.', additional_kwargs={}, response_metadata={}),
  HumanMessage(content='Can Adilet vist Santiago Bernabeu', additional_kwargs={}, response_metadata={}),
  AIMessage(content='Yes, Adilet can visit Santiago Bernabeu since he currently lives in Madrid, Spain.', additional_kwargs={}, response_metadata={})],
 'result': 'Yes, Adilet can visit Santiago Bernabeu since he currently lives in Madrid, Spain.'}

In [117]:
qa_chain_with_memory({"query": "What stadium can Adilet visit?"})

{'query': 'What stadium can Adilet visit?',
 'chat_history': [HumanMessage(content='Can Adilet vist Santiago Bernabeu', additional_kwargs={}, response_metadata={}),
  AIMessage(content='Yes, Adilet can visit Santiago Bernabeu since he currently lives in Madrid, Spain.', additional_kwargs={}, response_metadata={}),
  HumanMessage(content='Can Adilet vist Santiago Bernabeu', additional_kwargs={}, response_metadata={}),
  AIMessage(content='Yes, Adilet can visit Santiago Bernabeu since he currently lives in Madrid, Spain.', additional_kwargs={}, response_metadata={}),
  HumanMessage(content='Can Adilet vist Santiago Bernabeu', additional_kwargs={}, response_metadata={}),
  AIMessage(content='Yes, Adilet can visit Santiago Bernabeu since he currently lives in Madrid, Spain.', additional_kwargs={}, response_metadata={}),
  HumanMessage(content='What stadium can Adilet visit?', additional_kwargs={}, response_metadata={}),
  AIMessage(content='Adilet can visit the Kairat arena in Almaty, as i

In [54]:
# Format documents with reference numbers
def format_docs(docs):
    return "\n".join(f"[{i+1}] {doc.page_content}" for i, doc in enumerate(docs))

prompt = ChatPromptTemplate.from_template("""
Answer the question based on the following context.
Include reference numbers [1], [2] etc. to cite your sources. At the end of the answer, include the sources themselves.

Context:
{context}

Question: {question}

Answer with sources:""")

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough()
    }
    | prompt
    | llm
    | StrOutputParser()
)

response = chain.invoke("What stadium can Adilet visit?")
print(response)

Adilet can visit the Santiago Bernabeu stadium, which is the home of Real Madrid, located in Madrid, Spain. This stadium is renowned as one of the best in the world and is accessible via metro line 10 at the Santiago Bernabeu station [2]. 

Sources:
[1] Adilet is the mentor at Outpeer, he is born in March 30, 1993 in Astana. He currently lives in Madrid, Spain.  
[2] Real Madrid is the best football club in the world. The club won 14 Champions League titles and main stadium is called Santiago Bernabeu, located Avenida de Concha Espina, 1. You can get there by metro line 10, station is Santiago Bernabeu.  
[3] Astana is the capital of Kazakhstan. It is located in the north-central part of the country, on the Ishim River. Other names of the city are Nur-Sultan and Akmola.


# Formats of Files?

1. Discussion
2. Design exercise: YouTube video searcher

# OpenAI RAG