# RAG - helloworld implementation

In [1]:
import chromadb
import pandas as pd
from chromadb.utils import embedding_functions
from langchain_chroma import Chroma
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_core.embeddings import DeterministicFakeEmbedding

from llm_cookbook.utils.get_client import openai_client

In [2]:
movies_data = [
    {
        "name": "The Shawshank Redemption",
        "description": "A wrongfully convicted banker endures life in Shawshank State Penitentiary while maintaining hope and forming profound friendships. The film explores themes of institutionalization and redemption through its 20-year narrative span.",
        "country": "United States",
        "year": 1994,
    },
    {
        "name": "Inception",
        "description": "A skilled thief enters people's dreams to steal secrets, but is tasked with planting an idea in a CEO's subconscious. The film's layered narrative explores reality perception through innovative visual storytelling.",
        "country": "United States",
        "year": 2010,
    },
    {
        "name": "Parasite",
        "description": "A poor Korean family strategically infiltrates a wealthy household, exposing class divisions through dark comedy and thriller elements. The film became the first non-English language Best Picture Oscar winner.",
        "country": "South Korea",
        "year": 2019,
    },
    {
        "name": "Amélie",
        "description": "A whimsical Parisian waitress embarks on a mission to improve others' lives while navigating her own isolation. The film presents a stylized vision of Montmartre with magical realism elements.",
        "country": "France",
        "year": 2001,
    },
    {
        "name": "Spirited Away",
        "description": "A young girl enters a spirit world bathhouse to rescue her transformed parents, encountering mystical creatures and personal growth. This Studio Ghibli masterpiece won the Academy Award for Best Animated Feature.",
        "country": "Japan",
        "year": 2001,
    },
    {
        "name": "The Godfather",
        "description": "A crime epic chronicling the Corleone mafia family's power struggles and the transformation of reluctant heir Michael Corleone. Considered one of cinema's greatest achievements in storytelling.",
        "country": "United States",
        "year": 1972,
    },
    {
        "name": "Pulp Fiction",
        "description": "Interconnected crime stories explore Los Angeles' criminal underworld through non-linear storytelling and iconic dialogue. The film revitalized John Travolta's career and influenced 1990s cinema.",
        "country": "United States",
        "year": 1994,
    },
    {
        "name": "Schindler's List",
        "description": "A German industrialist saves Jewish lives during the Holocaust by employing them in his factories. The black-and-white historical drama won seven Academy Awards including Best Picture.",
        "country": "United States",
        "year": 1993,
    },
    {
        "name": "The Dark Knight",
        "description": "Batman confronts the anarchic Joker in a battle for Gotham City's soul, exploring themes of chaos versus order. Heath Ledger's performance earned a posthumous Academy Award.",
        "country": "United States",
        "year": 2008,
    },
    {
        "name": "Life Is Beautiful",
        "description": "A Jewish father uses imagination to protect his son from the horrors of a Nazi concentration camp. The Italian tragicomedy won three Academy Awards including Best Foreign Language Film.",
        "country": "Italy",
        "year": 1997,
    },
]

df = pd.DataFrame(movies_data)
df.to_csv("data/toy.csv", index=False)

In [4]:
client.delete_collection(name="movie_db")

In [5]:
# Initialize Chroma client with batch optimizations
client = chromadb.PersistentClient()
collection = client.create_collection(
    name="movie_db",
    embedding_function=embedding_functions.DefaultEmbeddingFunction(),
)

# Prepare batch data
documents = []
metadatas = []
ids = []

for _, row in df.iterrows():
    # 1. Create document string
    documents.append(f"{row['name']}: {row['description']}")

    # 2. Prepare metadata
    metadatas.append(
        {
            "country": row["country"],
            "year": (row["year"]),  # Chroma requires string metadata values
        }
    )

    # 3. Generate unique ID (hash of name + year)
    ids.append(f"{hash(row['name'] + str(row['year']))}")

# Add to collection
collection.add(documents=documents, metadatas=metadatas, ids=ids)

In [7]:
results = collection.query(
    query_texts=["Artificial intelligence in film"],
    n_results=3,
    where={"year": {"$gte": 2000}},  # Filter movies from 2000 onwards
)

In [32]:
query_texts = "Any superhero movie you can recommend?"

query_result = collection.query(
    query_texts=query_texts,
    n_results=3,
)["documents"][0]


system_prompt = f"""
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, say that you don't know. 
Use three sentences maximum and keep the answer concise.

{query_result}
"""

message_stack = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": query_texts},
]

response = openai_client.chat.completions.create(messages=message_stack, model="gpt-4o")
print(response.choices[0].message.content)

I recommend "The Dark Knight," where Batman battles the Joker in an intense fight for Gotham City's soul. Heath Ledger's iconic performance as the Joker earned him a posthumous Academy Award. The film is widely regarded as one of the best superhero movies ever made.


In [33]:
print(system_prompt)


You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, say that you don't know. 
Use three sentences maximum and keep the answer concise.

["The Dark Knight: Batman confronts the anarchic Joker in a battle for Gotham City's soul, exploring themes of chaos versus order. Heath Ledger's performance earned a posthumous Academy Award.", "Schindler's List: A German industrialist saves Jewish lives during the Holocaust by employing them in his factories. The black-and-white historical drama won seven Academy Awards including Best Picture.", "Pulp Fiction: Interconnected crime stories explore Los Angeles' criminal underworld through non-linear storytelling and iconic dialogue. The film revitalized John Travolta's career and influenced 1990s cinema."]



## Langchain implementation

In [41]:
loader = CSVLoader(file_path="/data/toy.csv")
docs = loader.load_and_split()

embeddings = DeterministicFakeEmbedding(size=4096)
vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
)

In [42]:
# Add to collection
vector_store.add_documents(documents=docs, ids=ids)

['-1406321105825303933',
 '-3146088451348060405',
 '5741577411061685671',
 '-4409964805962562201',
 '-67135265337924727',
 '-5759269916624104679',
 '-1265879410418405999',
 '7269556372820919186',
 '2101915283243149551',
 '-7052679438491250289']

In [56]:
query_texts = "Any superhero movie you can recommend?"

query_result = vector_store.similarity_search(
    query_texts,
    k=3,
)

context = ("\n\n").join([res.page_content for res in results])

system_prompt = f"""
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, say that you don't know. 
Use three sentences maximum and keep the answer concise.

{context}
"""

message_stack = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": query_texts},
]

response = openai_client.chat.completions.create(messages=message_stack, model="gpt-4o")
print(response.choices[0].message.content)

I don't have information on superhero movies in the provided context, so I can't recommend one.


In [57]:
print(system_prompt)


You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, say that you don't know. 
Use three sentences maximum and keep the answer concise.

name: Spirited Away
description: A young girl enters a spirit world bathhouse to rescue her transformed parents, encountering mystical creatures and personal growth. This Studio Ghibli masterpiece won the Academy Award for Best Animated Feature.
country: Japan
year: 2001

name: Schindler's List
description: A German industrialist saves Jewish lives during the Holocaust by employing them in his factories. The black-and-white historical drama won seven Academy Awards including Best Picture.
country: United States
year: 1993

name: Parasite
description: A poor Korean family strategically infiltrates a wealthy household, exposing class divisions through dark comedy and thriller elements. The film became the first non-English language Best Picture Oscar winn

In [50]:
results = vector_store.similarity_search(
    "batman",
    k=3,
)

In [55]:
print(("\n\n").join([res.page_content for res in results]))

name: Spirited Away
description: A young girl enters a spirit world bathhouse to rescue her transformed parents, encountering mystical creatures and personal growth. This Studio Ghibli masterpiece won the Academy Award for Best Animated Feature.
country: Japan
year: 2001

name: Schindler's List
description: A German industrialist saves Jewish lives during the Holocaust by employing them in his factories. The black-and-white historical drama won seven Academy Awards including Best Picture.
country: United States
year: 1993

name: Parasite
description: A poor Korean family strategically infiltrates a wealthy household, exposing class divisions through dark comedy and thriller elements. The film became the first non-English language Best Picture Oscar winner.
country: South Korea
year: 2019
