# Udaplay Project

## Part 01 - Offline RAG

In this part of the project, you'll build your VectorDB using Chroma.

The data is inside folder `project/starter/games`. Each file will become a document in the collection you'll create.
Example.:
```json
{
  "Name": "Gran Turismo",
  "Platform": "PlayStation 1",
  "Genre": "Racing",
  "Publisher": "Sony Computer Entertainment",
  "Description": "A realistic racing simulator featuring a wide array of cars and tracks, setting a new standard for the genre.",
  "YearOfRelease": 1997
}
```


### Setup

In [89]:
import os
import json
import chromadb
from chromadb.utils import embedding_functions
from dotenv import load_dotenv
from pydantic import BaseModel,Field
from typing import Annotated,List,Dict
import pandas as pd

from rich.console import Console
from rich.markdown import Markdown
console=Console()

In [None]:
# Load environment variables
load_dotenv()

True

### VectorDB Instance

In [None]:
# Instantiate your ChromaDB Client
chroma_client = chromadb.PersistentClient(path="chromadb")

### Collection

In [None]:
# embedding function
embedding_fn = embedding_functions.OpenAIEmbeddingFunction(
    api_key=os.getenv("OPENAI_API_KEY"),
    )

In [None]:
# Create a collection
if "udaplay" not in [c.name for c in chroma_client.list_collections()]:
    collection = chroma_client.create_collection(
        name="udaplay",
        embedding_function=embedding_fn
        )
else:
    collection = chroma_client.get_collection("udaplay")

### Add documents

In [8]:
# Make sure you have a directory "project/starter/games"
data_dir = "games"

for file_name in sorted(os.listdir(data_dir)):
    if not file_name.endswith(".json"):
        continue

    file_path = os.path.join(data_dir, file_name)
    with open(file_path, "r", encoding="utf-8") as f:
        game = json.load(f)

    # You can change what text you want to index
    content = f"[{game['Platform']}] {game['Name']} ({game['YearOfRelease']}) - {game['Description']}"

    # Use file name (like 001) as ID
    doc_id = os.path.splitext(file_name)[0]

    collection.add(
        ids=[doc_id],
        documents=[content],
        metadatas=[game]
    )

### Retrieve documents

In [24]:
# Access each list item as model objects, ie result.Name, result.Description, ect...
all_records = collection.get()
print(all_records.keys())

dict_keys(['ids', 'embeddings', 'documents', 'uris', 'included', 'data', 'metadatas'])


In [25]:
class retrievedDocument(BaseModel):
    """Pydantic data model for retrieved documents from the vectorstore"""
    Platform: Annotated[str, Field(description="Game platform (PC, Xbox, Wii...)")]
    Name: Annotated[str, Field(description="Title of the game")]
    YearOfRelease: Annotated[int, Field(description="Year of release")]
    Description: Annotated[str, Field(description="One-liner game description")]

In [26]:
metadatas = all_records['metadatas']
platforms=[];names=[];years=[];descriptions=[]
# Use pydantic data model
# Now you can access them as model objects, ie result.Name, result.Description, ect...
documents = [retrievedDocument(**metadata) for metadata in metadatas]

In [27]:
console.print(f"{len(documents)} retrieved from the database. Examples: ", style='bold yellow')
documents[:3]

[retrievedDocument(Platform='PlayStation 1', Name='Gran Turismo', YearOfRelease=1997, Description='A realistic racing simulator featuring a wide array of cars and tracks, setting a new standard for the genre.'),
 retrievedDocument(Platform='PlayStation 2', Name='Grand Theft Auto: San Andreas', YearOfRelease=2004, Description="An expansive open-world game set in the fictional state of San Andreas, following the story of Carl 'CJ' Johnson."),
 retrievedDocument(Platform='PlayStation 3', Name='Gran Turismo 5', YearOfRelease=2010, Description='A comprehensive racing simulator featuring a vast selection of vehicles and tracks, with realistic driving physics.')]

In [82]:
def print_record(collectionName:str, id:int):
    """
    retrieves a item from a chromadb collection
    inputs:
        - collection name (str)
        - id (int)
    outputs:
        - record id (dict) 
    """

    chroma_client = chromadb.PersistentClient(path="chromadb")

    if collectionName in [c.name for c in chroma_client.list_collections()]:
        collection = chroma_client.get_collection(collectionName)
    else:
        console.print(f"No collection '{collectionName}' found", style='bold red')
        return None
    
    all_records = collection.get()
    metadatas = all_records['metadatas']

    if f'{id:03}' in all_records['ids']:
        return metadatas[id]
    else:
        console.print(f"Item '{id}' does not exist", style='bold red')
        return None

In [81]:
print_record('udaplay', 1)

{'Description': "An expansive open-world game set in the fictional state of San Andreas, following the story of Carl 'CJ' Johnson.",
 'Genre': 'Action-adventure',
 'Publisher': 'Rockstar Games',
 'Platform': 'PlayStation 2',
 'YearOfRelease': 2004,
 'Name': 'Grand Theft Auto: San Andreas'}

# DB overview

In [90]:
def retrieve_all_games()->List[retrievedDocument]:
    """
    Retrieves all records from the database
    Output:
    You'll receive all records as list. Each element contains:
    - Platform: like Game Boy, Playstation 5, Xbox 360...)
    - Name: Name of the Game
    - YearOfRelease: Year when that game was released for that platform
    - Description: Additional details about the game
    """
    chroma_client = chromadb.PersistentClient(path="chromadb")
    collection = chroma_client.get_collection("udaplay")
    # Retrieve all records
    all_records = collection.get()

    if all_records:
        metadatas = all_records['metadatas']
        platforms=[];names=[];years=[];descriptions=[]
        # Use pydantic data model
        # Now you can access them as model objects, ie result.Name, result.Description, ect...
        documents = [retrievedDocument(**metadata) for metadata in metadatas]
        
    return documents

In [93]:
temp={}
all_records = retrieve_all_games()
for i in range(len(all_records)):
    temp[i] = pd.DataFrame(json.loads(all_records[i].model_dump_json()), index=[i])
records_df = pd.concat(temp,ignore_index=True)
records_df

Unnamed: 0,Platform,Name,YearOfRelease,Description
0,PlayStation 1,Gran Turismo,1997,A realistic racing simulator featuring a wide ...
1,PlayStation 2,Grand Theft Auto: San Andreas,2004,An expansive open-world game set in the fictio...
2,PlayStation 3,Gran Turismo 5,2010,A comprehensive racing simulator featuring a v...
3,PlayStation 4,Marvel's Spider-Man,2018,An open-world superhero game that lets players...
4,PlayStation 5,Marvel's Spider-Man 2,2023,"The sequel to the acclaimed Spider-Man game, f..."
5,Game Boy Color,Pokémon Gold and Silver,1999,Second-generation Pokémon games introducing ne...
6,Game Boy Advance,Pokémon Ruby and Sapphire,2002,Third-generation Pokémon games set in the Hoen...
7,Super Nintendo Entertainment System (SNES),Super Mario World,1990,A classic platformer where Mario embarks on a ...
8,Nintendo 64,Super Mario 64,1996,A groundbreaking 3D platformer that set new st...
9,GameCube,Super Smash Bros. Melee,2001,A crossover fighting game featuring characters...


In [94]:
table = pd.pivot_table(
    records_df, 
    index=['Platform'], 
    columns=['YearOfRelease'], 
    values=['Name'],
    aggfunc='count',
    fill_value=0,
    margins=True, 
    margins_name="Total" 
    )

table.replace(0,'')

Unnamed: 0_level_0,Name,Name,Name,Name,Name,Name,Name,Name,Name,Name,Name,Name,Name,Name,Name,Name,Name,Name
YearOfRelease,1990,1996,1997,1999,2001,2002,2004,2006,2010,2014,2017,2018,2021,2022,2023,2024,2025,Total
Platform,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
Game Boy Advance,,,,,,1.0,,,,,,,,,,,,1
Game Boy Color,,,,1.0,,,,,,,,,,,,,,1
GameCube,,,,,1.0,,,,,,,,,,,,,1
Nintendo 64,,1.0,,,,,,,,,,,,,,,,1
Nintendo Switch,,,,,,,,,,,1.0,,,,2.0,,,3
PC,,,,,,,,,,,,,,,1.0,,4.0,5
PlayStation 1,,,1.0,,,,,,,,,,,,,,,1
PlayStation 2,,,,,,,1.0,,,,,,,,,,,1
PlayStation 3,,,,,,,,,1.0,,,,,,,,,1
PlayStation 4,,,,,,,,,,,,1.0,,1.0,1.0,,,3
