# [SOLUTION] Udaplay Project

## Part 01 - Offline RAG

In this part of the project, you'll build your VectorDB using Chroma.

The data is inside the `games` folder. Each file will become a document in the collection you'll create.
Example:
```json
{
  "Name": "Gran Turismo",
  "Platform": "PlayStation 1",
  "Genre": "Racing",
  "Publisher": "Sony Computer Entertainment",
  "Description": "A realistic racing simulator featuring a wide array of cars and tracks, setting a new standard for the genre.",
  "YearOfRelease": 1997
}
```

### Setup

In [None]:
# Only needed for Udacity workspace
# Swaps the standard-library `sqlite3` module for `pysqlite3` when the latter
# is installed. The swap only affects `import sqlite3` statements executed
# after this cell, so it has to run before the cells that import other libraries.
# NOTE(review): presumably this works around a system SQLite that is too old
# for Chroma — confirm against the workspace image.

import importlib.util
import sys

# Check if 'pysqlite3' is available before importing
if importlib.util.find_spec("pysqlite3") is not None:
    import pysqlite3
    # Re-register the freshly imported module under the name 'sqlite3' (and
    # drop its own 'pysqlite3' entry) so later imports resolve to it.
    sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

In [None]:
import os
import json
import chromadb
from chromadb.utils import embedding_functions
from dotenv import load_dotenv
from typing import Dict, List, Any, Optional

In [None]:
# Pull the variables defined in a local .env file into the process environment
load_dotenv()

# Read both API keys from the environment (None when a key is not set)
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
TAVILY_API_KEY = os.environ.get("TAVILY_API_KEY")

# Only the OpenAI key is validated here: an unset or empty value aborts the cell
if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
    raise ValueError("Please set OPENAI_API_KEY in .env file")

print("✅ Environment variables loaded successfully")

### VectorDB Instance

In [None]:
# Instantiate ChromaDB Client - Simplest approach
import chromadb

# Create client without persistence (in-memory only)
# NOTE(review): with no persistence the collection contents presumably do not
# survive a kernel restart, so the ingestion cells must be re-run — confirm.
chroma_client = chromadb.Client()
print("✅ ChromaDB client initialized (in-memory mode)")

### Collection

In [None]:
# Skip embedding function setup - using default
# This cell configures nothing; it only reports that no custom embedding
# function is set up in this notebook.
print("✅ Using default SentenceTransformer embeddings")

In [None]:
# Create collection
COLLECTION_NAME = "udaplay_games"

# Fetch the collection when it already exists, otherwise create it (using the
# default embeddings). A plain create_collection() raises when this cell is
# re-run in the same kernel, because the in-memory client still holds the
# collection created by the previous run.
collection = chroma_client.get_or_create_collection(name=COLLECTION_NAME)
print(f"✅ Collection '{COLLECTION_NAME}' is ready")

### Add documents

In [None]:
# Load and add game documents
data_dir = "games"
documents_added = 0
batch_size = 5  # Process in batches to avoid issues

# Process each game file (sorted so the ingestion order is deterministic)
game_files = sorted([f for f in os.listdir(data_dir) if f.endswith(".json")])
print(f"Found {len(game_files)} game files to process")


def _flush_batch(ids, documents, metadatas):
    """Add one batch to the collection and return how many documents it held.

    A failing batch is reported and counted as zero instead of raising, so one
    bad batch cannot poison the batches that follow it.

    Args:
        ids: Document ids of the batch.
        documents: Text content to index, aligned with ``ids``.
        metadatas: Metadata dicts, aligned with ``ids``.

    Returns:
        ``len(ids)`` when the add succeeded, ``0`` for an empty or failed batch.
    """
    if not ids:
        return 0
    try:
        collection.add(ids=ids, documents=documents, metadatas=metadatas)
    except Exception as e:  # best-effort boundary: keep ingesting other batches
        print(f"Error adding batch {ids[0]}..{ids[-1]}: {e}")
        return 0
    return len(ids)


# Prepare batch data
ids_batch = []
documents_batch = []
metadatas_batch = []

for file_name in game_files:
    file_path = os.path.join(data_dir, file_name)

    # Only file loading and content building are guarded per file, so an error
    # message always names the file that actually caused it.
    try:
        # Load game data
        with open(file_path, "r", encoding="utf-8") as f:
            game = json.load(f)

        # Create content for indexing
        content = (
            f"{game['Name']} is a {game.get('Genre', 'game')} game "
            f"released in {game['YearOfRelease']} for {game['Platform']}. "
            f"Published by {game.get('Publisher', 'Unknown')}. "
            f"{game['Description']}"
        )
    except (OSError, ValueError, KeyError, TypeError) as e:
        # OSError: unreadable file; ValueError: invalid JSON or encoding;
        # KeyError/TypeError: required field missing or JSON root not an object.
        print(f"Error processing {file_name}: {e}")
        continue

    # Add to batch; the file name without its extension is the document id
    doc_id = os.path.splitext(file_name)[0]
    ids_batch.append(doc_id)
    documents_batch.append(content)
    metadatas_batch.append(game)

    # Process batch when full
    if len(ids_batch) >= batch_size:
        documents_added += _flush_batch(ids_batch, documents_batch, metadatas_batch)
        print(f"  Processed {documents_added} documents...")

        # Always start a fresh batch, even when the flush failed, so a bad
        # batch is not retried together with every following file.
        ids_batch = []
        documents_batch = []
        metadatas_batch = []

# Process remaining documents (a no-op when the last batch was exactly full)
documents_added += _flush_batch(ids_batch, documents_batch, metadatas_batch)

print(f"✅ Successfully added {documents_added} documents to collection")

### Demonstrate Semantic Search

In [None]:
def semantic_search(query: str, n_results: int = 5) -> List[Dict[str, Any]]:
    """Perform semantic search on the game collection.

    Queries the module-level ``collection`` with a single query text and
    flattens the first (and only) result set into one dict per hit.

    Args:
        query: Search query
        n_results: Number of results to return

    Returns:
        List of dicts with the keys ``id``, ``content``, ``metadata`` and
        ``distance`` (``None`` when the query result carries no distances).
        Any failure is printed and reported as an empty list; the function
        does not raise.
    """
    try:
        results = collection.query(
            query_texts=[query],
            n_results=n_results
        )

        # Every field is a list with one entry per query text; only one query
        # text is sent, so index 0 is the result set of interest.
        ids = results['ids'][0]
        documents = results['documents'][0]
        metadatas = results['metadatas'][0]

        # The 'distances' key can be present with a None value, so a plain
        # membership test is not enough to decide whether it can be indexed.
        distances = results.get('distances')
        distances = distances[0] if distances else [None] * len(ids)

        return [
            {
                'id': doc_id,
                'content': document,
                'metadata': metadata,
                'distance': distance,
            }
            for doc_id, document, metadata, distance
            in zip(ids, documents, metadatas, distances)
        ]

    except Exception as e:
        # Best-effort boundary: callers rely on getting a list back.
        print(f"Search failed: {e}")
        return []

In [None]:
# Demonstrate semantic search capabilities
test_queries = [
    "racing games",
    "games released in 2020",
    "Nintendo games",
    "RPG adventures"
]

print("🔍 Demonstrating Semantic Search Capabilities\n")
print("=" * 60)

for query in test_queries:
    print(f"\n📝 Query: '{query}'")
    print("-" * 40)

    results = semantic_search(query, n_results=3)

    if results:
        for i, result in enumerate(results, 1):
            # Tolerate hits that carry no metadata at all
            metadata = result['metadata'] or {}
            print(f"\n  {i}. {metadata.get('Name', 'Unknown')} ({metadata.get('YearOfRelease', 'N/A')})")
            print(f"     Platform: {metadata.get('Platform', 'Unknown')}")
            print(f"     Genre: {metadata.get('Genre', 'Unknown')}")
            # Compare with None explicitly: a distance of 0.0 (exact match) is
            # falsy and would otherwise hide the score of the best possible hit.
            # NOTE(review): `1 - distance` is only a bounded similarity for
            # some distance metrics and may go negative for others — confirm
            # which metric the collection uses.
            if result['distance'] is not None:
                print(f"     Relevance Score: {1 - result['distance']:.2%}")
    else:
        print("  No results found")

print("\n" + "=" * 60)
print("✅ Vector database is ready for the agent!")

### Production Features

In [None]:
# Collection management utilities
def get_collection_stats() -> Dict[str, Any]:
    """Get detailed statistics about the collection.

    Reads the metadata of every record in the module-level ``collection`` and
    counts how often each platform, release year and genre occurs. Records
    without metadata, or without a given field, are counted as ``'Unknown'``.

    Returns:
        Dict with the keys ``total_documents`` (int), ``platforms`` and
        ``genres`` (name -> count, most frequent first) and ``years``
        (year as string -> count, sorted by key). Any failure is printed and
        reported as an empty dict; the function does not raise.
    """
    # Local import keeps the cell self-contained without touching the imports cell
    from collections import Counter

    try:
        # Only metadata is analysed, so skip fetching the document texts
        all_docs = collection.get(include=["metadatas"])

        platforms = Counter()
        years = Counter()
        genres = Counter()

        for metadata in all_docs['metadatas'] or []:
            # A record stored without metadata may come back as None
            metadata = metadata or {}
            platforms[metadata.get('Platform', 'Unknown')] += 1
            # Years are keyed as strings so 'Unknown' and real years can coexist
            years[str(metadata.get('YearOfRelease', 'Unknown'))] += 1
            genres[metadata.get('Genre', 'Unknown')] += 1

        return {
            'total_documents': collection.count(),
            # most_common() sorts by count descending, ties in first-seen order
            'platforms': dict(platforms.most_common()),
            'years': dict(sorted(years.items())),
            'genres': dict(genres.most_common())
        }

    except Exception as e:
        # Best-effort boundary: callers test the result for truthiness.
        print(f"Failed to get stats: {e}")
        return {}

# Display collection statistics
stats = get_collection_stats()
if stats:
    print("\n📈 Collection Statistics:")
    print(f"  Total Games: {stats['total_documents']}")
    print(f"  Unique Platforms: {len(stats['platforms'])}")

    # Year keys are strings and may include 'Unknown'; compare real years
    # numerically and cope with a collection that has none (min()/max() on an
    # empty sequence would raise ValueError).
    known_years = sorted(int(year) for year in stats['years'] if str(year).isdigit())
    if known_years:
        print(f"  Year Range: {known_years[0]} - {known_years[-1]}")
    else:
        print("  Year Range: N/A")

    # Show at most five genres and add the ellipsis only when some were cut
    genre_names = list(stats['genres'])
    ellipsis = "..." if len(genre_names) > 5 else ""
    print(f"  Genres: {', '.join(genre_names[:5])}{ellipsis}")