In [4]:
# RAG Setup - embeddings and vector store
PROJECT_ID = "avalon-conversational-agent"
BUCKET_NAME = f"{PROJECT_ID}-data"

from google.cloud import storage
import json

In [5]:
# Install embedding model and vector store
!pip install sentence-transformers chromadb -q

In [6]:
# Load RAG data from GCS
storage_client = storage.Client(project=PROJECT_ID)
bucket = storage_client.bucket(BUCKET_NAME)

def load_json_from_gcs(blob_path):
    blob = bucket.blob(blob_path)
    return json.loads(blob.download_as_string())

office_info = load_json_from_gcs("raw/office_info.json")
providers = load_json_from_gcs("raw/providers.json")
services = load_json_from_gcs("raw/services.json")
faqs = load_json_from_gcs("raw/faqs.json")

print(f"Loaded: office_info, providers, services, faqs")

Loaded: office_info, providers, services, faqs


In [7]:
# Convert structured data into text chunks for RAG
chunks = []

# Office info
for loc in office_info["locations"]:
    hours_text = ", ".join([f"{day}: {time}" for day, time in loc["hours"].items()])
    chunks.append({
        "id": f"location_{loc['name'].lower().replace(' ', '_')}",
        "text": f"{loc['name']} is located at {loc['address']}, {loc['city']}, {loc['state']} {loc['zip']}. Phone: {loc['phone']}. Hours: {hours_text}",
        "category": "location"
    })

# Contact info
chunks.append({
    "id": "contact_info",
    "text": f"Contact Avalon Dental by phone at {office_info['contact']['main_phone']}, text at {office_info['contact']['text_number']} (preferred), or email at {office_info['contact']['email']}.",
    "category": "contact"
})

# Policies
chunks.append({
    "id": "cancellation_policy",
    "text": f"Avalon Dental requires {office_info['cancellation_policy']['notice_required_days']} days notice for cancellations.",
    "category": "policy"
})

# Savings plan
sp = office_info["savings_plan"]
chunks.append({
    "id": "savings_plan",
    "text": f"The {sp['name']} costs ${sp['enrollment_fee']} to enroll. Benefits: {'. '.join(sp['benefits'])}. {sp['terms']}",
    "category": "plan"
})

# Providers
for p in providers["providers"]:
    chunks.append({
        "id": p["id"],
        "text": f"{p['name']} ({p['credentials']}) specializes in {', '.join(p['specialties'])}. {p.get('bio', '')}",
        "category": "provider"
    })

# Services
for s in services["services"]:
    price = f"${s['price_range']['min']}-${s['price_range']['max']}"
    chunks.append({
        "id": s["id"],
        "text": f"{s['name']}: {s['description']}. Duration: {s['duration_minutes']} minutes. Cost: {price}. {s.get('notes', '')}",
        "category": "service"
    })

# FAQs
for i, faq in enumerate(faqs["faqs"]):
    chunks.append({
        "id": f"faq_{i}",
        "text": f"Q: {faq['question']} A: {faq['answer']}",
        "category": "faq"
    })

print(f"Created {len(chunks)} chunks")

Created 44 chunks


In [8]:
# Load embedding model
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
print(f"Loaded embedding model: all-MiniLM-L6-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loaded embedding model: all-MiniLM-L6-v2


In [9]:
# Create ChromaDB collection and store embeddings
import chromadb

chroma_client = chromadb.Client()
collection = chroma_client.create_collection(name="avalon_dental")

# Add all chunks to the collection
collection.add(
    ids=[c["id"] for c in chunks],
    documents=[c["text"] for c in chunks],
    metadatas=[{"category": c["category"]} for c in chunks]
)

print(f"Stored {collection.count()} chunks in ChromaDB")

/home/jupyter/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:01<00:00, 64.5MiB/s]


Stored 44 chunks in ChromaDB


In [10]:
# Test RAG retrieval
query = "What are your hours?"
results = collection.query(query_texts=[query], n_results=3)

print(f"Query: {query}\n")
print("Retrieved chunks:")
for i, doc in enumerate(results["documents"][0]):
    print(f"\n{i+1}. {doc[:200]}...")

Query: What are your hours?

Retrieved chunks:

1. Q: What are your hours? A: Christiana is open Monday-Thursday 7:30 AM to 6:30 PM. Newport is open Monday-Thursday 8:00 AM to 5:00 PM. Both locations are closed Friday-Sunday....

2. Q: What is your cancellation policy? A: We require 2 days (48 hours) notice for cancellations. This helps us offer the time slot to other patients and maintains your eligibility for the Rewards Progra...

3. Braces Consultation: Evaluation for braces or clear aligners. Duration: 45 minutes. Cost: $0-$0. Complimentary consultation...


In [11]:
# Test a different query
query = "How much do implants cost?"
results = collection.query(query_texts=[query], n_results=3)

print(f"Query: {query}\n")
print("Retrieved chunks:")
for i, doc in enumerate(results["documents"][0]):
    print(f"\n{i+1}. {doc[:200]}...")

Query: How much do implants cost?

Retrieved chunks:

1. Dental Implant Placement: Surgical placement of implant fixture. Duration: 90 minutes. Cost: $1500-$2500. Requires healing period of 3-6 months before crown can be placed....

2. Implant Crown: Crown restoration on implant. Duration: 60 minutes. Cost: $1000-$1500. ...

3. Full Dentures: Complete upper or lower denture. Duration: 60 minutes. Cost: $1000-$2000. Requires multiple appointments for impressions, try-in, and delivery....


In [12]:
# Save ChromaDB to disk for later use
import chromadb

# Recreate with persistent storage
chroma_client = chromadb.PersistentClient(path="./chroma_db")
collection = chroma_client.create_collection(name="avalon_dental")

collection.add(
    ids=[c["id"] for c in chunks],
    documents=[c["text"] for c in chunks],
    metadatas=[{"category": c["category"]} for c in chunks]
)

print(f"Saved {collection.count()} chunks to ./chroma_db")

Saved 44 chunks to ./chroma_db


In [13]:
# Upload ChromaDB folder to GCS
import os

def upload_folder_to_gcs(local_folder, gcs_prefix):
    for root, dirs, files in os.walk(local_folder):
        for file in files:
            local_path = os.path.join(root, file)
            gcs_path = os.path.join(gcs_prefix, os.path.relpath(local_path, local_folder))
            blob = bucket.blob(gcs_path)
            blob.upload_from_filename(local_path)
            print(f"Uploaded {gcs_path}")

upload_folder_to_gcs("./chroma_db", "chroma_db")
print("Done - ChromaDB saved to GCS")

Uploaded chroma_db/chroma.sqlite3
Uploaded chroma_db/da1e400a-70f6-4786-b323-188aae8b36fe/length.bin
Uploaded chroma_db/da1e400a-70f6-4786-b323-188aae8b36fe/link_lists.bin
Uploaded chroma_db/da1e400a-70f6-4786-b323-188aae8b36fe/data_level0.bin
Uploaded chroma_db/da1e400a-70f6-4786-b323-188aae8b36fe/header.bin
Done - ChromaDB saved to GCS
