In [1]:
# Setup - project config and imports
# Location passed to create_bucket when the data bucket has to be created.
REGION = "us-central1"
# GCP project that the storage client is bound to.
PROJECT_ID = "avalon-conversational-agent"
# The data bucket name is derived from the project id.
BUCKET_NAME = f"{PROJECT_ID}-data"

from google.cloud import storage
import json
import os

In [2]:
# Create GCS bucket for project data (get-or-create, safe to re-run)
storage_client = storage.Client(project=PROJECT_ID)

# lookup_bucket returns None only when the bucket does not exist. Any other
# failure (permission denied, bad credentials, network error) propagates
# instead of being mistaken for "bucket missing" and triggering a create.
bucket = storage_client.lookup_bucket(BUCKET_NAME)
if bucket is None:
    bucket = storage_client.create_bucket(BUCKET_NAME, location=REGION)
    print(f"Created bucket {BUCKET_NAME}")
else:
    print(f"Bucket {BUCKET_NAME} already exists")

Bucket avalon-conversational-agent-data already exists


In [3]:
# Verify local files exist
LOCAL_DATA_DIR = "data"
expected_files = [
    "office_info.json",
    "providers.json",
    "services.json",
    "conversations.json",
    "faqs.json",
]

# Print one status line per expected file; missing files are reported
# here but do not stop the cell.
for filename in expected_files:
    is_present = os.path.exists(os.path.join(LOCAL_DATA_DIR, filename))
    status_line = f"✓ {filename}" if is_present else f"✗ {filename} - MISSING"
    print(status_line)

✓ office_info.json
✓ providers.json
✓ services.json
✓ conversations.json
✓ faqs.json


In [4]:
# Upload files to GCS (RAG data to raw/, training data to training/)
def upload_to_gcs(local_path, gcs_path):
    """Upload one local file into the notebook's `bucket` and log the result.

    Args:
        local_path: Path of the file on local disk.
        gcs_path: Object name (key) to write inside the bucket.
    """
    # Relies on the module-level `bucket` created in the bucket-setup cell.
    bucket.blob(gcs_path).upload_from_filename(local_path)
    print(f"Uploaded {local_path} -> gs://{BUCKET_NAME}/{gcs_path}")

# conversations.json is the training set and goes under training/;
# every other file is uploaded under raw/.
for filename in expected_files:
    gcs_path = f"training/{filename}" if filename == "conversations.json" else f"raw/{filename}"
    upload_to_gcs(os.path.join(LOCAL_DATA_DIR, filename), gcs_path)

Uploaded data/office_info.json -> gs://avalon-conversational-agent-data/raw/office_info.json
Uploaded data/providers.json -> gs://avalon-conversational-agent-data/raw/providers.json
Uploaded data/services.json -> gs://avalon-conversational-agent-data/raw/services.json
Uploaded data/conversations.json -> gs://avalon-conversational-agent-data/training/conversations.json
Uploaded data/faqs.json -> gs://avalon-conversational-agent-data/raw/faqs.json


In [5]:
# Verify files in bucket
print("Files in bucket:")
# List every object currently in the bucket as a full gs:// URI.
for remote_blob in bucket.list_blobs():
    print(f"  gs://{BUCKET_NAME}/{remote_blob.name}")

Files in bucket:
  gs://avalon-conversational-agent-data/raw/faqs.json
  gs://avalon-conversational-agent-data/raw/office_info.json
  gs://avalon-conversational-agent-data/raw/providers.json
  gs://avalon-conversational-agent-data/raw/services.json
  gs://avalon-conversational-agent-data/training/conversations.json


In [6]:
# Data summary
def _load_json(filename):
    """Parse one JSON file from LOCAL_DATA_DIR and return the decoded object.

    The file is opened explicitly as UTF-8 so parsing does not depend on the
    platform's default locale encoding.
    """
    with open(os.path.join(LOCAL_DATA_DIR, filename), encoding="utf-8") as fh:
        return json.load(fh)


office_info = _load_json("office_info.json")
services_data = _load_json("services.json")
conversations = _load_json("conversations.json")
faqs = _load_json("faqs.json")

# Each file wraps its records in a single top-level key; report the counts.
print(f"Locations: {len(office_info['locations'])}")
print(f"Services: {len(services_data['services'])}")
print(f"Conversations (for LoRA): {len(conversations['conversations'])}")
print(f"FAQs: {len(faqs['faqs'])}")

Locations: 2
Services: 19
Conversations (for LoRA): 30
FAQs: 18


In [7]:
# Sample conversation for LoRA training
sample = conversations["conversations"][0]
print(f"Category: {sample['category']}\n")

# Any role other than "patient" is labelled as the Avalon side.
for turn in sample["messages"]:
    speaker = "Avalon" if turn["role"] != "patient" else "Patient"
    print(f"{speaker}: {turn['content']}\n")

Category: appointment_scheduling

Patient: Hi, I'd like to schedule a cleaning appointment

Avalon: Hi! I'd be happy to help you schedule a cleaning. Do you have a preference for our Christiana or Newport location?

Patient: Christiana works better for me

Avalon: Great! We have openings this week on Thursday at 9:30 AM or 2:00 PM. We also have Monday at 9:00 AM or 10:00 AM. Which works best for your schedule?

Patient: Thursday at 2 works

Avalon: Perfect! I've got you down for Thursday at 2:00 PM at our Christiana location for a cleaning with our hygienist. We're located at 430 Christiana Medical Center, across from the Christiana Mall. Please remember we require 2 days notice for any cancellations. See you Thursday!

