In [2]:
# 05-gemini-finetuning.ipynb
# Openbook ML Demo - Gemini Fine-tuning for Treatment Plan Generation

"""
After predicting copays, we generate patient-friendly treatment plans.

Input: Patient name, procedures with predicted copays, insurance summary
Output: Professional treatment plan letter explaining costs, urgency, payment options

Fine-tuning approach:
1. Create synthetic training examples (input/output pairs)
2. Fine-tune Gemini via Vertex AI
3. Deploy tuned model for inference
"""

import vertexai
from vertexai.generative_models import GenerativeModel
from vertexai.tuning import sft
from google.cloud import storage
import pandas as pd
import json
import time

# Configuration
PROJECT_ID = "openbook-ml-demo"
REGION = "us-central1"
BUCKET_NAME = "openbook-data-lake"

vertexai.init(project=PROJECT_ID, location=REGION)

client = storage.Client(project=PROJECT_ID)
bucket = client.bucket(BUCKET_NAME)

print(f"Project: {PROJECT_ID}")
print(f"Region: {REGION}")
print("✓ Vertex AI initialized")

  from google.cloud.aiplatform.utils import gcs_utils


Project: openbook-ml-demo
Region: us-central1
✓ Vertex AI initialized


In [3]:
# Generate synthetic training data for treatment plan generation
# Input: Patient info + procedures + copays
# Output: Professional treatment plan letter

import random

# Sample patient names
first_names = ["John", "Sarah", "Michael", "Emily", "David", "Jennifer", "Robert", "Lisa", "William", "Maria"]
last_names = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", "Rodriguez", "Martinez"]

# Procedure info
procedures_info = {
    "D0120": {"name": "Periodic Evaluation", "urgency": "routine", "category": "preventive"},
    "D0274": {"name": "Bitewings (4 films)", "urgency": "routine", "category": "preventive"},
    "D1110": {"name": "Adult Cleaning", "urgency": "routine", "category": "preventive"},
    "D2391": {"name": "Filling (1 surface)", "urgency": "moderate", "category": "basic"},
    "D2392": {"name": "Filling (2 surfaces)", "urgency": "moderate", "category": "basic"},
    "D2750": {"name": "Crown", "urgency": "high", "category": "major"},
    "D2950": {"name": "Core Buildup", "urgency": "high", "category": "major"},
    "D3310": {"name": "Root Canal", "urgency": "urgent", "category": "major"},
    "D7140": {"name": "Extraction", "urgency": "urgent", "category": "basic"},
    "D5110": {"name": "Complete Denture", "urgency": "moderate", "category": "major"},
}

# Insurance carriers
carriers = ["Delta Dental", "Cigna", "Aetna", "MetLife", "Guardian"]
plan_types = ["PPO", "DHMO", "Indemnity"]

def generate_training_example():
    """Generate one input/output training pair"""
    
    # Random patient
    patient_name = f"{random.choice(first_names)} {random.choice(last_names)}"
    
    # Random insurance
    carrier = random.choice(carriers)
    plan_type = random.choice(plan_types)
    remaining_max = random.randint(200, 2000)
    
    # Random procedures (1-4)
    num_procs = random.randint(1, 4)
    selected_codes = random.sample(list(procedures_info.keys()), num_procs)
    
    procedures = []
    total_cost = 0
    total_copay = 0
    total_insurance = 0
    
    for code in selected_codes:
        info = procedures_info[code]
        cost = random.randint(50, 1500)
        copay = random.randint(0, int(cost * 0.7))
        insurance_pays = cost - copay
        
        procedures.append({
            "code": code,
            "name": info["name"],
            "urgency": info["urgency"],
            "cost": cost,
            "copay": copay,
            "insurance_pays": insurance_pays
        })
        
        total_cost += cost
        total_copay += copay
        total_insurance += insurance_pays
    
    # Create input
    input_text = f"""Patient: {patient_name}
Insurance: {carrier} {plan_type}
Remaining Annual Maximum: ${remaining_max}

Recommended Procedures:
"""
    for p in procedures:
        input_text += f"- {p['code']} {p['name']}: ${p['cost']} (You pay: ${p['copay']}, Insurance pays: ${p['insurance_pays']})\n"
    
    input_text += f"""
Total Cost: ${total_cost}
Your Estimated Cost: ${total_copay}
Insurance Pays: ${total_insurance}"""

    # Create output (treatment plan letter)
    urgency_text = ""
    urgent_procs = [p for p in procedures if p["urgency"] in ["urgent", "high"]]
    if urgent_procs:
        urgency_text = f"We recommend scheduling {urgent_procs[0]['name'].lower()} as soon as possible to prevent further complications."
    
    output_text = f"""Dear {patient_name},

Thank you for visiting our office. Based on your recent examination, Dr. Smith has recommended the following treatment plan.

TREATMENT SUMMARY:
"""
    for p in procedures:
        output_text += f"• {p['name']} ({p['code']}): ${p['cost']}\n"
        output_text += f"  Your estimated cost: ${p['copay']}\n"
    
    output_text += f"""
INSURANCE COVERAGE:
Your {carrier} {plan_type} plan will cover approximately ${total_insurance} of your treatment. Your remaining annual maximum is ${remaining_max}.

YOUR ESTIMATED TOTAL: ${total_copay}

{urgency_text}

PAYMENT OPTIONS:
- Pay in full at time of service (5% courtesy discount)
- Split into 2-3 monthly payments
- CareCredit financing available

Please call our office at (555) 123-4567 to schedule your appointment. We're happy to answer any questions about your treatment plan or payment options.

Sincerely,
Avalon Dental Team"""

    return {"input": input_text, "output": output_text}

# Generate training examples
training_data = [generate_training_example() for _ in range(100)]

print(f"✓ Generated {len(training_data)} training examples")
print("\n" + "=" * 60)
print("EXAMPLE INPUT:")
print("=" * 60)
print(training_data[0]["input"])
print("\n" + "=" * 60)
print("EXAMPLE OUTPUT:")
print("=" * 60)
print(training_data[0]["output"])

✓ Generated 100 training examples

EXAMPLE INPUT:
Patient: David Martinez
Insurance: Aetna PPO
Remaining Annual Maximum: $1587

Recommended Procedures:
- D5110 Complete Denture: $1478 (You pay: $504, Insurance pays: $974)
- D7140 Extraction: $1127 (You pay: $598, Insurance pays: $529)
- D2750 Crown: $1368 (You pay: $912, Insurance pays: $456)
- D2950 Core Buildup: $180 (You pay: $53, Insurance pays: $127)

Total Cost: $4153
Your Estimated Cost: $2067
Insurance Pays: $2086

EXAMPLE OUTPUT:
Dear David Martinez,

Thank you for visiting our office. Based on your recent examination, Dr. Smith has recommended the following treatment plan.

TREATMENT SUMMARY:
• Complete Denture (D5110): $1478
  Your estimated cost: $504
• Extraction (D7140): $1127
  Your estimated cost: $598
• Crown (D2750): $1368
  Your estimated cost: $912
• Core Buildup (D2950): $180
  Your estimated cost: $53

INSURANCE COVERAGE:
Your Aetna PPO plan will cover approximately $2086 of your treatment. Your remaining annual m

In [4]:
# Fix dataset format for Gemini 2.0
# Use "contents" format instead of "messages"

def format_for_gemini_2(example):
    return {
        "contents": [
            {"role": "user", "parts": [{"text": example["input"]}]},
            {"role": "model", "parts": [{"text": example["output"]}]}
        ]
    }

# Regenerate training data
training_data = [generate_training_example() for _ in range(100)]

# Convert to correct format
sft_data = [format_for_gemini_2(ex) for ex in training_data]

train_sft = sft_data[:80]
val_sft = sft_data[80:]

# Save as JSONL
with open('treatment_plan_train.jsonl', 'w') as f:
    for item in train_sft:
        f.write(json.dumps(item) + '\n')

with open('treatment_plan_val.jsonl', 'w') as f:
    for item in val_sft:
        f.write(json.dumps(item) + '\n')

# Upload to GCS
blob = bucket.blob('finetuning/treatment_plan_train.jsonl')
blob.upload_from_filename('treatment_plan_train.jsonl')

blob = bucket.blob('finetuning/treatment_plan_val.jsonl')
blob.upload_from_filename('treatment_plan_val.jsonl')

print(f"✓ Reformatted data uploaded")
print(f"  Training: {len(train_sft)} examples")
print(f"  Validation: {len(val_sft)} examples")

✓ Reformatted data uploaded
  Training: 80 examples
  Validation: 20 examples


In [12]:
# Start fine-tuning with correct format
from vertexai.tuning import sft

sft_tuning_job = sft.train(
    source_model="gemini-2.0-flash-001",
    train_dataset=f"gs://{BUCKET_NAME}/finetuning/treatment_plan_train.jsonl",
    validation_dataset=f"gs://{BUCKET_NAME}/finetuning/treatment_plan_val.jsonl",
    tuned_model_display_name="openbook-treatment-plan-v2",
    epochs=3,
)

print(f"✓ Fine-tuning job started")
print(f"Job ID: {sft_tuning_job.name}")
print(f"\nMonitor at:")
print(f"https://console.cloud.google.com/vertex-ai/generative/language/locations/{REGION}/tuning?project={PROJECT_ID}")

Creating SupervisedTuningJob
SupervisedTuningJob created. Resource name: projects/350248978874/locations/us-central1/tuningJobs/6093125431156801536
To use this SupervisedTuningJob in another session:
tuning_job = sft.SupervisedTuningJob('projects/350248978874/locations/us-central1/tuningJobs/6093125431156801536')
View Tuning Job:
https://console.cloud.google.com/vertex-ai/generative/language/locations/us-central1/tuning/tuningJob/6093125431156801536?project=350248978874


✓ Fine-tuning job started
Job ID: 6093125431156801536

Monitor at:
https://console.cloud.google.com/vertex-ai/generative/language/locations/us-central1/tuning?project=openbook-ml-demo


In [5]:
# Check fine-tuning status
job = list(sft.SupervisedTuningJob.list())[0]
print(f"State: {job._gca_resource.state}")

State: 4


In [6]:
job = list(sft.SupervisedTuningJob.list())[0]
print(job._gca_resource)

name: "projects/350248978874/locations/us-central1/tuningJobs/6093125431156801536"
tuned_model_display_name: "openbook-treatment-plan-v2"
base_model: "gemini-2.0-flash-001"
supervised_tuning_spec {
  training_dataset_uri: "gs://openbook-data-lake/finetuning/treatment_plan_train.jsonl"
  validation_dataset_uri: "gs://openbook-data-lake/finetuning/treatment_plan_val.jsonl"
  hyper_parameters {
    epoch_count: 3
    learning_rate_multiplier: 5
    adapter_size: ADAPTER_SIZE_FOUR
  }
}
state: JOB_STATE_SUCCEEDED
create_time {
  seconds: 1765819731
  nanos: 907224000
}
start_time {
  seconds: 1765819731
  nanos: 950756000
}
end_time {
  seconds: 1765821324
  nanos: 945324000
}
update_time {
  seconds: 1765821324
  nanos: 945324000
}
experiment: "projects/350248978874/locations/us-central1/metadataStores/default/contexts/tuning-experiment-20251215093235948658"
tuned_model {
  model: "projects/350248978874/locations/us-central1/models/4848519723538710528@1"
  endpoint: "projects/350248978874

In [7]:
# Test the fine-tuned model
from vertexai.generative_models import GenerativeModel

# Get the tuned model endpoint
tuned_model = GenerativeModel(
    "projects/350248978874/locations/us-central1/endpoints/2273604228775673856"
)

# Test with a sample input
test_input = """Patient: Sarah Johnson
Insurance: Delta Dental PPO
Remaining Annual Maximum: $1200

Recommended Procedures:
- D2750 Crown: $1100 (You pay: $440, Insurance pays: $660)
- D3310 Root Canal: $950 (You pay: $380, Insurance pays: $570)

Total Cost: $2050
Your Estimated Cost: $820
Insurance Pays: $1230"""

response = tuned_model.generate_content(test_input)
print("GENERATED TREATMENT PLAN:")
print("=" * 50)
print(response.text)



GENERATED TREATMENT PLAN:
Dear Sarah Johnson,

Thank you for visiting our office. Based on your recent examination, Dr. Avalos has recommended the following treatment plan.

CROWN (D2750): $1100
- Your estimated cost: $440
INSURANCE COVERAGE: $660

ROOT CANAL (D3310): $950
- Your estimated cost: $380
INSURANCE COVERAGE: $570

TOTAL: $2050
YOUR ESTIMATED TOTAL: $820

We recommend scheduling crown as soon as possible to prevent further complications. Root canal should be treated within the next 2-3 months.

INSURANCE SUMMARY:
- Dear Delta Dental PPO member,
Thank you for visiting our office. Based on your recent examination, Dr. Avalos has recommended the following treatment plan.

COVERED ROOT CANAL (D3310): $950
Your estimated cost: $380
INSURANCE COVERAGE: $570

REMAINING ANNUAL MAXIMUM: $1200

Please call our office at (555) 123-4567 to schedule crown appointment. We're happy to split into 2-3 payments.

Sincerely,
Avalon Dental Team


In [8]:
# Save tuned model info
tuned_model_config = {
    "base_model": "gemini-2.0-flash-001",
    "tuned_model": "projects/350248978874/locations/us-central1/models/4848519723538710528@1",
    "endpoint": "projects/350248978874/locations/us-central1/endpoints/2273604228775673856",
    "training_examples": 80,
    "epochs": 3
}

with open('tuned_model_config.json', 'w') as f:
    json.dump(tuned_model_config, f, indent=2)

blob = bucket.blob('models/tuned_model_config.json')
blob.upload_from_filename('tuned_model_config.json')

print("✓ Tuned model config saved to GCS")
print(f"\nEndpoint: {tuned_model_config['endpoint']}")

print("\n" + "=" * 60)
print("NOTEBOOK 05 COMPLETE - GEMINI FINE-TUNING")
print("=" * 60)
print("\n✓ Generated 100 synthetic training examples")
print("✓ Fine-tuned Gemini 2.0 Flash on treatment plan generation")
print("✓ Model generates professional patient letters from copay predictions")
print("\nNext: TFX Pipelines (notebook 06)")

✓ Tuned model config saved to GCS

Endpoint: projects/350248978874/locations/us-central1/endpoints/2273604228775673856

NOTEBOOK 05 COMPLETE - GEMINI FINE-TUNING

✓ Generated 100 synthetic training examples
✓ Fine-tuned Gemini 2.0 Flash on treatment plan generation
✓ Model generates professional patient letters from copay predictions

Next: TFX Pipelines (notebook 06)
