In [1]:
# 03-feature-store-setup.ipynb
# Openbook ML Demo - Vertex AI Feature Store Setup

"""
Feature Store provides:
1. Centralized feature storage with entity keys
2. Online serving (low-latency for real-time inference)
3. Offline serving (batch for training)
4. Feature versioning and lineage
5. Training-serving consistency (no skew)

Flow:
- Dataflow created features → GCS
- This notebook → Feature Store ingestion
- Training reads from Feature Store (offline)
- Inference reads from Feature Store (online)
"""

from google.cloud import aiplatform
from google.cloud import storage
from google.cloud.aiplatform_v1 import FeaturestoreServiceClient
from google.cloud.aiplatform_v1 import FeatureOnlineStoreServiceClient
import pandas as pd
from io import StringIO

# Configuration
PROJECT_ID = "openbook-ml-demo"
REGION = "us-central1"
BUCKET_NAME = "openbook-data-lake"

# Initialize Vertex AI
aiplatform.init(project=PROJECT_ID, location=REGION)

# GCS client
storage_client = storage.Client(project=PROJECT_ID)
bucket = storage_client.bucket(BUCKET_NAME)

print(f"Project: {PROJECT_ID}")
print(f"Region: {REGION}")
print("\n✓ Vertex AI initialized")

  from google.cloud.aiplatform.utils import gcs_utils


Project: openbook-ml-demo
Region: us-central1

✓ Vertex AI initialized


In [2]:
# Create Feature Store (Online Store for low-latency serving)

from google.cloud.aiplatform_v1 import FeatureOnlineStoreAdminServiceClient
from google.cloud.aiplatform_v1.types import feature_online_store as feature_online_store_pb2
from google.cloud.aiplatform_v1.types import feature_online_store_admin_service as admin_pb2

# Create Feature Online Store
admin_client = FeatureOnlineStoreAdminServiceClient(
    client_options={"api_endpoint": f"{REGION}-aiplatform.googleapis.com"}
)

parent = f"projects/{PROJECT_ID}/locations/{REGION}"

# Define online store configuration
online_store_config = feature_online_store_pb2.FeatureOnlineStore(
    bigtable=feature_online_store_pb2.FeatureOnlineStore.Bigtable(
        auto_scaling=feature_online_store_pb2.FeatureOnlineStore.Bigtable.AutoScaling(
            min_node_count=1,
            max_node_count=1,  # Keep minimal for demo
            cpu_utilization_target=80,
        )
    )
)

# Create the store
try:
    operation = admin_client.create_feature_online_store(
        parent=parent,
        feature_online_store_id="openbook_feature_store",
        feature_online_store=online_store_config,
    )
    print("Creating Feature Online Store (this takes 5-10 minutes)...")
    result = operation.result()  # Wait for completion
    print(f"✓ Created: {result.name}")
except Exception as e:
    if "already exists" in str(e):
        print("✓ Feature Online Store already exists")
    else:
        raise e

Creating Feature Online Store (this takes 5-10 minutes)...
✓ Created: projects/350248978874/locations/us-central1/featureOnlineStores/openbook_feature_store


In [3]:
# Create Feature View (defines which features to serve)
# Feature Views connect GCS/BigQuery data to the online store

from google.cloud.aiplatform_v1.types import feature_view as feature_view_pb2

# First, load our features to understand schema
blob = bucket.blob('processed/features/train.csv')
content = blob.download_as_text()
train_df = pd.read_csv(StringIO(content))

print(f"Loaded {len(train_df)} training samples")
print(f"Columns: {train_df.columns.tolist()}")

# For Feature Store, we need to upload to BigQuery first (Feature Store reads from BQ)
# Let's create BigQuery dataset and table

from google.cloud import bigquery

bq_client = bigquery.Client(project=PROJECT_ID)

# Create dataset
dataset_id = f"{PROJECT_ID}.openbook_features"
dataset = bigquery.Dataset(dataset_id)
dataset.location = REGION

try:
    bq_client.create_dataset(dataset, exists_ok=True)
    print(f"✓ Created BigQuery dataset: {dataset_id}")
except Exception as e:
    print(f"Dataset exists or error: {e}")

# Upload features to BigQuery
table_id = f"{dataset_id}.claim_features"

# Add entity_id column (required for Feature Store)
train_df['entity_id'] = train_df['claim_id']
# Add feature_timestamp (required for Feature Store)
train_df['feature_timestamp'] = pd.Timestamp.now()

# Upload to BigQuery
job_config = bigquery.LoadJobConfig(write_disposition="WRITE_TRUNCATE")
job = bq_client.load_table_from_dataframe(train_df, table_id, job_config=job_config)
job.result()

print(f"✓ Uploaded {len(train_df)} rows to {table_id}")

Loaded 7000 training samples
Columns: ['claim_id', 'patient_id', 'procedure_code', 'procedure_cost', 'annual_maximum', 'remaining_maximum', 'deductible_remaining', 'coverage_ratio', 'months_enrolled', 'max_utilization', 'cost_to_max_ratio', 'deductible_applies', 'network_penalty', 'waiting_period_risk', 'is_preventive', 'is_basic', 'is_major', 'high_cost_procedure', 'is_ppo', 'is_dhmo', 'is_indemnity', 'carrier_delta_dental', 'carrier_cigna', 'carrier_aetna', 'carrier_metlife', 'carrier_guardian', 'expected_copay', 'copay_deviation', 'patient_copay']
✓ Created BigQuery dataset: openbook-ml-demo.openbook_features
✓ Uploaded 7000 rows to openbook-ml-demo.openbook_features.claim_features


In [7]:
# Fix timestamp column type and re-upload to BigQuery

# Reload data
blob = bucket.blob('processed/features/train.csv')
content = blob.download_as_text()
train_df = pd.read_csv(StringIO(content))

# Add entity_id
train_df['entity_id'] = train_df['claim_id']

# Add feature_timestamp as proper TIMESTAMP (not datetime)
# Feature Store needs this as a string in ISO format
train_df['feature_timestamp'] = pd.Timestamp.now(tz='UTC').isoformat()

# Drop the timestamp column - Feature Store will use a default
# Or we rename to avoid the reserved column issue
train_df = train_df.drop(columns=['feature_timestamp'])

# Re-upload to BigQuery without timestamp (Feature Store handles it)
table_id = f"{PROJECT_ID}.openbook_features.claim_features"

job_config = bigquery.LoadJobConfig(write_disposition="WRITE_TRUNCATE")
job = bq_client.load_table_from_dataframe(train_df, table_id, job_config=job_config)
job.result()

print(f"✓ Re-uploaded {len(train_df)} rows to BigQuery (without timestamp column)")
print("\nColumns:")
print(train_df.columns.tolist())

✓ Re-uploaded 7000 rows to BigQuery (without timestamp column)

Columns:
['claim_id', 'patient_id', 'procedure_code', 'procedure_cost', 'annual_maximum', 'remaining_maximum', 'deductible_remaining', 'coverage_ratio', 'months_enrolled', 'max_utilization', 'cost_to_max_ratio', 'deductible_applies', 'network_penalty', 'waiting_period_risk', 'is_preventive', 'is_basic', 'is_major', 'high_cost_procedure', 'is_ppo', 'is_dhmo', 'is_indemnity', 'carrier_delta_dental', 'carrier_cigna', 'carrier_aetna', 'carrier_metlife', 'carrier_guardian', 'expected_copay', 'copay_deviation', 'patient_copay', 'entity_id']


In [9]:
# Retry Feature View creation

feature_view_config = feature_view_pb2.FeatureView(
    big_query_source=feature_view_pb2.FeatureView.BigQuerySource(
        uri=f"bq://{PROJECT_ID}.openbook_features.claim_features",
        entity_id_columns=["entity_id"],
    ),
    sync_config=feature_view_pb2.FeatureView.SyncConfig(
        cron="0 0 * * *",
    ),
)

online_store_path = f"{parent}/featureOnlineStores/openbook_feature_store"

print("Creating Feature View...")
try:
    operation = admin_client.create_feature_view(
        parent=online_store_path,
        feature_view_id="claim_features_view",
        feature_view=feature_view_config,
    )
    print("Waiting for operation (2-5 min)...")
    result = operation.result(timeout=600)
    print(f"✓ Created Feature View: {result.name}")
except Exception as e:
    print(f"✗ Error: {type(e).__name__}")
    print(f"  Message: {e}")

Creating Feature View...
✗ Error: AlreadyExists
  Message: 409 FeatureView `projects/350248978874/locations/us-central1/featureOnlineStores/openbook_feature_store/featureViews/claim_features_view` already exists.


In [10]:
# Verify Feature View exists and check status

feature_view_path = f"{online_store_path}/featureViews/claim_features_view"

fv = admin_client.get_feature_view(name=feature_view_path)
print(f"✓ Name: {fv.name}")
print(f"✓ BigQuery Source: {fv.big_query_source.uri}")
print(f"✓ Entity ID Column: {fv.big_query_source.entity_id_columns}")

# Trigger manual sync to populate online store
sync_response = admin_client.sync_feature_view(feature_view=feature_view_path)
print(f"✓ Sync started: {sync_response.feature_view_sync}")

✓ Name: projects/350248978874/locations/us-central1/featureOnlineStores/openbook_feature_store/featureViews/claim_features_view
✓ BigQuery Source: bq://openbook-ml-demo.openbook_features.claim_features
✓ Entity ID Column: ['entity_id']
✓ Sync started: projects/openbook-ml-demo/locations/us-central1/featureOnlineStores/openbook_feature_store/featureViews/claim_features_view/featureViewSyncs/1327014692530421760


In [16]:
# Inspect response structure
response = online_client.fetch_feature_values(
    request=feature_online_store_service.FetchFeatureValuesRequest(
        feature_view=feature_view_path,
        data_key=feature_online_store_service.FeatureViewDataKey(key="CLM_000301"),
        data_format=feature_online_store_service.FeatureViewDataFormat.KEY_VALUE,
    )
)

print("✓ Online Feature Serving Test")
print("=" * 50)
print(f"Entity ID: CLM_000301")
print(f"\nResponse type: {type(response)}")
print(f"\nResponse:\n{response}")

✓ Online Feature Serving Test
Entity ID: CLM_000301

Response type: <class 'google.cloud.aiplatform_v1.types.feature_online_store_service.FetchFeatureValuesResponse'>

Response:
key_values {
  features {
    name: "claim_id"
    value {
      string_value: "CLM_000301"
    }
  }
  features {
    name: "patient_id"
    value {
      string_value: "PAT_0677"
    }
  }
  features {
    name: "procedure_code"
    value {
      string_value: "D0120"
    }
  }
  features {
    name: "procedure_cost"
    value {
      double_value: 65
    }
  }
  features {
    name: "annual_maximum"
    value {
      double_value: 1500
    }
  }
  features {
    name: "remaining_maximum"
    value {
      double_value: 75
    }
  }
  features {
    name: "deductible_remaining"
    value {
      double_value: 0
    }
  }
  features {
    name: "coverage_ratio"
    value {
      double_value: 1
    }
  }
  features {
    name: "months_enrolled"
    value {
      int64_value: 24
    }
  }
  features {
    name:

In [17]:
# Feature Store Summary
print("✓ Feature Store Setup Complete")
print("=" * 50)
print(f"Online Store: openbook_feature_store")
print(f"Feature View: claim_features_view")
print(f"Source: BigQuery (openbook_features.claim_features)")
print(f"Features: 29")
print(f"Entities: 7,000 (training set)")
print(f"\nOnline serving: Bigtable-backed, millisecond latency")
print(f"Offline serving: BigQuery for batch training")

✓ Feature Store Setup Complete
Online Store: openbook_feature_store
Feature View: claim_features_view
Source: BigQuery (openbook_features.claim_features)
Features: 29
Entities: 7,000 (training set)

Online serving: Bigtable-backed, millisecond latency
Offline serving: BigQuery for batch training
