# Verily Workbench Setup

Run this notebook once at the start of each JupyterLab session to initialize workspace environment variables.

**This notebook will:**
1. Create workspace GCS buckets if they don't exist (first-time setup only)
2. Set environment variables for the current session

**Environment variables set:**
- `GOOGLE_CLOUD_PROJECT` - Your Google Cloud project ID
- `WORKSPACE_CDR` - BigQuery dataset with OMOP CDR data
- `WORKSPACE_BUCKET` - Persistent workspace GCS bucket
- `WORKSPACE_TEMP_BUCKET` - Temporary workspace GCS bucket (auto-deleted after 14 days)

**Time to run:** ~2-5 seconds (longer on first run if creating buckets)

In [None]:
import os
import json
import subprocess
from typing import Dict

In [None]:
def ensure_workspace_buckets(verbose: bool = True) -> None:
    """
    Create the workspace GCS buckets through the wb CLI if they cannot be resolved.

    Two workspace resources are managed:
    - workspace-bucket: created without an auto-delete setting
    - temporary-workspace-bucket: created with ``--auto-delete 14``

    Each resource is first looked up with ``wb resource resolve``. A non-zero
    exit status from that lookup is treated as "resource does not exist" and
    triggers ``wb resource create gcs-bucket``.

    Args:
        verbose: If True, print whether each bucket was found or created

    Raises:
        subprocess.CalledProcessError: If ``wb resource create`` exits with a
            non-zero status. The CLI's stderr is printed before the exception
            propagates so the cause is visible in the notebook output.
    """
    buckets_config = [
        {
            'name': 'workspace-bucket',
            'description': 'Primary workspace bucket for storing files',
            'auto_delete_days': None
        },
        {
            'name': 'temporary-workspace-bucket',
            'description': 'Bucket for temporary storage. Auto-cleanup after 14 days.',
            'auto_delete_days': 14
        }
    ]

    for bucket in buckets_config:
        # Probe for the resource; only the exit status is inspected.
        resolve_result = subprocess.run(
            ['wb', 'resource', 'resolve', '--name', bucket['name']],
            capture_output=True, text=True
        )

        if resolve_result.returncode == 0:
            if verbose:
                print(f"✅ Found existing bucket: {bucket['name']}")
            continue

        # Lookup failed, so assume the bucket is missing and create it.
        create_cmd = [
            'wb', 'resource', 'create', 'gcs-bucket',
            '--name', bucket['name'],
            '--cloning', 'COPY_NOTHING',
            '--description', bucket['description']
        ]

        if bucket['auto_delete_days']:
            create_cmd.extend(['--auto-delete', str(bucket['auto_delete_days'])])

        try:
            subprocess.run(create_cmd, check=True, capture_output=True, text=True)
        except subprocess.CalledProcessError as err:
            # capture_output=True hides the CLI's own diagnostics, and the
            # exception message only carries the exit status. Surface stderr
            # so the failure (auth, quota, naming, ...) can be diagnosed.
            details = (err.stderr or '').strip()
            print(f"Failed to create bucket {bucket['name']}: {details}")
            raise

        if verbose:
            print(f"✅ Created bucket: {bucket['name']}")

In [None]:
def _wb_json(*args: str):
    """Run ``wb <args> --format=json`` and return the decoded JSON output.

    Raises:
        subprocess.CalledProcessError: If the wb command exits non-zero
    """
    completed = subprocess.run(
        ["wb", *args, "--format=json"],
        capture_output=True, text=True, check=True
    )
    return json.loads(completed.stdout)


def setup_aou_env(verbose: bool = True) -> Dict[str, str]:
    """
    Set All of Us workspace environment variables using wb CLI.

    Creates workspace GCS buckets if they don't exist, then derives the
    variables from ``wb workspace describe`` and ``wb resource list``:

    - GOOGLE_CLOUD_PROJECT: the workspace's ``googleProjectId``
    - WORKSPACE_TEMP_BUCKET / WORKSPACE_BUCKET: ``gs://<bucketName>`` of the
      GCS_BUCKET resources whose id contains ``temporary-workspace-bucket`` /
      ``workspace-bucket``
    - WORKSPACE_CDR: ``<projectId>.<datasetId>`` of the first BigQuery dataset
      resource listed, or an empty string if there is none

    All values are resolved first and written to ``os.environ`` in one step,
    so a failure while reading the workspace does not leave a partially
    initialised environment. A bucket variable that cannot be resolved is
    removed from ``os.environ`` rather than left at a value from an earlier run.

    Args:
        verbose: If True, print variables as they're set

    Returns:
        dict: Environment variables that were set. A bucket entry is None if
        no matching bucket resource was listed.

    Raises:
        subprocess.CalledProcessError: If wb CLI commands fail
    """
    # Ensure workspace buckets exist (creates them if needed)
    ensure_workspace_buckets(verbose=verbose)

    workspace = _wb_json("workspace", "describe")
    resources = _wb_json("resource", "list")

    resolved = {
        "GOOGLE_CLOUD_PROJECT": workspace["googleProjectId"],
        "WORKSPACE_CDR": "",
    }

    for r in resources:
        resource_type = r["resourceType"]
        if resource_type == "GCS_BUCKET":
            # Check temporary bucket first: its id also contains "workspace-bucket"
            if "temporary-workspace-bucket" in r["id"]:
                resolved["WORKSPACE_TEMP_BUCKET"] = f"gs://{r['bucketName']}"
            elif "workspace-bucket" in r["id"]:
                resolved["WORKSPACE_BUCKET"] = f"gs://{r['bucketName']}"
        elif resource_type in ("BQ_DATASET", "BIGQUERY_DATASET"):
            # Use the first dataset found; later ones are ignored
            if not resolved["WORKSPACE_CDR"]:
                resolved["WORKSPACE_CDR"] = f"{r['projectId']}.{r['datasetId']}"

    # Drop bucket values left over from a previous run when the bucket is no
    # longer listed, so the environment always reflects the current workspace.
    for name in ("WORKSPACE_BUCKET", "WORKSPACE_TEMP_BUCKET"):
        if name not in resolved:
            os.environ.pop(name, None)
    os.environ.update(resolved)

    # Collect variables for return
    env_vars = {
        name: os.environ.get(name)
        for name in (
            "GOOGLE_CLOUD_PROJECT",
            "WORKSPACE_BUCKET",
            "WORKSPACE_TEMP_BUCKET",
            "WORKSPACE_CDR",
        )
    }

    if verbose:
        print("✅ Workspace environment variables set:")
        for key, val in env_vars.items():
            print(f"  {key} = {val}")

    return env_vars

In [None]:
# Run the workspace setup, printing each variable as it is set;
# the returned mapping is kept in `env` for inspection.
env = setup_aou_env(verbose=True)

In [None]:
# Fail with an actionable message if setup left any variable unset; a bare
# os.environ[...] lookup would only raise an opaque KeyError. An empty value
# (e.g. WORKSPACE_CDR when no dataset was found) is still accepted here.
_unset = [
    name
    for name in (
        'WORKSPACE_CDR',
        'WORKSPACE_BUCKET',
        'WORKSPACE_TEMP_BUCKET',
        'GOOGLE_CLOUD_PROJECT',
    )
    if name not in os.environ
]
if _unset:
    raise RuntimeError(
        f"Workspace environment variables not set: {', '.join(_unset)}. "
        "Re-run setup_aou_env() and check its output."
    )

# Assign to Python variables for easy access
WORKSPACE_CDR = os.environ['WORKSPACE_CDR']
WORKSPACE_BUCKET = os.environ['WORKSPACE_BUCKET']
WORKSPACE_TEMP_BUCKET = os.environ['WORKSPACE_TEMP_BUCKET']
GOOGLE_CLOUD_PROJECT = os.environ['GOOGLE_CLOUD_PROJECT']

print("\n✅ Variables ready for analysis:")
print(f"  CDR: {WORKSPACE_CDR}")
print(f"  Bucket: {WORKSPACE_BUCKET}")
print(f"  Temp Bucket: {WORKSPACE_TEMP_BUCKET}")
print(f"  Project: {GOOGLE_CLOUD_PROJECT}")

## Usage in Analysis Notebooks

These variables are now available in `os.environ` for the current session.

In other notebooks, access them with:

```python
import os

WORKSPACE_CDR = os.environ['WORKSPACE_CDR']
WORKSPACE_BUCKET = os.environ['WORKSPACE_BUCKET']
WORKSPACE_TEMP_BUCKET = os.environ['WORKSPACE_TEMP_BUCKET']
```

**Important: Validate Setup Before Using Variables**

Add this validation code at the start of any analysis notebook to ensure setup has been run:

```python
import os

# Validate workspace setup
required_vars = ['WORKSPACE_CDR', 'WORKSPACE_BUCKET', 'WORKSPACE_TEMP_BUCKET', 'GOOGLE_CLOUD_PROJECT']
missing_vars = [var for var in required_vars if not os.environ.get(var)]

if missing_vars:
    raise RuntimeError(
        f"❌ Workspace not initialized! Missing environment variables: {', '.join(missing_vars)}\n\n"
        "👉 Please run the setup notebook first:\n"
        "   _reference/verily/00_setup_workspace.ipynb\n\n"
        "Then come back and re-run this cell."
    )

# Now safe to access variables
WORKSPACE_CDR = os.environ['WORKSPACE_CDR']
WORKSPACE_BUCKET = os.environ['WORKSPACE_BUCKET']
WORKSPACE_TEMP_BUCKET = os.environ['WORKSPACE_TEMP_BUCKET']
print("✅ Workspace variables loaded successfully")
```

**Note**: You must run the setup notebook in each new JupyterLab session.