# 01 — BigQuery Schema Setup
**Lumina Forecasting Hub**

This notebook creates the BigQuery dataset and all dimension + fact tables.
Run this **once** before any ingestion notebooks.

**Tables Created:**
- `dim_balancing_authority` — One row per BA
- `dim_fuel_type` — One row per fuel code
- `dim_geography` — One row per US state, plus the District of Columbia
- `dim_date` — One row per date (2010-01-01 → 2027-12-31)
- `fact_hourly_demand` — BA × hour
- `fact_monthly_generation` — State × fuel × sector × month
- `fact_retail_sales` — State × sector × month
- `fact_carbon_emissions` — State × source × sector × year

In [None]:
# Authenticate the Colab session so Google Cloud client libraries can pick up
# the user's credentials.
from google.colab import auth
auth.authenticate_user()

from google.cloud import bigquery
import pandas as pd
import sys, os

# ── Config ──────────────────────────────────────────────────────────
# Sentinel used to detect that the project ID below was never edited.
_PLACEHOLDER_PROJECT_ID = "YOUR_GCP_PROJECT_ID"

GCP_PROJECT_ID = "YOUR_GCP_PROJECT_ID"   # <-- UPDATE THIS
BQ_DATASET     = "lumina"
BQ_LOCATION    = "US"

# Fail fast with an actionable message. Without this guard the cell would
# print a "Connected" line for the placeholder and the mistake would only
# surface in a later cell as a BigQuery API error.
if GCP_PROJECT_ID == _PLACEHOLDER_PROJECT_ID:
    raise ValueError(
        "GCP_PROJECT_ID is still the placeholder value. "
        "Set it to your real GCP project ID before running this notebook."
    )

# Shared BigQuery client used by every cell below.
client = bigquery.Client(project=GCP_PROJECT_ID)
print(f"Connected to BigQuery project: {GCP_PROJECT_ID}")

In [None]:
# Describe the dataset that will hold all Lumina dimension and fact tables.
dataset_ref = bigquery.DatasetReference(GCP_PROJECT_ID, BQ_DATASET)
dataset = bigquery.Dataset(dataset_ref)
dataset.location = BQ_LOCATION
dataset.description = "Lumina Forecasting Hub — EIA energy analytics"

# exists_ok=True keeps this cell re-runnable: an already-existing dataset is
# not treated as an error.
dataset = client.create_dataset(dataset, exists_ok=True)

# Report the location of the dataset that was actually returned. A dataset
# that already existed may live in a different location than BQ_LOCATION,
# and the requested location is not applied to it.
actual_location = dataset.location or BQ_LOCATION
if actual_location.upper() != BQ_LOCATION.upper():
    print(
        f"WARNING: dataset '{BQ_DATASET}' already exists in {actual_location}, "
        f"not the configured {BQ_LOCATION}"
    )
print(f"Dataset '{BQ_DATASET}' ready in {actual_location}")

In [None]:
# Dimension-table DDL, keyed by table name. The keys are labels for the
# reader only; the fully-qualified table name is embedded in each statement.
# Every statement uses CREATE TABLE IF NOT EXISTS, so re-running is harmless.
_DIMENSION_DDL_BY_TABLE = {
    "dim_balancing_authority": f"""
    CREATE TABLE IF NOT EXISTS `{GCP_PROJECT_ID}.{BQ_DATASET}.dim_balancing_authority` (
        ba_code           STRING NOT NULL,
        ba_name           STRING,
        region            STRING,
        timezone          STRING,
        peak_capacity_mw  FLOAT64,
        PRIMARY KEY (ba_code) NOT ENFORCED
    )
    OPTIONS (
        description = 'Balancing Authority dimension — one row per BA/RTO/ISO'
    )
    """,

    "dim_fuel_type": f"""
    CREATE TABLE IF NOT EXISTS `{GCP_PROJECT_ID}.{BQ_DATASET}.dim_fuel_type` (
        fuel_code              STRING NOT NULL,
        fuel_label             STRING,
        is_renewable           BOOL,
        emission_factor_kg_mwh FLOAT64,
        PRIMARY KEY (fuel_code) NOT ENFORCED
    )
    OPTIONS (
        description = 'Fuel type dimension with emission factors'
    )
    """,

    "dim_geography": f"""
    CREATE TABLE IF NOT EXISTS `{GCP_PROJECT_ID}.{BQ_DATASET}.dim_geography` (
        state_code    STRING NOT NULL,
        state_name    STRING,
        census_region STRING,
        census_division STRING,
        latitude      FLOAT64,
        longitude     FLOAT64,
        population    INT64,
        PRIMARY KEY (state_code) NOT ENFORCED
    )
    OPTIONS (
        description = 'US state geography dimension'
    )
    """,

    "dim_date": f"""
    CREATE TABLE IF NOT EXISTS `{GCP_PROJECT_ID}.{BQ_DATASET}.dim_date` (
        date_key      DATE NOT NULL,
        year          INT64,
        quarter       INT64,
        month         INT64,
        month_name    STRING,
        day_of_week   INT64,
        day_name      STRING,
        is_weekend    BOOL,
        is_peak_month BOOL,
        PRIMARY KEY (date_key) NOT ENFORCED
    )
    OPTIONS (
        description = 'Date dimension — daily grain from 2010 to 2027'
    )
    """,
}

# Ordered list of the statements above (dict insertion order is preserved).
DDL_STATEMENTS = list(_DIMENSION_DDL_BY_TABLE.values())

# Run the statements one at a time, waiting for each job to finish so a
# failure stops the cell at the offending statement.
for ddl in DDL_STATEMENTS:
    ddl_job = client.query(ddl)
    ddl_job.result()
print("Dimension tables created.")

In [None]:
# Fact-table DDL, keyed by table name. The keys are labels for the reader
# only; the fully-qualified table name is embedded in each statement.
# The hourly and monthly facts are partitioned on their period column;
# every fact table is clustered and stamps rows with an ingested_at default.
_FACT_DDL_BY_TABLE = {
    "fact_hourly_demand": f"""
    CREATE TABLE IF NOT EXISTS `{GCP_PROJECT_ID}.{BQ_DATASET}.fact_hourly_demand` (
        timestamp_utc       TIMESTAMP NOT NULL,
        ba_code             STRING NOT NULL,
        demand_mw           FLOAT64,
        demand_forecast_mw  FLOAT64,
        net_generation_mw   FLOAT64,
        interchange_mw      FLOAT64,
        forecast_error_mw   FLOAT64,
        forecast_error_pct  FLOAT64,
        ingested_at         TIMESTAMP DEFAULT CURRENT_TIMESTAMP()
    )
    PARTITION BY DATE(timestamp_utc)
    CLUSTER BY ba_code
    OPTIONS (
        description = 'Hourly grid operations from EIA-930 / electricity/rto'
    )
    """,

    "fact_monthly_generation": f"""
    CREATE TABLE IF NOT EXISTS `{GCP_PROJECT_ID}.{BQ_DATASET}.fact_monthly_generation` (
        period_month    DATE NOT NULL,
        state_code      STRING NOT NULL,
        fuel_code       STRING NOT NULL,
        sector_code     STRING,
        generation_mwh  FLOAT64,
        ingested_at     TIMESTAMP DEFAULT CURRENT_TIMESTAMP()
    )
    PARTITION BY DATE_TRUNC(period_month, MONTH)
    CLUSTER BY state_code, fuel_code
    OPTIONS (
        description = 'Monthly electricity generation by state, fuel, and sector'
    )
    """,

    "fact_retail_sales": f"""
    CREATE TABLE IF NOT EXISTS `{GCP_PROJECT_ID}.{BQ_DATASET}.fact_retail_sales` (
        period_month    DATE NOT NULL,
        state_code      STRING NOT NULL,
        sector_code     STRING NOT NULL,
        revenue_musd    FLOAT64,
        sales_mwh       FLOAT64,
        price_cents_kwh FLOAT64,
        customers       INT64,
        ingested_at     TIMESTAMP DEFAULT CURRENT_TIMESTAMP()
    )
    PARTITION BY DATE_TRUNC(period_month, MONTH)
    CLUSTER BY state_code, sector_code
    OPTIONS (
        description = 'Monthly retail electricity sales — price, revenue, customers'
    )
    """,

    "fact_carbon_emissions": f"""
    CREATE TABLE IF NOT EXISTS `{GCP_PROJECT_ID}.{BQ_DATASET}.fact_carbon_emissions` (
        period_year     INT64 NOT NULL,
        state_code      STRING NOT NULL,
        source_code     STRING NOT NULL,
        sector_code     STRING NOT NULL,
        emissions_mmt   FLOAT64,
        ingested_at     TIMESTAMP DEFAULT CURRENT_TIMESTAMP()
    )
    CLUSTER BY state_code, source_code
    OPTIONS (
        description = 'Annual CO2 emissions from SEDS by state, source, and sector'
    )
    """,
}

# Ordered list of the statements above (dict insertion order is preserved).
FACT_DDL = list(_FACT_DDL_BY_TABLE.values())

# Run the statements one at a time, waiting for each job to finish so a
# failure stops the cell at the offending statement.
for ddl in FACT_DDL:
    ddl_job = client.query(ddl)
    ddl_job.result()
print("Fact tables created.")

In [None]:
# Seed rows for dim_balancing_authority, written as a compact table:
# one tuple per balancing authority, in the column order given by _BA_FIELDS.
_BA_FIELDS = ("ba_code", "ba_name", "region", "timezone", "peak_capacity_mw")
_BA_RECORDS = (
    ("PJM",  "PJM Interconnection",        "Mid-Atlantic", "US/Eastern", 185000),
    ("MISO", "Midcontinent ISO",           "Midwest",      "US/Central", 130000),
    ("ERCO", "ERCOT",                      "Texas",        "US/Central", 85000),
    ("CISO", "California ISO",             "West",         "US/Pacific", 52000),
    ("ISNE", "ISO New England",            "New England",  "US/Eastern", 30000),
    ("NYIS", "New York ISO",               "New York",     "US/Eastern", 38000),
    ("SWPP", "Southwest Power Pool",       "Central",      "US/Central", 55000),
    ("SOCO", "Southern Company",           "Southeast",    "US/Eastern", 47000),
    ("TVA",  "Tennessee Valley Authority", "Southeast",    "US/Central", 37000),
    ("DUK",  "Duke Energy Carolinas",      "Southeast",    "US/Eastern", 24000),
)

# Expand each tuple into a column-name -> value record.
ba_data = [dict(zip(_BA_FIELDS, record)) for record in _BA_RECORDS]

df_ba = pd.DataFrame(ba_data)
table_ref = f"{GCP_PROJECT_ID}.{BQ_DATASET}.dim_balancing_authority"

# WRITE_TRUNCATE replaces the table contents on every run, so re-seeding
# does not accumulate duplicate rows. Later seed cells reuse `job_config`.
job_config = bigquery.LoadJobConfig(write_disposition="WRITE_TRUNCATE")

# Start the load job, then block until it finishes.
ba_load_job = client.load_table_from_dataframe(df_ba, table_ref, job_config=job_config)
ba_load_job.result()
print(f"Seeded dim_balancing_authority: {len(df_ba)} rows")

In [None]:
# Fuel code -> (label, is_renewable, emission factor in kg per MWh).
# The "ALL" aggregate has no single emission factor, so it is left as None
# and is loaded as NULL.
FUEL_TYPE_MAP = {
    "SUN": ("Solar",              True,  0),
    "WND": ("Wind",               True,  0),
    "WAT": ("Hydro",              True,  0),
    "NUC": ("Nuclear",            False, 0),
    "NG":  ("Natural Gas",        False, 411),
    "COL": ("Coal",               False, 910),
    "PET": ("Petroleum",          False, 700),
    "OTH": ("Other",              False, 300),
    "GEO": ("Geothermal",         True,  0),
    "BIO": ("Biomass",            True,  0),
    "WAS": ("Waste",              False, 500),
    "OOG": ("Other Gas",          False, 450),
    "SPV": ("Solar PV",           True,  0),
    "STH": ("Solar Thermal",      True,  0),
    "DPV": ("Distributed PV",     True,  0),
    "HYC": ("Conventional Hydro", True,  0),
    "HPS": ("Hydro Pumped Stor.", False, 0),
    "WWW": ("Wood & Wood Waste",  True,  0),
    "TSN": ("All Solar",          True,  0),
    "AOR": ("All Renewables",     True,  0),
    "ALL": ("All Fuels",          False, None),
}

# Flatten the mapping into one record per fuel code, matching the
# dim_fuel_type column names.
fuel_rows = [
    {
        "fuel_code": code,
        "fuel_label": label,
        "is_renewable": renewable,
        "emission_factor_kg_mwh": factor,
    }
    for code, (label, renewable, factor) in FUEL_TYPE_MAP.items()
]

# Pin the emission factor to float64 so it matches the FLOAT64 column
# regardless of how pandas infers the mixed int/None values.
df_fuel = pd.DataFrame(fuel_rows).astype({"emission_factor_kg_mwh": "float64"})
table_ref = f"{GCP_PROJECT_ID}.{BQ_DATASET}.dim_fuel_type"

# Define the load config here rather than relying on the variable left
# behind by the balancing-authority seed cell, so this cell also works when
# run on its own. WRITE_TRUNCATE replaces the table contents on every run.
job_config = bigquery.LoadJobConfig(write_disposition="WRITE_TRUNCATE")
client.load_table_from_dataframe(df_fuel, table_ref, job_config=job_config).result()
print(f"Seeded dim_fuel_type: {len(df_fuel)} rows")

In [None]:
# Seed rows for dim_geography: the 50 US states plus the District of Columbia.
# Each record carries the census region/division, a latitude/longitude pair,
# and a population figure.
state_data = [
    {"state_code":"AL","state_name":"Alabama","census_region":"South","census_division":"East South Central","latitude":32.806671,"longitude":-86.791130,"population":5024279},
    {"state_code":"AK","state_name":"Alaska","census_region":"West","census_division":"Pacific","latitude":63.588753,"longitude":-154.493062,"population":733391},
    {"state_code":"AZ","state_name":"Arizona","census_region":"West","census_division":"Mountain","latitude":34.048927,"longitude":-111.093735,"population":7151502},
    {"state_code":"AR","state_name":"Arkansas","census_region":"South","census_division":"West South Central","latitude":35.201050,"longitude":-91.831833,"population":3011524},
    {"state_code":"CA","state_name":"California","census_region":"West","census_division":"Pacific","latitude":36.778261,"longitude":-119.417932,"population":39538223},
    {"state_code":"CO","state_name":"Colorado","census_region":"West","census_division":"Mountain","latitude":39.550051,"longitude":-105.782067,"population":5773714},
    {"state_code":"CT","state_name":"Connecticut","census_region":"Northeast","census_division":"New England","latitude":41.603221,"longitude":-73.087749,"population":3605944},
    {"state_code":"DE","state_name":"Delaware","census_region":"South","census_division":"South Atlantic","latitude":38.910832,"longitude":-75.527670,"population":989948},
    {"state_code":"FL","state_name":"Florida","census_region":"South","census_division":"South Atlantic","latitude":27.664827,"longitude":-81.515754,"population":21538187},
    {"state_code":"GA","state_name":"Georgia","census_region":"South","census_division":"South Atlantic","latitude":32.157435,"longitude":-82.907123,"population":10711908},
    {"state_code":"HI","state_name":"Hawaii","census_region":"West","census_division":"Pacific","latitude":19.898682,"longitude":-155.665857,"population":1455271},
    {"state_code":"ID","state_name":"Idaho","census_region":"West","census_division":"Mountain","latitude":44.068202,"longitude":-114.742041,"population":1839106},
    {"state_code":"IL","state_name":"Illinois","census_region":"Midwest","census_division":"East North Central","latitude":40.633125,"longitude":-89.398528,"population":12812508},
    {"state_code":"IN","state_name":"Indiana","census_region":"Midwest","census_division":"East North Central","latitude":40.551217,"longitude":-85.602364,"population":6785528},
    {"state_code":"IA","state_name":"Iowa","census_region":"Midwest","census_division":"West North Central","latitude":41.878003,"longitude":-93.097702,"population":3190369},
    {"state_code":"KS","state_name":"Kansas","census_region":"Midwest","census_division":"West North Central","latitude":39.011902,"longitude":-98.484246,"population":2937880},
    {"state_code":"KY","state_name":"Kentucky","census_region":"South","census_division":"East South Central","latitude":37.839333,"longitude":-84.270018,"population":4505836},
    {"state_code":"LA","state_name":"Louisiana","census_region":"South","census_division":"West South Central","latitude":31.244823,"longitude":-92.145024,"population":4657757},
    {"state_code":"ME","state_name":"Maine","census_region":"Northeast","census_division":"New England","latitude":45.253783,"longitude":-69.445469,"population":1362359},
    {"state_code":"MD","state_name":"Maryland","census_region":"South","census_division":"South Atlantic","latitude":39.045755,"longitude":-76.641271,"population":6177224},
    {"state_code":"MA","state_name":"Massachusetts","census_region":"Northeast","census_division":"New England","latitude":42.407211,"longitude":-71.382437,"population":7029917},
    {"state_code":"MI","state_name":"Michigan","census_region":"Midwest","census_division":"East North Central","latitude":44.314844,"longitude":-85.602364,"population":10077331},
    {"state_code":"MN","state_name":"Minnesota","census_region":"Midwest","census_division":"West North Central","latitude":46.729553,"longitude":-94.685900,"population":5706494},
    {"state_code":"MS","state_name":"Mississippi","census_region":"South","census_division":"East South Central","latitude":32.354668,"longitude":-89.398528,"population":2961279},
    {"state_code":"MO","state_name":"Missouri","census_region":"Midwest","census_division":"West North Central","latitude":37.964253,"longitude":-91.831833,"population":6154913},
    {"state_code":"MT","state_name":"Montana","census_region":"West","census_division":"Mountain","latitude":46.879682,"longitude":-110.362566,"population":1084225},
    {"state_code":"NE","state_name":"Nebraska","census_region":"Midwest","census_division":"West North Central","latitude":41.492537,"longitude":-99.901813,"population":1961504},
    {"state_code":"NV","state_name":"Nevada","census_region":"West","census_division":"Mountain","latitude":38.802610,"longitude":-116.419389,"population":3104614},
    {"state_code":"NH","state_name":"New Hampshire","census_region":"Northeast","census_division":"New England","latitude":43.193852,"longitude":-71.572395,"population":1377529},
    {"state_code":"NJ","state_name":"New Jersey","census_region":"Northeast","census_division":"Middle Atlantic","latitude":40.058324,"longitude":-74.405661,"population":9288994},
    {"state_code":"NM","state_name":"New Mexico","census_region":"West","census_division":"Mountain","latitude":34.519940,"longitude":-105.870090,"population":2117522},
    {"state_code":"NY","state_name":"New York","census_region":"Northeast","census_division":"Middle Atlantic","latitude":43.299428,"longitude":-74.217933,"population":20201249},
    {"state_code":"NC","state_name":"North Carolina","census_region":"South","census_division":"South Atlantic","latitude":35.759573,"longitude":-79.019300,"population":10439388},
    {"state_code":"ND","state_name":"North Dakota","census_region":"Midwest","census_division":"West North Central","latitude":47.551493,"longitude":-101.002012,"population":779094},
    {"state_code":"OH","state_name":"Ohio","census_region":"Midwest","census_division":"East North Central","latitude":40.417287,"longitude":-82.907123,"population":11799448},
    {"state_code":"OK","state_name":"Oklahoma","census_region":"South","census_division":"West South Central","latitude":35.007752,"longitude":-97.092877,"population":3959353},
    {"state_code":"OR","state_name":"Oregon","census_region":"West","census_division":"Pacific","latitude":43.804133,"longitude":-120.554201,"population":4237256},
    {"state_code":"PA","state_name":"Pennsylvania","census_region":"Northeast","census_division":"Middle Atlantic","latitude":41.203322,"longitude":-77.194525,"population":13002700},
    {"state_code":"RI","state_name":"Rhode Island","census_region":"Northeast","census_division":"New England","latitude":41.580095,"longitude":-71.477429,"population":1097379},
    {"state_code":"SC","state_name":"South Carolina","census_region":"South","census_division":"South Atlantic","latitude":33.836081,"longitude":-81.163725,"population":5118425},
    {"state_code":"SD","state_name":"South Dakota","census_region":"Midwest","census_division":"West North Central","latitude":43.969515,"longitude":-99.901813,"population":886667},
    {"state_code":"TN","state_name":"Tennessee","census_region":"South","census_division":"East South Central","latitude":35.517491,"longitude":-86.580447,"population":6910840},
    {"state_code":"TX","state_name":"Texas","census_region":"South","census_division":"West South Central","latitude":31.968599,"longitude":-99.901813,"population":29145505},
    {"state_code":"UT","state_name":"Utah","census_region":"West","census_division":"Mountain","latitude":39.320980,"longitude":-111.093731,"population":3271616},
    {"state_code":"VT","state_name":"Vermont","census_region":"Northeast","census_division":"New England","latitude":44.558803,"longitude":-72.577841,"population":643077},
    {"state_code":"VA","state_name":"Virginia","census_region":"South","census_division":"South Atlantic","latitude":37.431573,"longitude":-78.656894,"population":8631393},
    {"state_code":"WA","state_name":"Washington","census_region":"West","census_division":"Pacific","latitude":47.751074,"longitude":-120.740139,"population":7614893},
    {"state_code":"WV","state_name":"West Virginia","census_region":"South","census_division":"South Atlantic","latitude":38.597626,"longitude":-80.454903,"population":1793716},
    {"state_code":"WI","state_name":"Wisconsin","census_region":"Midwest","census_division":"East North Central","latitude":43.784440,"longitude":-88.787868,"population":5893718},
    {"state_code":"WY","state_name":"Wyoming","census_region":"West","census_division":"Mountain","latitude":43.074684,"longitude":-107.290284,"population":576851},
    {"state_code":"DC","state_name":"District of Columbia","census_region":"South","census_division":"South Atlantic","latitude":38.907192,"longitude":-77.036871,"population":689545},
]

df_geo = pd.DataFrame(state_data)

# The table's primary key is declared NOT ENFORCED, so BigQuery will not
# reject duplicates; catch a copy-paste slip in the seed data here instead.
assert df_geo["state_code"].is_unique, "duplicate state_code in state_data"

table_ref = f"{GCP_PROJECT_ID}.{BQ_DATASET}.dim_geography"

# Define the load config here rather than relying on the variable left
# behind by the balancing-authority seed cell, so this cell also works when
# run on its own. WRITE_TRUNCATE replaces the table contents on every run.
job_config = bigquery.LoadJobConfig(write_disposition="WRITE_TRUNCATE")
client.load_table_from_dataframe(df_geo, table_ref, job_config=job_config).result()
print(f"Seeded dim_geography: {len(df_geo)} rows")

In [None]:
import numpy as np

# Build the date dimension at daily grain for 2010-01-01 through 2027-12-31.
dates = pd.date_range("2010-01-01", "2027-12-31", freq="D")
df_date = pd.DataFrame({"date_key": dates})
df_date["year"]          = df_date["date_key"].dt.year
df_date["quarter"]       = df_date["date_key"].dt.quarter
df_date["month"]         = df_date["date_key"].dt.month
df_date["month_name"]    = df_date["date_key"].dt.strftime("%B")
df_date["day_of_week"]   = df_date["date_key"].dt.dayofweek   # 0=Mon
df_date["day_name"]      = df_date["date_key"].dt.strftime("%A")
df_date["is_weekend"]    = df_date["day_of_week"] >= 5
df_date["is_peak_month"] = df_date["month"].isin([6, 7, 8, 12, 1, 2])  # Summer + Winter peaks

# dim_date.date_key is a DATE column. Convert the datetime64 values to plain
# datetime.date objects so the load maps to DATE explicitly instead of relying
# on implicit timestamp -> date coercion. This must happen after the derived
# columns above, which need the .dt accessor on datetime64 values.
df_date["date_key"] = df_date["date_key"].dt.date

table_ref = f"{GCP_PROJECT_ID}.{BQ_DATASET}.dim_date"

# Define the load config here rather than relying on the variable left
# behind by the balancing-authority seed cell, so this cell also works when
# run on its own. WRITE_TRUNCATE replaces the table contents on every run.
job_config = bigquery.LoadJobConfig(write_disposition="WRITE_TRUNCATE")
client.load_table_from_dataframe(df_date, table_ref, job_config=job_config).result()
print(f"Seeded dim_date: {len(df_date)} rows")

In [None]:
# Tables this notebook is responsible for creating.
EXPECTED_TABLES = {
    "dim_balancing_authority",
    "dim_fuel_type",
    "dim_geography",
    "dim_date",
    "fact_hourly_demand",
    "fact_monthly_generation",
    "fact_retail_sales",
    "fact_carbon_emissions",
}

# List every table in the dataset with its row count and storage size.
query = f"""
SELECT table_id, row_count, size_bytes
FROM `{GCP_PROJECT_ID}.{BQ_DATASET}.__TABLES__`
ORDER BY table_id
"""
df_verify = client.query(query).to_dataframe()
print("\n=== BigQuery Tables ===")
print(df_verify.to_string(index=False))

# Only declare success once every expected table is actually present;
# extra tables in the dataset are ignored.
missing_tables = sorted(EXPECTED_TABLES - set(df_verify["table_id"]))
if missing_tables:
    raise RuntimeError(f"Schema setup incomplete: missing tables {missing_tables}")
print("\nSchema setup complete!")