03 — Ingest Monthly Generation by Fuel Type
=============================================
Lumina Forecasting Hub

Pulls monthly electricity generation data from EIA API v2:
  - electricity/electric-power-operational-data → generation by state, fuel, sector

This feeds Dashboard Page 2: Energy Mix & Transition Tracker
  - Stacked area of generation by fuel over time
  - Renewable share % calculation
  - Small multiples by state/region

Supports incremental loads — checks the max period already in BigQuery and requests data from that month onward (backfill mode reloads everything from BACKFILL_START).

Usage in Colab:
  1. Run 01_setup_bigquery_schema.py first
  2. Set your EIA_API_KEY and GCP_PROJECT_ID below
  3. Run all cells

In [None]:
from google.colab import auth
auth.authenticate_user()

import requests
import pandas as pd
import numpy as np
import time
from google.cloud import bigquery

# ── Config ──────────────────────────────────────────────────────────
# Credentials and project: replace both placeholders before running.
EIA_API_KEY = "YOUR_EIA_API_KEY"  # <-- UPDATE
GCP_PROJECT_ID = "YOUR_GCP_PROJECT_ID"  # <-- UPDATE

# BigQuery dataset used for every table reference, and the EIA API v2 root URL.
BQ_DATASET = "lumina"
EIA_BASE_URL = "https://api.eia.gov/v2"

# Fuel type codes we request (aggregate codes such as ALL and AOR are left out).
TARGET_FUELS = [
    "SUN", "WND", "WAT", "NUC", "NG", "COL",
    "PET", "GEO", "BIO", "WAS", "OOG", "SPV",
    "STH", "DPV", "HYC", "HPS", "WWW",
]

# The 50 states plus DC; used to filter API rows down to state-level locations.
US_STATES = [
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", "HI", "ID", "IL",
    "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI", "MN", "MS", "MO", "MT",
    "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI",
    "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY", "DC",
]

# First month requested on a full backfill ("YYYY-MM").
BACKFILL_START = "2019-01"

# One shared BigQuery client for all queries and load jobs in this notebook.
client = bigquery.Client(project=GCP_PROJECT_ID)
print(f"Connected to BigQuery: {GCP_PROJECT_ID}")

In [None]:
def fetch_generation_data(state_code, fuel_code, start, end=None):
    """
    Fetch monthly generation records for one state/fuel pair from EIA API v2.

    Pages through the electricity/electric-power-operational-data endpoint,
    sorted by period ascending, until the reported total has been read or a
    page comes back empty.

    Args:
        state_code: Value for the ``location`` facet (e.g. "CA").
        fuel_code: Value for the ``fueltypeid`` facet (e.g. "NG").
        start: First period to request ("YYYY-MM"); not sent when falsy.
        end: Last period to request ("YYYY-MM"); not sent when falsy.

    Returns:
        pd.DataFrame built from the raw API records (API field names are
        NOT renamed here). Empty DataFrame when the API returns no rows.

    Raises:
        requests.exceptions.HTTPError: on a non-success HTTP status.
        requests.exceptions.Timeout: when a page takes longer than 60s.
        requests.exceptions.ConnectionError: on network failure.
        NOTE: the HTTPError text contains the request URL, which includes
        the api_key query parameter — avoid printing it verbatim.
    """
    route = f"{EIA_BASE_URL}/electricity/electric-power-operational-data/data/"
    page_size = 5000

    # Everything except the offset is identical for every page.
    base_params = {
        "api_key": EIA_API_KEY,
        "frequency": "monthly",
        "data[0]": "generation",
        "facets[location][]": state_code,
        "facets[fueltypeid][]": fuel_code,
        "sort[0][column]": "period",
        "sort[0][direction]": "asc",
        "length": page_size,
    }
    if start:
        base_params["start"] = start
    if end:
        base_params["end"] = end

    all_records = []
    offset = 0

    while True:
        # A timeout keeps a stalled connection from hanging the notebook forever
        # (same 60s budget that fetch_all_generation uses).
        resp = requests.get(
            route,
            params={**base_params, "offset": offset},
            timeout=60,
        )
        resp.raise_for_status()
        payload = resp.json().get("response", {})

        data = payload.get("data", [])
        total = int(payload.get("total", 0))

        if not data:
            break

        all_records.extend(data)
        # Advance by what was actually received so a short page (server-side
        # cap below page_size) cannot make us skip rows.
        offset += len(data)

        if offset >= total:
            break

        time.sleep(0.2)  # be polite to the API between pages

    return pd.DataFrame(all_records)

In [None]:
def get_max_period_generation():
    """
    Return the latest period_month in fact_monthly_generation as 'YYYY-MM'.

    Falls back to BACKFILL_START when the table has no rows (MAX is NULL) or
    when the query fails for any reason. The fallback on failure is kept as a
    deliberate best-effort, but the reason is printed so that an auth or
    network problem is not silently mistaken for "table is empty".

    Returns:
        str: 'YYYY-MM' of the newest stored month, or BACKFILL_START.
    """
    query = f"""
    SELECT FORMAT_DATE('%Y-%m', MAX(period_month)) AS max_period
    FROM `{GCP_PROJECT_ID}.{BQ_DATASET}.fact_monthly_generation`
    """
    try:
        result = client.query(query).to_dataframe()
        val = result["max_period"].iloc[0]
        if pd.isna(val):
            # Table exists but holds no rows yet.
            return BACKFILL_START
        return val
    except Exception as exc:
        print(
            f"  Could not read max period from BigQuery "
            f"({type(exc).__name__}: {exc}); falling back to {BACKFILL_START}"
        )
        return BACKFILL_START

In [None]:
def fetch_all_generation(mode="incremental"):
    """
    Fetch generation data for every fuel in TARGET_FUELS (all locations).

    One paginated request series is made per fuel code. Transient API
    failures (network errors, 408/429, 5xx) are retried with exponential
    backoff; other HTTP errors fail the fuel immediately.

    Args:
        mode: "incremental" starts from the newest period already stored in
            BigQuery (via get_max_period_generation); any other value starts
            from BACKFILL_START.

    Returns:
        pd.DataFrame of raw API records for all fuels that were fetched
        completely, or an empty DataFrame when nothing was fetched. Fuels
        whose requests ultimately failed contribute no rows at all, so a
        truncated series is never returned for them.
    """
    if mode == "incremental":
        start = get_max_period_generation()
    else:
        start = BACKFILL_START

    print(f"Fetching generation data from {start} onward...")

    route = f"{EIA_BASE_URL}/electricity/electric-power-operational-data/data/"
    page_size = 5000

    all_dfs = []
    failed_fuels = []

    for fuel in TARGET_FUELS:
        offset = 0
        fuel_records = []
        fuel_failed = False

        while True:
            params = {
                "api_key": EIA_API_KEY,
                "frequency": "monthly",
                "data[0]": "generation",
                "facets[fueltypeid][]": fuel,
                "sort[0][column]": "period",
                "sort[0][direction]": "asc",
                "offset": offset,
                "length": page_size,
                "start": start,
            }

            resp = _get_with_retry(route, params, fuel)
            if resp is None:
                failed_fuels.append(fuel)
                fuel_failed = True
                break

            body = resp.json()
            data = body.get("response", {}).get("data", [])
            total = int(body.get("response", {}).get("total", 0))

            if not data:
                break

            fuel_records.extend(data)
            # Advance by rows actually received so a short page cannot skip data.
            offset += len(data)

            print(f"  [{fuel}] {len(fuel_records)}/{total}", end="\r")

            if offset >= total:
                break

            time.sleep(0.3)  # pause between pages to stay under rate limits

        if fuel_failed:
            # Drop any pages fetched before the failure: keeping them would
            # load a series that silently stops part-way through.
            continue

        if fuel_records:
            all_dfs.append(pd.DataFrame(fuel_records))
            print(f"  [{fuel}] {len(fuel_records)} records fetched")
        else:
            print(f"  [{fuel}] no data")

    if failed_fuels:
        print(f"\n⚠ Failed fuels (API errors): {failed_fuels}")
        print("  Data for other fuels was still saved. Re-run to retry failed fuels.")

    if not all_dfs:
        return pd.DataFrame()

    return pd.concat(all_dfs, ignore_index=True)


def _redact_api_key(message):
    """Return str(message) with the EIA API key masked so it is safe to print."""
    text = str(message)
    return text.replace(EIA_API_KEY, "***") if EIA_API_KEY else text


def _is_retryable(exc):
    """Return True when exc looks transient: network failure, 408/429, or 5xx."""
    if isinstance(exc, requests.exceptions.HTTPError):
        status = getattr(exc.response, "status_code", None)
        return status is None or status in (408, 429) or status >= 500
    return True


def _get_with_retry(route, params, label, max_retries=5):
    """GET route with exponential backoff; return the response, or None on failure."""
    for attempt in range(max_retries):
        try:
            resp = requests.get(route, params=params, timeout=60)
            resp.raise_for_status()
            return resp
        except (
            requests.exceptions.HTTPError,
            requests.exceptions.ConnectionError,
            requests.exceptions.Timeout,
        ) as e:
            # The exception text embeds the request URL, which carries api_key.
            err = _redact_api_key(e)
            if attempt < max_retries - 1 and _is_retryable(e):
                wait = 2 ** attempt * 5
                print(f"  [{label}] API error ({err}), retry in {wait}s ({attempt+1}/{max_retries})")
                time.sleep(wait)
            else:
                print(f"  [{label}] FAILED after {attempt + 1} attempt(s): {err}")
                return None
    return None


In [None]:
MODE = "backfill"  # Change to "incremental" after first run

# The fetch step treats anything other than "incremental" as a backfill while
# the load step treats anything other than "backfill" as an append; reject
# other values so a typo cannot produce a full fetch that is then appended.
if MODE not in ("backfill", "incremental"):
    raise ValueError(f"MODE must be 'backfill' or 'incremental', got {MODE!r}")

raw = fetch_all_generation(mode=MODE)

if raw.empty:
    print("No data fetched. Check API key and date range.")
else:
    print(f"\nRaw records: {len(raw):,}")
    print(f"Columns: {list(raw.columns)}")

    # ── Transform ────────────────────────────────────────────────────
    # Map API fields to our schema
    df = raw.copy()

    # Filter to US states only (API may return US-total, regions, etc.)
    if "location" in df.columns:
        df = df[df["location"].isin(US_STATES)].copy()

    # Parse period to date (first day of month); unparseable periods become NaT
    df["period_month"] = pd.to_datetime(df["period"] + "-01", format="%Y-%m-%d", errors="coerce")

    # Rename and type-cast
    col_map = {
        "location": "state_code",
        "fueltypeid": "fuel_code",
        "sectorid": "sector_code",
        "generation": "generation_mwh",
    }
    df = df.rename(columns={k: v for k, v in col_map.items() if k in df.columns})

    if "generation_mwh" in df.columns:
        df["generation_mwh"] = pd.to_numeric(df["generation_mwh"], errors="coerce")

        # The API reports the unit of each value alongside it. When that unit
        # is "thousand megawatthours", scale to MWh so the column name (and
        # the downstream / 1e6 → TWh conversions) are correct.
        # NOTE(review): rows loaded before this scaling was added are still in
        # the API's unit — re-run a backfill so the table is consistent.
        if "generation-units" in df.columns:
            in_thousand_mwh = (
                df["generation-units"].astype(str).str.strip().str.lower()
                == "thousand megawatthours"
            )
            df.loc[in_thousand_mwh, "generation_mwh"] *= 1000

    # Select final columns (create any the API did not return)
    final_cols = ["period_month", "state_code", "fuel_code", "sector_code", "generation_mwh"]
    for col in final_cols:
        if col not in df.columns:
            df[col] = None

    result = df[final_cols].dropna(subset=["period_month"]).copy()
    result = result.sort_values(["period_month", "state_code", "fuel_code"]).reset_index(drop=True)

    print(f"Cleaned records: {len(result):,}")

    if result.empty:
        # Loading an empty frame with WRITE_TRUNCATE would wipe the table.
        print("No rows left after cleaning; skipping BigQuery load.")
    else:
        print(f"Date range: {result['period_month'].min()} → {result['period_month'].max()}")
        print(f"States: {result['state_code'].nunique()}")
        print(f"Fuels: {result['fuel_code'].nunique()}")

        # ── Load to BigQuery ─────────────────────────────────────────
        table_ref = f"{GCP_PROJECT_ID}.{BQ_DATASET}.fact_monthly_generation"

        if MODE == "backfill":
            write_mode = "WRITE_TRUNCATE"
        else:
            write_mode = "WRITE_APPEND"

            # The incremental fetch starts AT the newest stored month, so that
            # month is fetched again. Remove the overlapping rows first (only
            # for fuels we actually re-fetched) so the append cannot create
            # duplicates. Requires the table to exist (see setup step 1).
            fetched_fuels = sorted(result["fuel_code"].dropna().unique().tolist())
            delete_config = bigquery.QueryJobConfig(
                query_parameters=[
                    bigquery.ScalarQueryParameter(
                        "min_period", "DATE", result["period_month"].min().date()
                    ),
                    bigquery.ArrayQueryParameter("fuel_codes", "STRING", fetched_fuels),
                ]
            )
            delete_sql = (
                f"DELETE FROM `{table_ref}` "
                "WHERE period_month >= @min_period "
                "AND fuel_code IN UNNEST(@fuel_codes)"
            )
            client.query(delete_sql, job_config=delete_config).result()

        job_config = bigquery.LoadJobConfig(write_disposition=write_mode)
        job = client.load_table_from_dataframe(result, table_ref, job_config=job_config)
        job.result()  # block until the load job finishes (raises on failure)

        print(f"\nLoaded {len(result):,} rows to {table_ref} (mode={write_mode})")

In [None]:
# Per-fuel data quality summary of fact_monthly_generation.
# row_count / earliest / latest / null_count look at every stored row.
# total_twh only sums the sector_code = '99' rows: '99' is the all-sectors
# total, so adding it to the individual sector rows would count the same
# generation more than once.
quality_query = f"""
WITH fuel_stats AS (
    SELECT
        g.fuel_code,
        f.fuel_label,
        f.is_renewable,
        COUNT(*) AS row_count,
        MIN(g.period_month) AS earliest,
        MAX(g.period_month) AS latest,
        ROUND(SUM(IF(g.sector_code = '99', g.generation_mwh, NULL)) / 1e6, 1) AS total_twh,
        COUNTIF(g.generation_mwh IS NULL) AS null_count
    FROM `{GCP_PROJECT_ID}.{BQ_DATASET}.fact_monthly_generation` g
    LEFT JOIN `{GCP_PROJECT_ID}.{BQ_DATASET}.dim_fuel_type` f
        ON g.fuel_code = f.fuel_code
    GROUP BY g.fuel_code, f.fuel_label, f.is_renewable
)
SELECT * FROM fuel_stats ORDER BY total_twh DESC
"""

df_quality = client.query(quality_query).to_dataframe()
print("\n=== Data Quality Report: fact_monthly_generation ===")
print(df_quality.to_string(index=False))

In [None]:
import matplotlib.pyplot as plt

# National monthly generation per fuel label, all-sectors rows only, with the
# aggregate fuel labels excluded so the stacked chart does not double count.
viz_query = f"""
SELECT
    g.period_month,
    f.fuel_label,
    f.is_renewable,
    SUM(g.generation_mwh) / 1e6 AS generation_twh
FROM `{GCP_PROJECT_ID}.{BQ_DATASET}.fact_monthly_generation` g
JOIN `{GCP_PROJECT_ID}.{BQ_DATASET}.dim_fuel_type` f
    ON g.fuel_code = f.fuel_code
WHERE g.sector_code = '99'  -- All sectors / total
    AND f.fuel_label NOT IN ('All Fuels', 'All Renewables', 'All Solar')
GROUP BY g.period_month, f.fuel_label, f.is_renewable
ORDER BY g.period_month, generation_twh DESC
"""

df_viz = client.query(viz_query).to_dataframe()

if not df_viz.empty:
    # Pivot for stacked area: one row per month, one column per fuel label
    pivot = df_viz.pivot_table(
        index="period_month",
        columns="fuel_label",
        values="generation_twh",
        aggfunc="sum"
    ).fillna(0)

    # Order columns by total generation (largest fuel first)
    col_order = pivot.sum().sort_values(ascending=False).index
    pivot = pivot[col_order]

    # Color mapping; labels not listed here fall back to grey
    colors = {
        "Natural Gas": "#4393c3",
        "Coal": "#404040",
        "Nuclear": "#9970ab",
        "Wind": "#66c2a5",
        "Hydro": "#3288bd",
        "Solar PV": "#fee08b",
        "Solar": "#fee08b",
        "Conventional Hydro": "#3288bd",
        "Biomass": "#a6d96a",
        "Petroleum": "#d73027",
        "Geothermal": "#f46d43",
    }
    plot_colors = [colors.get(c, "#999999") for c in pivot.columns]

    fig, ax = plt.subplots(figsize=(16, 8))
    pivot.plot.area(ax=ax, stacked=True, color=plot_colors, alpha=0.85, linewidth=0.5)
    ax.set_ylabel("Generation (TWh)")
    ax.set_xlabel("")
    ax.set_title("US Monthly Electricity Generation by Fuel Type", fontsize=14)
    ax.legend(loc="upper left", ncol=3, fontsize=8)
    ax.grid(True, alpha=0.2)
    plt.tight_layout()
    plt.show()

    # Renewable share trend
    df_viz["gen_type"] = df_viz["is_renewable"].map({True: "Renewable", False: "Non-Renewable"})
    # Rows whose is_renewable is missing get no gen_type and cannot be classified.
    share_rows = df_viz.dropna(subset=["gen_type"])

    if share_rows.empty:
        print("No fuels with a renewable flag; skipping renewable share chart.")
    else:
        monthly_share = (
            share_rows.groupby(["period_month", "gen_type"])["generation_twh"]
            .sum()
            .unstack(fill_value=0)
            # Guarantee both columns exist even if one category has no rows.
            .reindex(columns=["Renewable", "Non-Renewable"], fill_value=0)
        )
        month_total = monthly_share["Renewable"] + monthly_share["Non-Renewable"]
        # A zero total yields NaN rather than inf for that month.
        monthly_share["renewable_pct"] = (
            monthly_share["Renewable"] / month_total.where(month_total != 0) * 100
        )

        fig, ax = plt.subplots(figsize=(14, 4))
        ax.plot(monthly_share.index, monthly_share["renewable_pct"], color="#2ca02c", linewidth=2)
        ax.fill_between(monthly_share.index, monthly_share["renewable_pct"], alpha=0.2, color="#2ca02c")
        ax.set_ylabel("Renewable Share (%)")
        ax.set_title("National Renewable Generation Share Over Time")
        ax.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()

        print(f"Latest renewable share: {monthly_share['renewable_pct'].iloc[-1]:.1f}%")
else:
    print("No data available for visualization yet.")