Notebook 0 — Data Sanity & Project Setup
Purpose

Confirm BigQuery connectivity and credentials.

Inspect table schemas & nulls.

Establish true date bounds of the dataset.

Identify valid (shop_id, item_id) pairs and their coverage.

In [1]:
# --- Cell 1: imports & runtime info ---
from pathlib import Path
import os, sys
import pandas as pd

from google.cloud import bigquery
from google.oauth2 import service_account

print("Python:", sys.executable)
print("CWD   :", Path.cwd())

Python: /home/btheard/retail-alpha-forecaster/.venv/bin/python
CWD   : /home/btheard/retail-alpha-forecaster/notebooks


In [2]:
# --- Cell 2: config (edit only if names differ) ---

PROJECT = "retail-alpha-forecaster"
DATASET = "raf"
FC_TABLE = f"`{PROJECT}.{DATASET}.forecasts`"     # model outputs (from NB 3)
BK_TABLE = f"`{PROJECT}.{DATASET}.backtests`"    # backtest outputs (from NB 4)

# Try to resolve SA key in a notebook-safe way.
# Adjust the filename if yours differs.
CANDIDATE_PATHS = [
    Path.cwd() / "keys" / "retail-alpha-forecaster-7f14a7b50e62.json",
    Path.cwd().parents[0] / "keys" / "retail-alpha-forecaster-7f14a7b50e62.json",
    Path.home() / "retail-alpha-forecaster" / "keys" / "retail-alpha-forecaster-7f14a7b50e62.json",
]

KEY_PATH = next((p for p in CANDIDATE_PATHS if p.exists()), None)

# If you prefer env var style, set GOOGLE_APPLICATION_CREDENTIALS and skip KEY_PATH logic.
ENV_CRED = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")

if KEY_PATH:
    print("Using key file:", KEY_PATH)
    creds = service_account.Credentials.from_service_account_file(str(KEY_PATH))
elif ENV_CRED and Path(ENV_CRED).exists():
    print("Using env GOOGLE_APPLICATION_CREDENTIALS:", ENV_CRED)
    creds = service_account.Credentials.from_service_account_file(ENV_CRED)
else:
    raise FileNotFoundError(
        "Service account key not found. Put it in ./keys/ or set GOOGLE_APPLICATION_CREDENTIALS."
    )

# Create BigQuery client (set your location if needed)
client = bigquery.Client(project=PROJECT, credentials=creds, location="US")
client.query("SELECT 1").result()
print("✅ BigQuery client ready")

Using key file: /home/btheard/retail-alpha-forecaster/keys/retail-alpha-forecaster-7f14a7b50e62.json
✅ BigQuery client ready


In [3]:
# --- Cell 3: query helper ---
def q(sql: str) -> pd.DataFrame:
    """Run a SQL statement and return a pandas DataFrame."""
    job = client.query(sql)
    return job.result().to_dataframe()

In [4]:
# --- Cell 4: health checks ---
print("Table:", FC_TABLE)

df_meta = q(f"""
SELECT
  COUNT(*)                         AS n_rows,
  MIN(date)                        AS min_date,
  MAX(date)                        AS max_date,
  COUNTIF(shop_id IS NULL)         AS null_shop_rows,
  COUNTIF(item_id IS NULL)         AS null_item_rows
FROM {FC_TABLE}
""")
df_meta



Table: `retail-alpha-forecaster.raf.forecasts`


Unnamed: 0,n_rows,min_date,max_date,null_shop_rows,null_item_rows
0,25374,2015-10-01,2015-10-31,0,0


In [5]:
# --- Cell 5: coverage summary (non-null shop & item) ---
q(f"""
SELECT
  COUNT(*)                                                 AS n_rows,
  COUNTIF(shop_id IS NULL)                                 AS null_shop_rows,
  COUNTIF(item_id IS NULL)                                 AS null_item_rows,
  MIN(date)                                                AS min_date,
  MAX(date)                                                AS max_date,
  COUNT(DISTINCT shop_id)                                  AS n_shops,
  COUNT(DISTINCT item_id)                                  AS n_items,
  COUNT(DISTINCT CONCAT(CAST(shop_id AS STRING),':',CAST(item_id AS STRING))) AS n_pairs
FROM {FC_TABLE}
WHERE shop_id IS NOT NULL AND item_id IS NOT NULL
""")



Unnamed: 0,n_rows,null_shop_rows,null_item_rows,min_date,max_date,n_shops,n_items,n_pairs
0,25374,0,0,2015-10-01,2015-10-31,41,1374,4996


In [6]:
# --- Cell 6: pairs & windows (drives app dropdowns & date-pickers) ---

pairs = q(f"""
SELECT shop_id, item_id, COUNT(*) AS n_rows
FROM {FC_TABLE}
WHERE shop_id IS NOT NULL AND item_id IS NOT NULL
GROUP BY shop_id, item_id
ORDER BY n_rows DESC
""")
pairs.head(10)

Unnamed: 0,shop_id,item_id,n_rows
0,25,20949,62
1,31,20949,62
2,42,20949,62
3,26,20949,60
4,28,20949,60
5,47,20949,60
6,57,20949,60
7,21,20949,58
8,39,20949,58
9,55,13097,58


In [7]:
# --- Cell 6b: min/max window per pair ---
pair_windows = q(f"""
SELECT
  shop_id,
  item_id,
  MIN(date) AS min_date,
  MAX(date) AS max_date,
  COUNT(*)  AS n_rows
FROM {FC_TABLE}
WHERE shop_id IS NOT NULL AND item_id IS NOT NULL
GROUP BY shop_id, item_id
ORDER BY n_rows DESC
""")
pair_windows.head(10)


Unnamed: 0,shop_id,item_id,min_date,max_date,n_rows
0,25,20949,2015-10-01,2015-10-31,62
1,31,20949,2015-10-01,2015-10-31,62
2,42,20949,2015-10-01,2015-10-31,62
3,26,20949,2015-10-01,2015-10-31,60
4,28,20949,2015-10-01,2015-10-31,60
5,47,20949,2015-10-01,2015-10-31,60
6,57,20949,2015-10-01,2015-10-31,60
7,21,20949,2015-10-01,2015-10-31,58
8,39,20949,2015-10-01,2015-10-31,58
9,55,13097,2015-10-02,2015-10-31,58


In [8]:
# --- Cell 7: sanity checks on yhat & bands ---
q(f"""
SELECT
  COUNTIF(yhat IS NULL)                         AS null_yhat,
  COUNTIF(yhat < 0)                             AS neg_yhat,
  COUNTIF(yhat_lower > yhat_upper)              AS swapped_bands,
  COUNT(DISTINCT DATE(date))                    AS n_days
FROM {FC_TABLE}
WHERE shop_id IS NOT NULL AND item_id IS NOT NULL
""")


Unnamed: 0,null_yhat,neg_yhat,swapped_bands,n_days
0,0,4,0,31


In [9]:
# --- Cell 7b: check duplicate keys (date, shop_id, item_id) ---
dupes = q(f"""
SELECT date, shop_id, item_id, COUNT(*) AS cnt
FROM {FC_TABLE}
WHERE shop_id IS NOT NULL AND item_id IS NOT NULL
GROUP BY date, shop_id, item_id
HAVING COUNT(*) > 1
ORDER BY cnt DESC, date
""")
dupes.head(20), len(dupes)


(          date  shop_id  item_id  cnt
 0   2015-10-01        2     7894    2
 1   2015-10-01        2    17717    2
 2   2015-10-01        3     5671    2
 3   2015-10-01        3     6738    2
 4   2015-10-01        3     6740    2
 5   2015-10-01        3    17717    2
 6   2015-10-01        4     3731    2
 7   2015-10-01        4     7736    2
 8   2015-10-01        4    17717    2
 9   2015-10-01        4    20949    2
 10  2015-10-01        5     3733    2
 11  2015-10-01        5     4351    2
 12  2015-10-01        5     7018    2
 13  2015-10-01        5     7791    2
 14  2015-10-01        5    14227    2
 15  2015-10-01        5    15045    2
 16  2015-10-01        5    16287    2
 17  2015-10-01        5    20949    2
 18  2015-10-01        6       31    2
 19  2015-10-01        6     4181    2,
 12687)

In [10]:
# --- Cell 8: create a clean view in BigQuery (idempotent) ---

sql = f"""
CREATE OR REPLACE VIEW `{PROJECT}.{DATASET}.v_forecasts_clean` AS
SELECT
  date,
  shop_id,
  item_id,
  model,
  yhat,
  LEAST(yhat_lower, yhat_upper)   AS yhat_lower,   -- fix swapped bands
  GREATEST(yhat_lower, yhat_upper) AS yhat_upper,
  created_at
FROM (
  SELECT
    *,
    ROW_NUMBER() OVER (
      PARTITION BY date, shop_id, item_id
      ORDER BY
        CASE WHEN model = 'lightgbm' THEN 0 ELSE 1 END,
        created_at DESC
    ) AS _rn
  FROM {FC_TABLE}
  WHERE shop_id IS NOT NULL AND item_id IS NOT NULL
)
WHERE _rn = 1
"""
client.query(sql).result()
print("✅ Created/updated view:", f"`{PROJECT}.{DATASET}.v_forecasts_clean`")




✅ Created/updated view: `retail-alpha-forecaster.raf.v_forecasts_clean`


In [11]:
# --- Cell 9: pair & list helper views (for Streamlit dropdowns quickly) ---

sql_pairs = f"""
CREATE OR REPLACE VIEW `{PROJECT}.{DATASET}.v_forecast_pairs` AS
SELECT
  shop_id,
  item_id,
  MIN(date) AS min_date,
  MAX(date) AS max_date,
  COUNT(*)  AS n_rows
FROM `{PROJECT}.{DATASET}.v_forecasts_clean`
GROUP BY shop_id, item_id
"""
client.query(sql_pairs).result()

sql_lists = f"""
CREATE OR REPLACE VIEW `{PROJECT}.{DATASET}.v_shops_items` AS
WITH s AS (SELECT DISTINCT shop_id FROM `{PROJECT}.{DATASET}.v_forecasts_clean`),
     i AS (SELECT DISTINCT item_id FROM `{PROJECT}.{DATASET}.v_forecasts_clean`)
SELECT * FROM s, i
"""
client.query(sql_lists).result()

print("✅ Created/updated views:",
      f"`{PROJECT}.{DATASET}.v_forecast_pairs`",
      "and",
      f"`{PROJECT}.{DATASET}.v_shops_items`")


✅ Created/updated views: `retail-alpha-forecaster.raf.v_forecast_pairs` and `retail-alpha-forecaster.raf.v_shops_items`


In [12]:
# --- Cell 10: preview one pair (edit IDs and re-run) ---
SHOP, ITEM = 25, 20949   # <-- change to any pair from 'pairs' output above

preview = q(f"""
SELECT date, yhat, yhat_lower, yhat_upper, model, created_at
FROM `{PROJECT}.{DATASET}.v_forecasts_clean`
WHERE shop_id = {SHOP} AND item_id = {ITEM}
ORDER BY date
""")
preview.head(20)


Unnamed: 0,date,yhat,yhat_lower,yhat_upper,model,created_at
0,2015-10-01,11.32983,7.72822,14.93144,lightgbm,2025-08-17 16:15:26.225742+00:00
1,2015-10-02,15.140896,11.539286,18.742506,lightgbm,2025-08-17 16:15:26.225742+00:00
2,2015-10-03,13.465129,9.863519,17.066739,lightgbm,2025-08-17 16:15:26.225742+00:00
3,2015-10-04,15.485568,11.883958,19.087178,lightgbm,2025-08-17 16:15:26.225742+00:00
4,2015-10-05,10.192873,6.591263,13.794483,lightgbm,2025-08-17 16:15:26.225742+00:00
5,2015-10-06,12.013605,8.411995,15.615215,lightgbm,2025-08-17 16:15:26.225742+00:00
6,2015-10-07,9.74395,6.14234,13.34556,lightgbm,2025-08-17 16:15:26.225742+00:00
7,2015-10-08,10.896068,7.294458,14.497678,lightgbm,2025-08-17 16:15:26.225742+00:00
8,2015-10-09,14.232312,10.630702,17.833922,lightgbm,2025-08-17 16:15:26.225742+00:00
9,2015-10-10,21.342919,17.741309,24.944529,lightgbm,2025-08-17 16:15:26.225742+00:00


In [13]:
# --- Cell 11: (optional) cache useful tables locally for faster iteration ---
pairs.to_csv("pairs.csv", index=False)
pair_windows.to_csv("pair_windows.csv", index=False)

shops = q(f"SELECT DISTINCT shop_id FROM `{PROJECT}.{DATASET}.v_forecasts_clean` ORDER BY shop_id")
items = q(f"SELECT DISTINCT item_id FROM `{PROJECT}.{DATASET}.v_forecasts_clean` ORDER BY item_id")
shops.to_csv("shops.csv", index=False)
items.to_csv("items.csv", index=False)

print("Saved: pairs.csv, pair_windows.csv, shops.csv, items.csv")


Saved: pairs.csv, pair_windows.csv, shops.csv, items.csv
