# Normalized data inspection (local)

This notebook inspects how `data/dataset.json` is normalized into an in-memory `MenuIndex`.

It uses:
- `src.bootstrap.load_index()`
- `src.inspect` view builders (`items_rows`, `prices_rows`, `categories_rows`, `discounts_rows`, `summary`)
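- Optional pandas DataFrames (`items_df`, etc.), available if `pandas` is installed. Note: the setup cell below imports `pandas` unconditionally, so this notebook as written requires it.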


In [1]:
# Setup (local): ensure paths resolve correctly
import os
import sys
from pathlib import Path
import pandas as pd


def _find_repo_root(start: Path) -> Path:
    cur = start.resolve()
    for _ in range(6):
        if (cur / "pyproject.toml").exists() and (cur / "src").exists():
            return cur
        if cur.parent == cur:
            break
        cur = cur.parent
    return start.resolve()


# Anchor everything on the checkout discovered from the current directory.
repo_root = _find_repo_root(Path.cwd())

# Prepend the checkout to the import path (only once) so that `src` can be
# imported even when Jupyter was launched from somewhere else.
if sys.path.count(str(repo_root)) == 0:
    sys.path.insert(0, str(repo_root))

# Relative paths such as data/... are resolved against the checkout from here on.
os.chdir(repo_root)

# Single knob: point this at the dataset if it is stored somewhere else.
dataset_path = repo_root.joinpath("data", "dataset.json")

# Fail early, with a hint, rather than letting a later cell hit a missing file.
if not dataset_path.exists():
    raise FileNotFoundError(
        f"Expected dataset at {dataset_path}. "
        "If your dataset is in a different location, update dataset_path in this cell."
    )

# Echo the resolved locations so the reader can confirm them at a glance.
print("Repo root:", repo_root)
print("CWD:", Path.cwd())
print("Dataset:", dataset_path)


Repo root: /Users/andrestroiano/code/personal/qu-challenge
CWD: /Users/andrestroiano/code/personal/qu-challenge
Dataset: /Users/andrestroiano/code/personal/qu-challenge/data/dataset.json


In [2]:
# Quick sanity check: echo the resolved repo root and dataset path as the cell result.
repo_root, dataset_path

(PosixPath('/Users/andrestroiano/code/personal/qu-challenge'),
 PosixPath('/Users/andrestroiano/code/personal/qu-challenge/data/dataset.json'))

In [3]:
# Build the in-memory index from the dataset file.
# (Imported here rather than at the top because the setup cell has to put the
# repo root on sys.path before `src` can be resolved.)
from src.bootstrap import load_index

# load_index receives the absolute path computed in the setup cell, as a plain string.
idx = load_index(str(dataset_path))

# Collection sizes, in order: items, categories, discounts.
tuple(map(len, (idx.items, idx.categories, idx.discounts)))


(46, 7, 9)

In [4]:
# Summary coverage stats for the loaded index: item/category/discount counts
# plus price, portion and calorie coverage (see the dict in the cell output).
from src.inspect import summary

summary(idx)


{'num_items': 46,
 'num_categories': 7,
 'num_discounts': 9,
 'items_with_prices': 46,
 'items_with_portions': 15,
 'calories_structured': 3,
 'calories_parsed': 5,
 'calories_missing_or_null': 38}

In [None]:
# Build one DataFrame per inspection view from the loaded index.
from src import inspect as insp

# These four names are reused by the preview cells that follow.
items = insp.items_df(idx)
prices = insp.prices_df(idx)
cats = insp.categories_df(idx)
discs = insp.discounts_df(idx)


In [11]:
# First three rows of the items view.
items.head(3)

Unnamed: 0,item_id,name,title,category_path,category_leaf,item_path_key,num_prices,has_portions,portions,min_price,max_price,calories,calories_source,num_applicable_discounts,applicable_discount_ids,has_description
0,79885,DESSERT BOWL,Bowls - DESSERT BOWL,Bowls,Bowls,73915-73970-79885,1,False,,9.49,9.49,,missing,0,,False
1,73980,DRAGON BOWL,Bowls - DRAGON BOWL,Bowls,Bowls,73915-73970-73980,2,True,"Large, Medium",14.49,15.99,650.0,structured,2,"77374, 79380",True
2,74100,GREEN BOWL,Bowls - GREEN BOWL,Bowls,Bowls,73915-73970-74100,2,True,"Large, Medium",14.49,15.99,,missing,2,"77374, 79380",True


In [12]:
# First three rows of the prices view.
prices.head(3)

Unnamed: 0,item_id,name,portion,price,category_path,item_path_key
0,74246,ACAI ELIXIR,,8.49,Smoothies,73915-73971-74246
1,74526,Bottle Ginger Energy Shot,,4.0,Cold Pressed Bottles,73915-73974-74526
2,79766,C ME UP,Large,9.79,Fresh Juices,73915-73972-79766


In [13]:
# First three rows of the categories view.
cats.head(3)

Unnamed: 0,category_id,title,category_path,leaf,item_count_by_leaf
0,73970,Bowls,Bowls,Bowls,7
1,73974,Cold Pressed Bottles,Cold Pressed Bottles,Cold Pressed Bottles,7
2,73972,Fresh Juices,Fresh Juices,Fresh Juices,9


In [14]:
# First three rows of the discounts view.
discs.head(3)

Unnamed: 0,discount_id,name,raw_keys,has_coupon_hint
0,79380,$1 OFF any Bowl or Smoothie,"amount, applicationOrder, applyToBasePriceOnly...",False
1,77374,$3.00 Off LG Bowl,"amount, applicationOrder, awardItemsType, cate...",True
2,79370,2 SM Bowls for $20,"amount, applicationOrder, applyToBasePriceOnly...",False


In [15]:
# Optional: write CSV/JSONL inspection artifacts to out/.
# This writes files, so it is left commented out; uncomment the three lines below to run it.

# from src.export import export_all
# export_all(inp=str(dataset_path), out_dir=str(repo_root / "out"))
# print("Wrote inspection artifacts to:", repo_root / "out")


## Notes

- The normalized data lives in memory as Pydantic models under `idx.items`, `idx.categories`, and `idx.discounts`.
- The exporter (`python -m src.export ...`) writes *flattened views* only (no large raw blobs).


In [19]:
# Same three-row items preview, serialized as a JSON string with one object per row.
items.head(3).to_json(orient="records")

'[{"item_id":79885,"name":"DESSERT BOWL","title":"Bowls - DESSERT BOWL","category_path":"Bowls","category_leaf":"Bowls","item_path_key":"73915-73970-79885","num_prices":1,"has_portions":false,"portions":"","min_price":9.49,"max_price":9.49,"calories":null,"calories_source":"missing","num_applicable_discounts":0,"applicable_discount_ids":"","has_description":false},{"item_id":73980,"name":"DRAGON BOWL","title":"Bowls - DRAGON BOWL","category_path":"Bowls","category_leaf":"Bowls","item_path_key":"73915-73970-73980","num_prices":2,"has_portions":true,"portions":"Large, Medium","min_price":14.49,"max_price":15.99,"calories":650.0,"calories_source":"structured","num_applicable_discounts":2,"applicable_discount_ids":"77374, 79380","has_description":true},{"item_id":74100,"name":"GREEN BOWL","title":"Bowls - GREEN BOWL","category_path":"Bowls","category_leaf":"Bowls","item_path_key":"73915-73970-74100","num_prices":2,"has_portions":true,"portions":"Large, Medium","min_price":14.49,"max_price"

In [20]:
# Same three-row prices preview, serialized as a JSON string with one object per row.
prices.head(3).to_json(orient="records")

'[{"item_id":74246,"name":"ACAI ELIXIR","portion":null,"price":8.49,"category_path":"Smoothies","item_path_key":"73915-73971-74246"},{"item_id":74526,"name":"Bottle Ginger Energy Shot","portion":null,"price":4.0,"category_path":"Cold Pressed Bottles","item_path_key":"73915-73974-74526"},{"item_id":79766,"name":"C ME UP","portion":"Large","price":9.79,"category_path":"Fresh Juices","item_path_key":"73915-73972-79766"}]'

In [21]:
# Same three-row categories preview, serialized as a JSON string with one object per row.
cats.head(3).to_json(orient="records")

'[{"category_id":73970,"title":"Bowls","category_path":"Bowls","leaf":"Bowls","item_count_by_leaf":7},{"category_id":73974,"title":"Cold Pressed Bottles","category_path":"Cold Pressed Bottles","leaf":"Cold Pressed Bottles","item_count_by_leaf":7},{"category_id":73972,"title":"Fresh Juices","category_path":"Fresh Juices","leaf":"Fresh Juices","item_count_by_leaf":9}]'

In [22]:
# Same three-row discounts preview, serialized as a JSON string with one object per row.
discs.head(3).to_json(orient="records")

'[{"discount_id":79380,"name":"$1 OFF any Bowl or Smoothie","raw_keys":"amount, applicationOrder, applyToBasePriceOnly, awardItemsType, categoryId, checkTitle, companyId, deleted, displayOrder, effectivity, hasItems, id, isItemDiscount, maximumUsages, targetItems, triggerItems, typeId","has_coupon_hint":false},{"discount_id":77374,"name":"$3.00 Off LG Bowl","raw_keys":"amount, applicationOrder, awardItemsType, categoryId, checkTitle, companyId, couponCode, deleted, displayOrder, effectivity, hasItems, id, isItemDiscount, maximumUsages, targetItems, triggerItems, typeId","has_coupon_hint":true},{"discount_id":79370,"name":"2 SM Bowls for $20","raw_keys":"amount, applicationOrder, applyToBasePriceOnly, autoApply, awardItemsType, categoryId, checkTitle, companyId, deleted, displayOrder, effectivity, hasItems, id, isItemDiscount, targetItems, triggerItems, typeId","has_coupon_hint":false}]'