In [122]:
# setting up the project root on sys.path
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent
sys.path.insert(0, str(PROJECT_ROOT))

print("Project root on sys.path:", PROJECT_ROOT)
print("Notebook cwd:", Path.cwd())
print("Root children:", [p.name for p in PROJECT_ROOT.iterdir() if p.is_dir()])


Project root on sys.path: /home/alonbenach/project/invoice-analysis
Notebook cwd: /home/alonbenach/project/invoice-analysis/notebooks
Root children: ['.git', 'outputs', 'data', 'src', 'config', 'outputs_large', 'notebooks', 'balagan']


ETL

In [123]:
# imports
from pathlib import Path
import pandas as pd
import yaml

from src.io_utils import list_csvs, read_csv, write_parquet, ensure_dir
from src.clean_utils import normalize_columns, parse_timestamp, assign_slots, cast_basic_types, basic_checks
from src.viz_utils import save_bar, save_hist, save_box

from src.fc_map_utils import map_fc_products, normalize_text

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [124]:
# 1 Paths
# IMPORTANT: use project-root-relative paths
DATA_DIR = PROJECT_ROOT / "data" / "invoices"
OUT_DIR  = PROJECT_ROOT / "outputs_large" / "audit"
PLOTS    = OUT_DIR / "plots"
CFG_SLOTS = PROJECT_ROOT / "config" / "slots.yaml"
CFG_FC_TH = PROJECT_ROOT / "config" / "fc_mapping_threshold.yaml"

ensure_dir(OUT_DIR); ensure_dir(PLOTS)
pd.options.display.max_columns = 200


In [125]:
# 2 Enumerate shards & quick sanity
csvs = list_csvs(DATA_DIR)
print(f"Found {len(csvs)} CSV files")
pd.Series([p.name for p in csvs]).to_csv(OUT_DIR/"_file_list.csv", index=False, header=["file"])


Found 13 CSV files


In [126]:
# 3 Per-file schema snapshot
rows = []
for p in csvs:
    df = read_csv(p)
    df = normalize_columns(df)
    rows.append({
        "file": p.name,
        "n_rows": len(df),
        "n_cols": len(df.columns),
        "columns": "|".join(df.columns),
    })
schema_overview = pd.DataFrame(rows)
schema_overview.to_csv(OUT_DIR/"schema_overview.csv", index=False)
schema_overview.head()


Unnamed: 0,file,n_rows,n_cols,columns
0,1-10000.csv,10000,14,receipt_id|receipt_number|purchase_date|purcha...
1,100001-110000.csv,10000,14,receipt_id|receipt_number|purchase_date|purcha...
2,10001-20000.csv,10000,14,receipt_id|receipt_number|purchase_date|purcha...
3,110001-120000.csv,10000,14,receipt_id|receipt_number|purchase_date|purcha...
4,120001-123743.csv,3743,14,receipt_id|receipt_number|purchase_date|purcha...


In [127]:
# 4 Schema parity check (simple)
# Expectation: same ordered set of columns across shards
cols_sets = schema_overview["columns"].unique()
parity_ok = (len(cols_sets) == 1)
pd.DataFrame({
    "schema_unique_count":[len(cols_sets)],
    "parity_ok":[parity_ok]
}).to_csv(OUT_DIR/"schema_parity_report.csv", index=False)
parity_ok, len(cols_sets)


(True, 1)

In [128]:
# 5 Load & concatenate all shards
dfs = []
for p in csvs:
    df = normalize_columns(read_csv(p))
    dfs.append(df)
raw = pd.concat(dfs, ignore_index=True)
raw.to_parquet(OUT_DIR/"_raw_concat_preview.parquet", index=False)  # breadcrumb
raw.shape, raw.columns.tolist()[:10]


((123743, 14),
 ['receipt_id',
  'receipt_number',
  'purchase_date',
  'purchase_time',
  'product_line',
  'ean',
  'product_name',
  'qty',
  'unit_price_gross',
  'vat_rate'])

In [129]:
# 6 Cast types & basic checks
df = cast_basic_types(raw)
chk = basic_checks(df)
pd.DataFrame([chk]).to_csv(OUT_DIR/"basic_checks.csv", index=False)
df.head(3)


Unnamed: 0,receipt_id,receipt_number,purchase_date,purchase_time,product_line,ean,product_name,qty,unit_price_gross,vat_rate,unit_price_net,discount,cashier,payment_method
0,31006967,nr: 100213,2025-09-01,07:07,KAJZERKA xxl 95g-C,,,2.0,0.79,5.0,0.75,,kasjer_22,Card
1,31007786,nr:69912,2025-09-01,08:37,BAGIETKA SZ JAJKO-C,,,1.0,11.99,5.0,11.42,,kasjer_0,Card
2,31007821,nr : 76852,2025-09-01,08:57,MLEKO 500g GOSTYN-C,5900691000000.0,SM Gostyń Mleko gostyńskie zagęszczone lekkie ...,1.0,6.99,5.0,6.66,,kasjer_6,Cash


In [130]:
# 7 Nulls by column
nulls = df.isna().mean().sort_values(ascending=False)
nulls.to_csv(OUT_DIR/"null_matrix.csv", header=["null_ratio"])
save_bar(nulls.head(25), "Null ratio by column (top 25)", PLOTS/"nulls_by_col.png")
nulls.head(15)


discount            0.999798
vat_rate            0.042572
product_name        0.004905
ean                 0.000533
receipt_number      0.000000
receipt_id          0.000000
product_line        0.000000
purchase_time       0.000000
purchase_date       0.000000
qty                 0.000000
unit_price_gross    0.000000
unit_price_net      0.000000
cashier             0.000000
payment_method      0.000000
dtype: float64

In [131]:
# 8 Duplicate checks
dup_line_count = int(df.duplicated().sum())
dup_cols_key = ["receipt_id","purchase_date","purchase_time","ean","product_name","qty","unit_price_gross"]
dup_key_count = None
if all(c in df.columns for c in dup_cols_key):
    dup_key_count = int(df.duplicated(subset=dup_cols_key).sum())

pd.DataFrame([{
    "duplicate_lines": dup_line_count,
    "duplicate_key_rows": dup_key_count
}]).to_csv(OUT_DIR/"duplicate_summary.csv", index=False)
dup_line_count, dup_key_count


(0, 0)

In [132]:
# 9 Value sanity (qty & price)
if "qty" in df.columns:
    save_hist(df["qty"], bins=50, title="Quantity distribution", outpath=PLOTS/"qty_hist.png")
if "unit_price_gross" in df.columns:
    save_box(df["unit_price_gross"], title="Unit price (gross) — boxplot (log)", outpath=PLOTS/"price_boxplot.png", log=True)


In [133]:
# 10 Timestamp parsing + slot mapping
tz = yaml.safe_load(CFG_SLOTS.read_text())["timezone"]
df = parse_timestamp(df, tz)
df = assign_slots(df, CFG_SLOTS)

slot_counts = df["slot_label"].value_counts(dropna=False)
slot_counts.to_csv(OUT_DIR/"slot_distribution.csv", header=["count"])
save_bar(slot_counts, "Receipts by slot (line-level incidence)", PLOTS/"receipts_by_slot.png")
slot_counts


  return pd.to_datetime(s, errors="coerce").dt.time
  return pd.to_datetime(s, errors="coerce").dt.time


slot_label
None                 65897
Lunch time           21451
After work           20655
Morning groceries     8543
Going to work         7153
Probably outliers       44
Name: count, dtype: int64

In [None]:
#  11. FC mapping coverage (final, simple + deterministic) 
# Load canonical
canonical_path = PROJECT_ROOT / "data" / "refs" / "zabka_food_corner_menu_canonical.csv"
canonical = pd.read_csv(canonical_path)

# Build mapping over UNIQUE product names
mapping = map_fc_products(df, canonical, threshold=70)

# Save the mapping
MAP_OUT = PROJECT_ROOT / "data" / "refs" / "auto_fc_mapping_from_menu.csv"
mapping.to_csv(MAP_OUT, index=False)
print("Saved mapping:", MAP_OUT)

#  Compute line-level coverage by joining back on normalized product_name 
df2 = df.copy()
df2["product_norm"] = normalize_text(df2["product_name"])
mp = mapping[["product_norm","is_food_corner_auto","match_category","best_match_item","score"]].drop_duplicates("product_norm")
joined = df2.merge(mp, on="product_norm", how="left")

line_coverage = joined["is_food_corner_auto"].notna().mean()
fc_rate       = joined["is_food_corner_auto"].fillna(False).mean()

print(f"Line-level mapping coverage: {line_coverage:.3f}")
print(f"Lines flagged as FC (auto):  {fc_rate:.3f}")

# Save summaries
pd.DataFrame([{
    "line_coverage": float(line_coverage),
    "fc_rate": float(fc_rate),
    "threshold": 70
}]).to_csv(OUT_DIR / "fc_coverage_summary.csv", index=False)

# Unmapped examples for inspection
unmapped = (joined.loc[joined["is_food_corner_auto"].isna(), ["product_name","product_line"]]
                 .drop_duplicates()
                 .head(50))
unmapped.to_csv(OUT_DIR / "fc_unmapped_examples.csv", index=False)
unmapped.head(10)
coverage = float(line_coverage)


Saved mapping: /home/alonbenach/project/invoice-analysis/data/refs/auto_fc_mapping_from_menu.csv
Line-level mapping coverage: 0.995
Lines flagged as FC (auto):  0.053


  fc_rate       = joined["is_food_corner_auto"].fillna(False).mean()


Unnamed: 0,product_name,product_line
0,,KAJZERKA xxl 95g-C
1,,BAGIETKA SZ JAJKO-C
3,,BAGIETKA WLOS KURCZ 225g-C
6,,KUB DO BUB TEA COCO MALIN-B
7,,SUSHI WRAP Z PASTA Z TUNC-C
8,,PEPPERON NA CIEPLO-B
9,,R LOD/NUGGETS 2700-8 RZEDAŻ
12,,PANINI Z WARZ I FETA 207g-B
14,,LIZ W PLYNIE J 40g-A
16,,STARTER PLUS INTERNET 5 ZL-A


In [None]:
# 12. Coverage + simple gauge
try:
    _ = line_coverage
    _ = fc_rate
except NameError:
    _sum = pd.read_csv(OUT_DIR / "fc_coverage_summary.csv")
    line_coverage = float(_sum.loc[0, "line_coverage"])
    fc_rate       = float(_sum.loc[0, "fc_rate"])

print(f"Line-level mapping coverage: {line_coverage:.3f}")
print(f"Lines flagged as FC (auto):  {fc_rate:.3f}")

# Save a simple “gauge” bar
save_bar(
    pd.Series({"mapped": line_coverage, "unmapped": 1 - line_coverage}),
    title="Line-level mapping coverage",
    outpath=PLOTS / "fc_line_coverage.png"
)

save_bar(
    pd.Series({"FC": fc_rate, "Non-FC": 1 - fc_rate}),
    title="Lines flagged as Food Corner (auto)",
    outpath=PLOTS / "fc_flag_rate.png"
)


Line-level mapping coverage: 0.995
Lines flagged as FC (auto):  0.053


In [137]:
# 13 Write audit summary (machine + human)
summary = {
    "rows": len(df),
    "cols": len(df.columns),
    "duplicates_lines": int(df.duplicated().sum()),
}
if "slot_label" in df.columns:
    summary["slot_unknown"] = int(df["slot_label"].isna().sum())

pd.DataFrame([summary]).to_csv(OUT_DIR/"health_summary.csv", index=False)

# Minimal human-readable markdown note (you can edit it manually after first run)
md = f"""# Data Health Summary

- Rows: {summary['rows']}
- Cols: {summary['cols']}
- Duplicate lines: {summary['duplicates_lines']}
- Slot unknown: {summary.get('slot_unknown','N/A')}

Artifacts:
- Nulls: outputs_large/audit/null_matrix.csv
- Duplicates: outputs_large/audit/duplicate_summary.csv
- Slot distribution plot: outputs_large/audit/plots/receipts_by_slot.png
- Unit price boxplot: outputs_large/audit/plots/price_boxplot.png
- FC coverage: outputs_large/audit/fc_coverage_summary.csv
"""
(Path(OUT_DIR)/"health_summary.md").write_text(md, encoding="utf-8")
"OK"


'OK'

In [142]:
# post process santity checks
# 10 random FC lines — sanity check they’re truly prepared items
joined.query("is_food_corner_auto == True")[["product_name","product_line","match_category","best_match_item","score"]].sample(10, random_state=1)

# 10 random non-FC lines — make sure no obvious FC slipped through
joined.query("is_food_corner_auto != True")[["product_name","product_line"]].sample(10, random_state=2)

# Top FC categories by count/value (preview for Stage 2)
fc = joined[joined["is_food_corner_auto"] == True]
fc.groupby("match_category").size().sort_values(ascending=False).head(10)

# FC share by time slot (early signal)
fc_share_by_slot = (
    joined.assign(is_fc=joined["is_food_corner_auto"].fillna(False))
          .groupby("slot_label")["is_fc"].mean()
          .sort_index()
)
fc_share_by_slot


  joined.assign(is_fc=joined["is_food_corner_auto"].fillna(False))


slot_label
After work           0.037328
Going to work        0.050748
Lunch time           0.066384
Morning groceries    0.076203
Probably outliers    0.068182
Name: is_fc, dtype: float64

In [143]:
fc

Unnamed: 0,receipt_id,receipt_number,purchase_date,purchase_time,product_line,ean,product_name,qty,unit_price_gross,vat_rate,unit_price_net,discount,cashier,payment_method,ts,hour_minute,slot_id,slot_label,product_norm,is_food_corner_auto,match_category,best_match_item,score
18,31012512,798476,2025-09-01,13:03,HOT-DOG Z KIELB. BEKON - GX-B,2.010006e+07,hot dog,1.0,6.99,8.0,6.47,,kasjer_11,Card,2025-01-09 13:03:00+01:00,13:03,3.0,Lunch time,hot dog,True,Hot Dog,Hot dog z parówką z szynki,100.0
21,31013370,nr: 631063,2025-09-01,14:21,HOT-DOG Z KABANOSEM GX-B,2.010006e+07,hot dog,1.0,6.99,8.0,6.47,,kasjer_2,Cash,2025-01-09 14:21:00+01:00,14:21,3.0,Lunch time,hot dog,True,Hot Dog,Hot dog z parówką z szynki,100.0
32,31016217,nr: 67544,2025-09-01,14:36,HOT DOG Z PAR. Z SZYN. XXL-B,2.010006e+07,hot dog,1.0,3.99,8.0,3.69,,kasjer_1,Cash,2025-01-09 14:36:00+01:00,14:36,3.0,Lunch time,hot dog,True,Hot Dog,Hot dog z parówką z szynki,100.0
38,31016668,496086,2025-09-01,15:13,HOT DOG PAR Z SZYN-B,2.010006e+07,hot dog,1.0,6.99,8.0,6.47,,kasjer_1211,Card,2025-01-09 15:13:00+01:00,15:13,3.0,Lunch time,hot dog,True,Hot Dog,Hot dog z parówką z szynki,100.0
47,31016878,nr:496094,2025-09-01,15:24,HOT DOG MAXX GRILL-B,2.010006e+07,hot dog,1.0,8.99,8.0,8.32,,kasjer_1211,Cash,2025-01-09 15:24:00+01:00,15:24,3.0,Lunch time,hot dog,True,Hot Dog,Hot dog z parówką z szynki,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123681,32666394,nr:656524,2025-09-19,09:36,HOT DOG MAXX-B,2.010006e+07,hot dog,1.0,8.99,8.0,8.32,,kasjer_15,Card,NaT,,,,hot dog,True,Hot Dog,Hot dog z parówką z szynki,100.0
123683,32666401,27470,2025-09-11,20:27,KANAPKA TROJK 200g-B,8.576029e+12,Kanapka trojkont jajko bekon Tomcio Paluch 200g,1.0,11.50,8.0,10.65,,kasjer_3,Cash,2025-11-09 20:27:00+01:00,20:27,4.0,After work,kanapka trojkont jajko bekon tomcio paluch 200g,True,Kanapki,Kanapka Gyros,70.0
123691,32666429,nr: 409301,2025-09-27,13:06,KANAPKA TROJ PRO K-C,8.586020e+12,Kanapka Trojkat Tomcio Paluch 210g,1.0,11.50,5.0,10.95,,kasjer_4,Card,NaT,,,,kanapka trojkat tomcio paluch 210g,True,Kanapki,Kanapka Gyros,70.0
123709,32666524,407072,2025-09-05,13:34,TOMCIO KAJZ KEBAB-C,8.586015e+12,Bagietka kebab Tomcio Paluch,1.0,6.99,5.0,6.66,,kasjer_5,Cash,2025-05-09 13:34:00+02:00,13:34,3.0,Lunch time,bagietka kebab tomcio paluch,True,Panini,Bagietka Cezar,72.0
