In [None]:
# Path bootstrap: make project root + src importable
from pathlib import Path
import sys

PROJECT_ROOT = Path.cwd().parent
sys.path.insert(0, str(PROJECT_ROOT))

print("Project root:", PROJECT_ROOT)


Project root: /home/alonbenach/project/invoice-analysis


In [None]:
# 1 Imports, constants, dirs
from pathlib import Path
import pandas as pd
import numpy as np
import yaml

from src.io_utils import list_csvs, read_csv, ensure_dir
from src.clean_utils import normalize_columns, cast_basic_types, parse_timestamp, assign_slots
from src.fc_map_utils import normalize_text  
from src.viz_utils import save_bar, save_hist, save_box

%load_ext autoreload
%autoreload 2

DATA_DIR   = PROJECT_ROOT / "data" / "invoices"
REF_DIR    = PROJECT_ROOT / "data" / "refs"
OUT_DIR    = PROJECT_ROOT / "outputs_large" / "core"
PLOTS      = OUT_DIR / "plots"
CFG_SLOTS  = PROJECT_ROOT / "config" / "slots.yaml"

ensure_dir(OUT_DIR); ensure_dir(PLOTS)
pd.options.display.max_columns = 200


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [35]:
# 2 Load and prep base dataframe
# 2A) Load all invoice shards quickly
csvs = list_csvs(DATA_DIR)
dfs = [read_csv(p) for p in csvs]
df = pd.concat(dfs, ignore_index=True)

# 2B) Normalize + cast
df = normalize_columns(df)
df = cast_basic_types(df)

# 2C) Timestamp + slot
tz = yaml.safe_load(CFG_SLOTS.read_text())["timezone"]
df = parse_timestamp(df, tz)
df = assign_slots(df, CFG_SLOTS)

# 2D) Convenience columns
if "qty" in df and "unit_price_gross" in df:
    df["line_value_gross"] = df["qty"] * df["unit_price_gross"]
else:
    df["line_value_gross"] = np.nan

# 2E) Normalized product key for joins
df["product_norm"] = normalize_text(df["product_name"])
len(df), df.head(3)


  return pd.to_datetime(s, errors="coerce").dt.time
  return pd.to_datetime(s, errors="coerce").dt.time


(123743,
    receipt_id receipt_number purchase_date purchase_time         product_line  \
 0    31006967     nr: 100213    2025-09-01         07:07   KAJZERKA xxl 95g-C   
 1    31007786       nr:69912    2025-09-01         08:37  BAGIETKA SZ JAJKO-C   
 2    31007821     nr : 76852    2025-09-01         08:57  MLEKO 500g GOSTYN-C   
 
             ean                                       product_name  qty  \
 0           NaN                                                NaN  2.0   
 1           NaN                                                NaN  1.0   
 2  5.900691e+12  SM Gostyń Mleko gostyńskie zagęszczone lekkie ...  1.0   
 
    unit_price_gross  vat_rate  unit_price_net  discount    cashier  \
 0              0.79       5.0            0.75       NaN  kasjer_22   
 1             11.99       5.0           11.42       NaN   kasjer_0   
 2              6.99       5.0            6.66       NaN   kasjer_6   
 
   payment_method                        ts hour_minute  slot_id  \
 

In [36]:
# 3 Load FC mapping and join (line level tagging)
# Load the mapping produced in Stage 1
MAP_PATH = REF_DIR / "auto_fc_mapping_from_menu.csv"
mp = pd.read_csv(MAP_PATH)

# Normalize key in mapping defensively
if "product_norm" not in mp.columns:
    mp["product_norm"] = normalize_text(mp["product_raw"] if "product_raw" in mp.columns else mp["product_name"])

mp_small = mp[["product_norm","is_food_corner_auto","match_category","best_match_item","score"]].drop_duplicates("product_norm")

# Join to lines
dfj = df.merge(mp_small, on="product_norm", how="left")

# Final FC flag (bool) and a safe category (FC only)
dfj["is_fc"] = dfj["is_food_corner_auto"].fillna(False)
dfj["fc_category"] = np.where(dfj["is_fc"], dfj["match_category"], pd.NA)

# Basic preview
dfj[["receipt_id","product_name","product_line","is_fc","fc_category","slot_label"]].head(10)


  dfj["is_fc"] = dfj["is_food_corner_auto"].fillna(False)


Unnamed: 0,receipt_id,product_name,product_line,is_fc,fc_category,slot_label
0,31006967,,KAJZERKA xxl 95g-C,False,,Going to work
1,31007786,,BAGIETKA SZ JAJKO-C,False,,Going to work
2,31007821,SM Gostyń Mleko gostyńskie zagęszczone lekkie ...,MLEKO 500g GOSTYN-C,False,,Going to work
3,31008328,,BAGIETKA WLOS KURCZ 225g-C,False,,Morning groceries
4,31008396,Monster Energy Gazowany napój energetyczny 500 ml,"NAPOJ MONSTER 0,5l-A",False,,Morning groceries
5,31008463,Sierpc Ser królewski plastry 135 g,SER KROLEWSKI 135g-C,False,,Morning groceries
6,31008521,,KUB DO BUB TEA COCO MALIN-B,False,,Morning groceries
7,31008687,,SUSHI WRAP Z PASTA Z TUNC-C,False,,Morning groceries
8,31008698,,PEPPERON NA CIEPLO-B,False,,Morning groceries
9,31008977,,R LOD/NUGGETS 2700-8 RZEDAŻ,False,,Morning groceries


In [37]:
# 4 Receipt-level aggregations
# FC per receipt
by_receipt = (dfj.groupby("receipt_id")["is_fc"]
                .agg(receipt_has_fc=lambda s: bool(s.any()),
                     fc_line_count="sum")
                .reset_index())

# Overall receipt share with any FC
receipt_fc_share = by_receipt["receipt_has_fc"].mean()
print(f"% receipts with any FC: {receipt_fc_share:.3%}")

# Save
by_receipt.to_csv(OUT_DIR/"receipt_fc_summary.csv", index=False)
pd.DataFrame([{"receipt_fc_share": float(receipt_fc_share)}]).to_csv(OUT_DIR/"kpi_receipt_fc_share.csv", index=False)

# Bar gauge
save_bar(pd.Series({"has FC": receipt_fc_share, "no FC": 1-receipt_fc_share}),
         "Receipts containing Food Corner (share)",
         PLOTS/"receipt_fc_share.png")


% receipts with any FC: 5.270%


In [None]:
# 5 Time KPIs: slot & weekday
# Line-level FC share by slot
fc_by_slot = (dfj.assign(is_fc=dfj["is_fc"].astype(bool))
                .groupby("slot_label")["is_fc"]
                .mean().sort_index())
fc_by_slot.to_csv(OUT_DIR/"fc_share_by_slot.csv", header=["fc_share"])
save_bar(fc_by_slot, "FC share by time slot (line-level)", PLOTS/"fc_share_by_slot.png")

# Receipt-level FC presence by slot (any FC on the receipt in that slot)
# First, attach slot to receipts (use first item’s slot per receipt as proxy)
slot_per_receipt = (dfj.dropna(subset=["slot_label"])
                      .groupby("receipt_id")["slot_label"].agg(lambda s: s.iloc[0]))
r = by_receipt.join(slot_per_receipt, on="receipt_id")
receipt_fc_by_slot = r.groupby("slot_label")["receipt_has_fc"].mean().sort_index()
receipt_fc_by_slot.to_csv(OUT_DIR/"receipt_fc_share_by_slot.csv", header=["receipt_fc_share"])
save_bar(receipt_fc_by_slot, "Receipts with FC by time slot", PLOTS/"receipt_fc_share_by_slot.png")

# Weekday x slot heatmap (line count of FC)
weekday_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
wd = dfj["ts"].dt.weekday
dfj["weekday"] = wd.map(lambda x: weekday_names[int(x)] if pd.notna(x) else "Unknown")
fc_heat = (dfj[dfj["is_fc"]]
           .groupby(["weekday","slot_label"])
           .size().unstack(fill_value=0))

# Save CSV
fc_heat.to_csv(OUT_DIR/"fc_count_heatmap_weekday_slot.csv")

# Plot heatmap inline
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(10, 6))
im = ax.imshow(fc_heat.values, aspect="auto")
ax.set_yticks(range(len(fc_heat.index))); ax.set_yticklabels(fc_heat.index)
ax.set_xticks(range(len(fc_heat.columns))); ax.set_xticklabels(fc_heat.columns, rotation=45, ha="right")
ax.set_title("FC line count — weekday × slot")
fig.colorbar(im, ax=ax)
plt.tight_layout()
plt.savefig(PLOTS/"fc_count_heatmap_weekday_slot.png", dpi=150)
plt.close(fig)
"PLOTS saved"


'PLOTS saved'

In [39]:
# 6 Top FC items (units & value)
top_fc_units = (dfj[dfj["is_fc"]]
                .groupby(["best_match_item","fc_category"], dropna=False)["qty"]
                .sum().sort_values(ascending=False).head(25).reset_index(name="units"))
top_fc_value = (dfj[dfj["is_fc"]]
                .groupby(["best_match_item","fc_category"], dropna=False)["line_value_gross"]
                .sum().sort_values(ascending=False).head(25).reset_index(name="value_gross"))

top_fc_units.to_csv(OUT_DIR/"top_fc_items_by_units.csv", index=False)
top_fc_value.to_csv(OUT_DIR/"top_fc_items_by_value.csv", index=False)

# Quick bar plots
save_bar(top_fc_units.set_index("best_match_item")["units"][:15],
         "Top FC items by units", PLOTS/"top_fc_by_units.png")
save_bar(top_fc_value.set_index("best_match_item")["value_gross"][:15],
         "Top FC items by gross value", PLOTS/"top_fc_by_value.png")

len(top_fc_units), len(top_fc_value)


(25, 25)

In [None]:
#7. Co-purchase (what sells *with* FC) - doesn't work atm, under review

# Ensure boolean
dfj["is_fc"] = dfj["is_fc"].astype(bool)

# Per-receipt counts
rec = (dfj.groupby("receipt_id")
          .agg(total_lines=("is_fc", "size"),
               fc_lines=("is_fc", "sum"))
          .assign(nonfc_lines=lambda d: d["total_lines"] - d["fc_lines"])
          .reset_index())

# Keep only receipts that have BOTH FC and non-FC
rec_both = set(rec.loc[(rec["fc_lines"] > 0) & (rec["nonfc_lines"] > 0), "receipt_id"])

# Subset to those receipts
df_both = dfj[dfj["receipt_id"].isin(rec_both)].copy()

# Non-FC items on those receipts → companions
cop = df_both[~df_both["is_fc"]].copy()

# If product_line is missing, label it for grouping
cop["product_line"] = cop["product_line"].fillna("UNKNOWN")

# Top companions by product_line and by product_name
cop_line = (cop.groupby("product_line")["qty"].sum()
              .sort_values(ascending=False)
              .head(25))
cop_name = (cop.groupby("product_name")["qty"].sum()
              .sort_values(ascending=False)
              .head(25))

# Save tables
cop_line.to_csv(OUT_DIR/"copurchase_top_product_lines.csv", header=["qty"])
cop_name.to_csv(OUT_DIR/"copurchase_top_product_names.csv", header=["qty"])

# Plot only if non-empty (save_bar already guards, but we’ll skip too)
if not cop_line.empty:
    save_bar(cop_line, "Co-purchase with FC (by product_line)", PLOTS/"copurchase_top_product_lines.png")
if not cop_name.empty:
    save_bar(cop_name, "Co-purchase with FC (by product_name)", PLOTS/"copurchase_top_product_names.png")

# Diagnostics printout so we can see counts at a glance
print({
    "receipts_total": int(rec.shape[0]),
    "receipts_with_fc": int((rec["fc_lines"] > 0).sum()),
    "receipts_with_both_fc_and_nonfc": int(len(rec_both)),
    "cop_rows": int(cop.shape[0]),
    "top_lines": int(cop_line.shape[0]),
    "top_names": int(cop_name.shape[0]),
})


{'receipts_total': 123743, 'receipts_with_fc': 6521, 'receipts_with_both_fc_and_nonfc': 0, 'cop_rows': 0, 'top_lines': 0, 'top_names': 0}


In [41]:
#8 FC mix by slot and weekday (value & units)
# Value share of FC per slot
value_by_slot = dfj.groupby("slot_label")["line_value_gross"].sum()
value_by_slot_fc = dfj[dfj["is_fc"]].groupby("slot_label")["line_value_gross"].sum()
value_share_fc_slot = (value_by_slot_fc / value_by_slot).fillna(0)
value_share_fc_slot.to_csv(OUT_DIR/"value_share_fc_by_slot.csv", header=["fc_value_share"])
save_bar(value_share_fc_slot, "FC value share by slot", PLOTS/"value_share_fc_by_slot.png")

# Units share per weekday
units_by_wd = dfj.groupby("weekday")["qty"].sum()
units_by_wd_fc = dfj[dfj["is_fc"]].groupby("weekday")["qty"].sum()
units_share_fc_wd = (units_by_wd_fc / units_by_wd).fillna(0)
units_share_fc_wd = units_share_fc_wd.reindex(["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"], fill_value=0)
units_share_fc_wd.to_csv(OUT_DIR/"units_share_fc_by_weekday.csv", header=["fc_units_share"])
save_bar(units_share_fc_wd, "FC units share by weekday", PLOTS/"units_share_fc_by_weekday.png")

"value/units shares saved"


'value/units shares saved'

In [42]:
# 9 Export a slim analysis parquet for Matias
cols = ["receipt_id","ts","slot_label","weekday",
        "product_name","product_line","qty","unit_price_gross","line_value_gross",
        "is_fc","fc_category","best_match_item","score"]
slim = dfj[cols].copy()
slim.to_parquet(OUT_DIR/"core_slim.parquet", index=False)
"core_slim.parquet saved"


'core_slim.parquet saved'