# Dataset coverage & bias diagnostics

This notebook reproduces `diagnose_bias.py` and shows tables/plots inline.

It focuses on whether the model is trained/evaluated on a photo-biased sample.


In [None]:
# Notebook bootstrap:
# - Make imports work without `pip install -e .`
# - Make relative paths (e.g., data/...) resolve correctly by forcing CWD to the repo root

import os
import sys
from pathlib import Path

# Locate the project root: start at the current directory and climb at most
# five parents, stopping at the first directory that has a `src/` child.
ROOT = Path.cwd()
_hops = 0
while _hops < 5 and not (ROOT / "src").exists():
    ROOT = ROOT.parent
    _hops += 1

# Whatever directory the climb ended on is validated here; failing loudly is
# preferable to a confusing ImportError further down.
SRC = ROOT / "src"
if not SRC.exists():
    raise FileNotFoundError(
        f"Could not locate project `src/` directory. Started from: {Path.cwd()} (resolved ROOT={ROOT})"
    )

# Put `src/` at the front of the import path exactly once, so re-running this
# cell does not accumulate duplicate entries.
_src_entry = str(SRC)
if _src_entry not in sys.path:
    sys.path.insert(0, _src_entry)

# Relative paths such as `data/...` and `reports/...` are resolved against the
# project root from here on.
os.chdir(ROOT)

print("CWD =", Path.cwd())
print("SRC =", SRC)

import polars as pl
import numpy as np
import matplotlib.pyplot as plt

from yelp_prediction import bias_utils as B


In [None]:
# --- Run configuration ---
# Forwarded as `restaurants_only=` / `low_thr=` to the bias_utils helpers below.
RESTAURANTS_ONLY = True
LOW_THR = 3.0
PRED_CSV = Path("reports/figures/val_predictions.csv")  # produced by plot_predictions

# Everything this notebook writes goes here; create the directory up front
# (including missing parents) so later cells can write without checking.
OUTDIR = Path("reports/diagnostics_notebook")
OUTDIR.mkdir(exist_ok=True, parents=True)


## Build the business-level coverage table

In [None]:
# Build the coverage table with the configuration chosen above.
df = B.build_coverage_df(
    low_thr=LOW_THR,
    restaurants_only=RESTAURANTS_ONLY,
)
# Trailing expression: the notebook echoes this (shape, preview) tuple.
(df.shape, df.head())


## Coverage summary (same metrics as the script)

In [None]:
# Let the shared writer produce the summary file, then read the file back so
# the notebook shows the text that was written.
summary_path = OUTDIR / "coverage_summary.txt"
B.write_coverage_summary(
    summary_path,
    df,
    low_thr=LOW_THR,
    restaurants_only=RESTAURANTS_ONLY,
)
summary_text = summary_path.read_text(encoding="utf-8")
print(summary_text)


## Stars distribution: all vs with photos vs without photos

In [None]:
# Target ratings for the full table and for the two photo-presence subsets.
stars_all = df["stars_target"].to_numpy()
stars_with = df.filter(pl.col("has_photos"))["stars_target"].to_numpy()
stars_without = df.filter(~pl.col("has_photos"))["stars_target"].to_numpy()

# Compute the 40 bin edges once, from the full sample, and reuse them for every
# series. Passing `bins=40` to each `plt.hist` call separately derives the edges
# from that subset's own min/max, so the overlaid bars stop lining up whenever a
# subset does not span the same rating range as the full sample.
shared_bins = np.histogram_bin_edges(stars_all, bins=40)

plt.figure()
plt.hist(stars_all, bins=shared_bins, alpha=0.6, label="All")
plt.hist(stars_with, bins=shared_bins, alpha=0.6, label="With photos")
plt.hist(stars_without, bins=shared_bins, alpha=0.6, label="Without photos")
plt.xlabel("Stars (stars_target)")
plt.ylabel("Count")
plt.title("Stars distribution by photo presence")
plt.legend()
plt.show()


## Stars vs photo_count / review_count

In [None]:
# One scatter per coverage measure. Counts are shifted by +1 so that zero
# counts remain drawable on the logarithmic x axis.
stars_values = df["stars_target"].to_numpy()
for x_col, x_label, plot_title in (
    ("photo_count", "photo_count + 1 (log scale)", "Stars vs photo_count"),
    ("reviews_n", "reviews_n + 1 (log scale)", "Stars vs number of reviews"),
):
    plt.figure()
    plt.scatter(df[x_col].to_numpy() + 1, stars_values, s=6, alpha=0.3)
    plt.xscale("log")
    plt.xlabel(x_label)
    plt.ylabel("stars_target")
    plt.title(plot_title)
    plt.show()


## If you have predictions, join and analyze error vs coverage

In [None]:
# Join the coverage table with the validation predictions when they exist.
# Later cells guard on `PRED_CSV.exists()` too, so `merged` is only defined
# (and only needed) when the predictions file is present.
if not PRED_CSV.exists():
    print(f"Predictions CSV not found: {PRED_CSV}. Run plot_predictions first.")
else:
    merged = B.merge_with_predictions(df, PRED_CSV)
    # A notebook only echoes the last *top-level* expression of a cell; a bare
    # expression inside this branch would be evaluated and dropped. Print the
    # shape and preview explicitly so they actually appear.
    print(merged.shape)
    print(merged.head())


### Abs error vs photo_count / reviews_n

In [None]:
if PRED_CSV.exists():
    # Same y values for both plots; only the coverage measure on x changes.
    abs_err_values = merged["abs_error"].to_numpy()
    for x_col, x_label, plot_title in (
        ("photo_count", "photo_count + 1 (log scale)", "Abs error vs photo_count"),
        ("reviews_n", "reviews_n + 1 (log scale)", "Abs error vs number of reviews"),
    ):
        # Null counts are treated as 0, then shifted by +1 so that zero stays
        # drawable on the logarithmic x axis.
        x_values = merged[x_col].fill_null(0).to_numpy() + 1

        plt.figure()
        plt.scatter(x_values, abs_err_values, s=8, alpha=0.4)
        plt.xscale("log")
        plt.xlabel(x_label)
        plt.ylabel("abs_error")
        plt.title(plot_title)
        plt.show()


### MAE by buckets (photo_count and reviews_n)

In [None]:
if PRED_CSV.exists():
    # Bucket boundaries, given in the comma-separated form `parse_edges` takes.
    photo_edges = B.parse_edges("0,1,2,3,5,10,20,50,100,1000000")
    review_edges = B.parse_edges("0,1,5,10,20,50,100,200,500,1000,1000000")

    # (output CSV, bucketed column, edges, bucket column name)
    bucket_jobs = [
        (OUTDIR / "mae_by_photo_bucket.csv", "photo_count", photo_edges, "photo_bucket"),
        (OUTDIR / "mae_by_review_bucket.csv", "reviews_n", review_edges, "review_bucket"),
    ]

    # Write every CSV first (same file names as the script produces) ...
    for csv_path, bucket_col, bucket_edges, bucket_label in bucket_jobs:
        B.mae_by_bucket(csv_path, merged, col=bucket_col, edges=bucket_edges, bucket_name=bucket_label)

    # ... then read each one back and show it inline.
    for csv_path, *_ in bucket_jobs:
        print(pl.read_csv(csv_path))
