# Yelp photo-based star prediction — validation plots

This notebook reproduces what `plot_predictions.py` does, but shows outputs inline.

It assumes:
- `data/features/features.pt` exists
- `best_mil_model.pth` exists
- dataset JSON + photos are under `data/` as per `dataframes.py`


In [3]:
# Notebook bootstrap:
# - Make imports work without `pip install -e .`
# - Make relative paths (e.g., data/...) resolve correctly by forcing CWD to the repo root

import os
import sys
from pathlib import Path

# Locate the project root: the nearest directory -- starting at the current
# working directory and walking up through *every* ancestor -- that contains a
# `src/` directory. Searching all ancestors avoids an arbitrary depth limit, and
# `is_dir()` avoids matching a plain file that happens to be named `src`.
start_dir = Path.cwd()

for ROOT in (start_dir, *start_dir.parents):
    if (ROOT / "src").is_dir():
        break
else:
    # No ancestor qualified; ROOT is the last candidate inspected (filesystem root).
    raise FileNotFoundError(
        f"Could not locate project `src/` directory. Started from: {start_dir} (resolved ROOT={ROOT})"
    )

SRC = ROOT / "src"

# Put `src/` first on the import path, but only once so re-running this cell
# does not keep growing sys.path.
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

# Force working directory to project root so that `data/...` paths work
os.chdir(ROOT)

print("CWD =", Path.cwd())
print("SRC =", SRC)

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm

from yelp_prediction import eval_utils as E


ROOT = c:\Users\matia\OneDrive\Desktop\Data Science Master\2.- FIRST SEMESTER\DATA SCIENCE\yelp-prediction
SRC  = c:\Users\matia\OneDrive\Desktop\Data Science Master\2.- FIRST SEMESTER\DATA SCIENCE\yelp-prediction\src


In [5]:
# Configuration -- every tunable value used by the cells below lives here.
# All paths are relative, so they resolve against the current working directory.

# Input artifacts read by the loading cell
FEATURES_PATH = Path("data") / "features" / "features.pt"
MODEL_PATH = Path("best_mil_model.pth")

# Destination for the optional CSV / PNG export at the end of the notebook
OUTDIR = Path("reports") / "figures_notebook"

# Settings forwarded to the evaluation helpers (split seed, inference options)
MAX_PHOTOS, BATCH_SIZE = 3, 256
SEED = 42


In [6]:
# Load validation set, precomputed features and the trained model.

# Fail fast with an actionable message when a required artifact is missing,
# rather than surfacing a low-level error from deep inside the loaders.
for _required_path, _description in (
    (FEATURES_PATH, "features file"),
    (MODEL_PATH, "model file"),
):
    if not Path(_required_path).exists():
        raise FileNotFoundError(
            f"Missing {_description}: {Path(_required_path).resolve()} (CWD={Path.cwd()})"
        )

df = E.load_features_df()
# Only the second element of the split is used here; the first is discarded.
_, val_df = E.make_train_val_split(df, seed=SEED, train_frac=0.8)

# The median is computed from the full `df`, not only from the validation split.
# NOTE(review): presumably this must match the value used at training time -- confirm.
median = E.compute_median_stars(df)
features_dict = E.load_features_dict(FEATURES_PATH)
model = E.load_model(MODEL_PATH, median_stars=median)


FileNotFoundError: El sistema no puede encontrar la ruta especificada. (os error 3): data\dataset-json\yelp_academic_dataset_business.json

This error occurred with the following context stack:
	[1] 'ndjson scan'
	[2] 'filter'
	[3] 'join left'
	[4] 'join'
	[5] 'join left'
	[6] 'join'
	[7] 'select'
	[8] 'with_columns'
	[9] 'sink'


## Run inference

For notebook diagnostics, you can switch `sampling` to `deterministic` to remove randomness from photo selection.

In [None]:
# Options forwarded to the validation-inference helper. Everything except
# `sampling` comes from the configuration / loading cells above.
inference_options = {
    "features_dict": features_dict,
    "model": model,
    "max_photos": MAX_PHOTOS,
    "batch_size": BATCH_SIZE,
    "seed": SEED,
    "sampling": "random",  # or "deterministic"
}
payload = E.predict_val(val_df, **inference_options)

# Summary metrics from the true / predicted values returned in the payload.
mae, rmse = E.compute_metrics(payload["y_true"], payload["y_pred"])

# Last expression of the cell: displayed as the cell output.
(mae, rmse)


## Scatter: true vs predicted

In [None]:
# Pull the two series out of the payload; later cells reuse these names.
y_true, y_pred = payload["y_true"], payload["y_pred"]

fig, ax = plt.subplots()
ax.scatter(y_true, y_pred, s=8, alpha=0.5)
ax.plot([1, 5], [1, 5])  # identity line: points on it are perfect predictions
ax.set_xlim(1, 5)
ax.set_ylim(1, 5)
ax.set_xlabel("True stars")
ax.set_ylabel("Predicted stars")
ax.set_title(f"True vs Predicted (val) | MAE={mae:.3f}, RMSE={rmse:.3f}")
plt.show()


## Scatter colored by available photos

In [None]:
# Number of photos available per validation item, as returned in the payload.
n_avail = payload["n_available"]

# Clip to >= 1 before colouring: a log scale cannot represent zero.
photo_counts = np.clip(n_avail, 1, None)

fig, ax = plt.subplots()
# `LogNorm` comes from the public `matplotlib.colors` module (imported at the
# top of the notebook) instead of reaching through `plt.matplotlib`, which is
# not part of pyplot's documented interface.
sc = ax.scatter(y_true, y_pred, c=photo_counts, s=8, alpha=0.5, norm=LogNorm())
fig.colorbar(sc, ax=ax).set_label("Available photos (log scale)")
ax.plot([1, 5], [1, 5])  # identity line
ax.set_xlim(1, 5)
ax.set_ylim(1, 5)
ax.set_xlabel("True stars")
ax.set_ylabel("Predicted stars")
ax.set_title("True vs Predicted colored by #photos")
plt.show()


## Error histogram

In [None]:
# Signed error: positive values mean the prediction is above the true value.
err = y_pred - y_true

fig, ax = plt.subplots()
ax.hist(err, bins=50)
ax.set_xlabel("Prediction error (y_pred - y_true)")
ax.set_ylabel("Count")
ax.set_title("Error distribution (val)")
plt.show()


## Simple binned calibration view (bin true stars to 0.5)

In [None]:
# Snap each true value to a multiple of 0.5 so observations can be grouped.
bins = np.round(y_true * 2) / 2.0
uniq = np.unique(bins)

# Average prediction within each group, in the same order as `uniq`.
per_bin_means = []
for level in uniq:
    in_bin = bins == level
    per_bin_means.append(float(np.mean(y_pred[in_bin])))
means = np.array(per_bin_means)

fig, ax = plt.subplots()
ax.plot(uniq, means, marker="o")
ax.plot([1, 5], [1, 5])  # identity line: perfectly calibrated averages lie on it
ax.set_xlabel("True stars (binned to 0.5)")
ax.set_ylabel("Avg predicted stars")
ax.set_title("Binned calibration (val)")
plt.show()


## Save the same artifacts as the script (optional)

In [None]:
# Create the output directory (and any missing parents); a no-op if it exists.
OUTDIR.mkdir(parents=True, exist_ok=True)

# CSV
predictions_csv = OUTDIR / "val_predictions.csv"
E.save_predictions_csv(predictions_csv, payload=payload)

# PNGs (same as script output naming)
E.generate_all_plots(OUTDIR, payload=payload, mae=mae, rmse=rmse)

# Show the absolute location of the written artifacts as the cell output.
OUTDIR.resolve()
