In [1]:
# download and unzip the json dataset
!curl -L -A "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" -o json.zip https://business.yelp.com/external-assets/files/Yelp-JSON.zip
!unzip -o json.zip -d ./temp-json
!mkdir -p ./dataset-json
!tar -xvf "./temp-json/Yelp JSON/yelp_dataset.tar" -C ./dataset-json

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 4144M  100 4144M    0     0  20.7M      0  0:03:19  0:03:19 --:--:-- 22.3M
Archive:  json.zip
   creating: ./temp-json/Yelp JSON/
  inflating: ./temp-json/Yelp JSON/Yelp Dataset Documentation & ToS copy.pdf  
  inflating: ./temp-json/__MACOSX/Yelp JSON/._Yelp Dataset Documentation & ToS copy.pdf  
  inflating: ./temp-json/Yelp JSON/yelp_dataset.tar  
  inflating: ./temp-json/__MACOSX/Yelp JSON/._yelp_dataset.tar  
Dataset_User_Agreement.pdf
yelp_academic_dataset_business.json
yelp_academic_dataset_checkin.json
yelp_academic_dataset_review.json
yelp_academic_dataset_tip.json
yelp_academic_dataset_user.json


In [2]:
# download and unzip the photos dataset
!curl -L -A "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" -o photos.zip https://business.yelp.com/external-assets/files/Yelp-Photos.zip
!unzip -o photos.zip -d ./temp-photos
!mkdir -p ./dataset-photos
!tar -xf "./temp-photos/Yelp Photos/yelp_photos.tar" -C ./dataset-photos

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 7102M  100 7102M    0     0  22.5M      0  0:05:15  0:05:15 --:--:-- 23.3M
Archive:  photos.zip
   creating: ./temp-photos/Yelp Photos/
  inflating: ./temp-photos/Yelp Photos/yelp_photos.tar  
  inflating: ./temp-photos/__MACOSX/Yelp Photos/._yelp_photos.tar  
  inflating: ./temp-photos/Yelp Photos/Yelp Dataset Documentation & ToS.pdf  
  inflating: ./temp-photos/__MACOSX/Yelp Photos/._Yelp Dataset Documentation & ToS.pdf  


In [3]:
# install deps
!pip install polars



In [4]:
import polars as pl

json_dataset_path = "./dataset-json"
df_businesses = pl.read_ndjson(f"{json_dataset_path}/yelp_academic_dataset_business.json")
print(df_businesses)

df_reviews = pl.read_ndjson(f"{json_dataset_path}/yelp_academic_dataset_review.json")
print(df_reviews)

df_photos = pl.read_ndjson(f"./dataset-photos/photos.json")
print(df_photos)


shape: (150_346, 14)
┌────────────┬───────────┬───────────┬───────────┬───┬─────────┬───────────┬───────────┬───────────┐
│ business_i ┆ name      ┆ address   ┆ city      ┆ … ┆ is_open ┆ attribute ┆ categorie ┆ hours     │
│ d          ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---     ┆ s         ┆ s         ┆ ---       │
│ ---        ┆ str       ┆ str       ┆ str       ┆   ┆ i64     ┆ ---       ┆ ---       ┆ struct[7] │
│ str        ┆           ┆           ┆           ┆   ┆         ┆ struct[33 ┆ str       ┆           │
│            ┆           ┆           ┆           ┆   ┆         ┆ ]         ┆           ┆           │
╞════════════╪═══════════╪═══════════╪═══════════╪═══╪═════════╪═══════════╪═══════════╪═══════════╡
│ Pns2l4eNsf ┆ Abby Rapp ┆ 1616      ┆ Santa     ┆ … ┆ 0       ┆ {null,nul ┆ Doctors,  ┆ null      │
│ O8kk83dixA ┆ oport,    ┆ Chapala   ┆ Barbara   ┆   ┆         ┆ l,null,nu ┆ Tradition ┆           │
│ 6A         ┆ LAC, CMQ  ┆ St, Ste 2 ┆           ┆   ┆         ┆ ll,nu

In [5]:
df_photos_agg = df_photos.group_by("business_id").agg([
    pl.col("photo_id").alias("photo_ids"),
    pl.col("label").alias("photo_labels"),
    pl.len().alias("photo_count")
])


df_exact_stars = df_reviews.lazy().select([
    pl.col("business_id"),
    pl.col("stars")
]).group_by("business_id").agg(
    pl.col("stars").mean().alias("exact_stars")
).collect()

df_final = (
    df_businesses
    .filter(
      pl.col("categories").str.contains("Restaurants")
    )
    .join(df_photos_agg, on="business_id", how="inner")
    .join(df_exact_stars, on="business_id", how="inner")
    .select(
        # other interesting fields to select:
        # name, categories, attributes.RestaurantsPriceRange2, latitude/longitude, city, state, (etc)
        pl.col("business_id"),
        # pl.col("stars"),
        pl.col("exact_stars").alias("stars"),
        pl.col("review_count"),
        pl.col("photo_count"),
        pl.col("photo_ids"),
    )
    .with_columns([
      pl.col("photo_ids").fill_null([]),
      pl.col("photo_count").fill_null(0)
    ])

)


print(df_final)

shape: (29_374, 5)
┌────────────────────────┬──────────┬──────────────┬─────────────┬─────────────────────────────────┐
│ business_id            ┆ stars    ┆ review_count ┆ photo_count ┆ photo_ids                       │
│ ---                    ┆ ---      ┆ ---          ┆ ---         ┆ ---                             │
│ str                    ┆ f64      ┆ i64          ┆ u32         ┆ list[str]                       │
╞════════════════════════╪══════════╪══════════════╪═════════════╪═════════════════════════════════╡
│ Ojd913yxClnEwQSthSd8XQ ┆ 3.625    ┆ 8            ┆ 1           ┆ ["_uKWDS7za4HEMLnvgy_UeA"]      │
│ A0DYN5wxygYnX_sL3HygvA ┆ 4.359528 ┆ 491          ┆ 41          ┆ ["nnmbn0ztC4d4YMB-i_FdCQ", "_W… │
│ Hr6aM0s-woIJaW6DJSbvNQ ┆ 3.430233 ┆ 573          ┆ 45          ┆ ["13PP-x_ouyu7ZwUoHo-oWA", "LJ… │
│ UwwimB6EoAsKJ6BCVBifVw ┆ 3.960976 ┆ 200          ┆ 1           ┆ ["MbnUB0xrsKvEPr6GBjj2oA"]      │
│ RqqCs9epqlZYAWEMXXuWtA ┆ 3.94636  ┆ 501          ┆ 9           ┆ ["h7H

In [6]:
# WARNING: vibe coded

import torch
import torch.nn as nn
import numpy as np
import pandas as pd # Used for the dataset logic
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from PIL import Image
import os

# CONFIG
IMG_DIR = "./dataset-photos/photos/"
MAX_PHOTOS = 3        # Number of photos to sample per business
BATCH_SIZE = 32       # T4 handles 32 easily with frozen backbone
LR = 0.001
EPOCHS = 5
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Running on: {DEVICE}")

# Convert Polars to Pandas for easier indexing if needed
if hasattr(df_final, "to_pandas"):
    df = df_final.to_pandas()
else:
    df = df_final

# quick check
print(f"Dataset size: {len(df)}")

Running on: cuda
Dataset size: 29374


In [7]:
# WARNING: vibe coded
class YelpBagDataset(Dataset):
    def __init__(self, dataframe, img_dir, transform=None):
        self.data = dataframe.to_dict('records') # Fast access
        self.img_dir = img_dir
        self.transform = transform
        # Create a black placeholder image once for missing files
        self.placeholder = torch.zeros(3, 224, 224)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data[idx]
        photo_ids = row['photo_ids']
        label = torch.tensor(row['stars'], dtype=torch.float32)

        # 1. SAMPLING: Pick K photos (Randomly sample if > K, repeat if < K)
        if len(photo_ids) >= MAX_PHOTOS:
            selected = np.random.choice(photo_ids, MAX_PHOTOS, replace=False)
        else:
            selected = np.random.choice(photo_ids, MAX_PHOTOS, replace=True)

        # 2. LOAD IMAGES
        images = []
        for pid in selected:
            path = os.path.join(self.img_dir, f"{pid}.jpg")
            try:
                with Image.open(path) as img:
                    img = img.convert('RGB')
                    if self.transform:
                        img = self.transform(img)
                    images.append(img)
            except Exception:
                # Fallback if image is missing/corrupt
                images.append(self.placeholder)

        # Stack into [3, 3, 224, 224] tensor
        return torch.stack(images), label

# Standard ImageNet stats
tfms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Split 80/20
mask = np.random.rand(len(df)) < 0.8
train_df = df[mask]
val_df = df[~mask]

train_loader = DataLoader(YelpBagDataset(train_df, IMG_DIR, tfms),
                          batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = DataLoader(YelpBagDataset(val_df, IMG_DIR, tfms),
                        batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

In [8]:
median_stars = df_final["stars"].median()
baseline_mae = (df_final["stars"] - median_stars).abs().mean()

print(f"Baseline to beat: {baseline_mae:.4f} (median: {median_stars})")

Baseline to beat: 0.5999 (median: 3.766463346598728)


In [9]:
# WARNING: vibe coded
class RatingPredictor(nn.Module):
    def __init__(self):
        super().__init__()
        # Load ResNet50 with modern weights
        base_model = models.efficientnet_v2_s(
            weights=models.EfficientNet_V2_S_Weights.DEFAULT
        )

        # Strip the last layer (fc)
        self.backbone = base_model.features

        # FREEZE BACKBONE (Crucial for speed)
        for param in self.backbone.parameters():
            param.requires_grad = False

        # Regression Head
        self.head = nn.Sequential(
            nn.Linear(1280, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 1) # Output 1 star rating
        )

        # careful: this is technicall incorrect: we're using the median of the
        # whole dataset, not just our split
        initial_bias = torch.logit(torch.tensor((median_stars - 1) / 4))
        nn.init.constant_(self.head[-1].bias, initial_bias)

        self.pool = nn.AdaptiveAvgPool2d(1)

    def forward(self, x):
        # x shape: [Batch, K_Photos, Channels, H, W]
        b, k, c, h, w = x.shape

        # Flatten batch and K dimensions to pass through ResNet
        x = x.view(b * k, c, h, w)

        x = self.backbone(x)

        # Features: [B*K, 1280, 1, 1] -> [B*K, 1280]
        # Reshape back to [B, K, Features]
        features = self.pool(x).squeeze().view(b, k, -1)

        # MEAN POOLING: Average features of the 3 photos
        avg_features = torch.mean(features, dim=1)

        # we force the output to be 1<=x<=5
        raw_output = self.head(avg_features).squeeze()
        return torch.sigmoid(raw_output) * 4 + 1


model = RatingPredictor().to(DEVICE)
optimizer = torch.optim.Adam(model.head.parameters(), lr=LR)
criterion = nn.MSELoss()

Downloading: "https://download.pytorch.org/models/efficientnet_v2_s-dd5fe13b.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_v2_s-dd5fe13b.pth


100%|██████████| 82.7M/82.7M [00:00<00:00, 127MB/s]


In [None]:
# WARNING: vibe coded
from tqdm.notebook import tqdm

print("Starting Training...")

for epoch in range(EPOCHS):
    # --- TRAIN ---
    model.train()
    total_loss = 0

    # Progress bar for sanity check
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}")
    for imgs, targets in pbar:
        imgs, targets = imgs.to(DEVICE), targets.to(DEVICE)

        optimizer.zero_grad()
        preds = model(imgs)
        loss = criterion(preds, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        pbar.set_postfix({'loss': f"{loss.item():.4f}"})

    print(f"Epoch {epoch+1} Avg Loss: {total_loss/len(train_loader):.4f}")

    # --- QUICK EVAL ---
    model.eval()
    errors = []

    with torch.no_grad():
        for imgs, targets in val_loader:
            imgs, targets = imgs.to(DEVICE), targets.to(DEVICE)
            preds = model(imgs)
            errors.extend(torch.abs(preds - targets).cpu().numpy())

    print(f"--> Validation MAE: {np.mean(errors):.4f} stars\n")

print("done!")

Starting Training...


Epoch 1/5:   0%|          | 0/734 [00:00<?, ?it/s]

Epoch 1 Avg Loss: 0.4453
--> Validation MAE: 0.5191 stars



Epoch 2/5:   0%|          | 0/734 [00:00<?, ?it/s]

Epoch 2 Avg Loss: 0.4115
--> Validation MAE: 0.5006 stars



Epoch 3/5:   0%|          | 0/734 [00:00<?, ?it/s]