## SkewSentry Tutorial — NYC Taxi (Hugging Face)

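This tutorial shows how to run SkewSentry against a public dataset loaded with the Hugging Face `datasets` library. We adapt the dataset to the columns the example feature functions expect (`user_id`, `ts`, `price`, `qty`, `country`), then run the offline/online parity check.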

Artifacts will be written under `examples/nyc_taxi/artifacts/`.


In [None]:
# Setup and load a small HF dataset split
from pathlib import Path
import sys
import pandas as pd
from datasets import load_dataset

from skewsentry.spec import FeatureSpec
from skewsentry.adapters.python_func import PythonFunctionAdapter
from skewsentry.runner import run_check

# Locate the SkewSentry checkout. The default is the original author's path;
# set the SKEWSENTRY_REPO environment variable to point at your own clone
# instead of editing this cell.
import os

REPO = Path(os.environ.get('SKEWSENTRY_REPO', '/Users/yasserelhaddar/SkewSentry'))
EX_DIR = REPO / 'examples' / 'nyc_taxi'
# parents=True so the artifacts directory can be created even when the
# intermediate example directories do not exist yet.
(EX_DIR / 'artifacts').mkdir(parents=True, exist_ok=True)

# Fetch only the first 200 training rows of the 'nyc_taxi_2014' dataset so the
# tutorial stays small and quick to run.
# If the plain call fails for any reason, retry the same dataset with the
# 'green' configuration and trust_remote_code=True.
hf_dataset_name = 'nyc_taxi_2014'
hf_split = 'train[:200]'
try:
    ds = load_dataset(hf_dataset_name, split=hf_split)
except Exception:
    ds = load_dataset(hf_dataset_name, 'green', split=hf_split, trust_remote_code=True)

# Convert the loaded split to a pandas DataFrame.
df_raw = ds.to_pandas()

# Adapt to expected columns: user_id, ts, price, qty, country
# We map total_amount -> price, passenger_count -> qty, pickup_datetime -> ts,
# and synthesize user_id and country because the raw data has neither.
import numpy as np
np.random.seed(0)  # fixed seed so the synthetic 'country' column is reproducible


def _pick_column(columns, preferred, matches, fallback_pos):
    """Choose which raw column feeds one field of the expected schema.

    Args:
        columns: Column labels of the raw frame (e.g. ``df_raw.columns``).
        preferred: Exact column name to use when it is present.
        matches: Predicate applied to each column name; the first name for
            which it returns true is used when ``preferred`` is absent.
        fallback_pos: Position of the column to use when nothing matches.

    Returns:
        The selected column label.

    Raises:
        IndexError: If nothing matches and ``fallback_pos`` is out of range.
    """
    if preferred in columns:
        return preferred
    for name in columns:
        if matches(name):
            return name
    # The positional fallback is resolved only when it is actually needed, so
    # a narrow frame with a name-based match no longer fails with IndexError.
    if fallback_pos >= len(columns):
        raise IndexError(
            f"no column resembling {preferred!r} found and the frame has no "
            f"column at position {fallback_pos}"
        )
    return columns[fallback_pos]


raw_columns = df_raw.columns
price_col = _pick_column(raw_columns, 'total_amount', lambda c: 'amount' in c, 0)
qty_col = _pick_column(raw_columns, 'passenger_count', lambda c: 'passenger' in c, 1)
ts_col = _pick_column(
    raw_columns, 'pickup_datetime', lambda c: 'pickup' in c and 'date' in c, 2
)

# Synthetic identifiers: row position modulo 50 as user_id, and a random
# country drawn per row from three fixed codes.
user_id = np.arange(len(df_raw)) % 50
country = np.random.choice(['US', 'UK', 'DE'], size=len(df_raw))

# Unparseable timestamps/prices become NaT/NaN and those rows are dropped;
# a missing or unparseable qty is replaced by 1 before the integer cast.
base_df = pd.DataFrame({
    'user_id': user_id.astype(int),
    'ts': pd.to_datetime(df_raw[ts_col], errors='coerce'),
    'price': pd.to_numeric(df_raw[price_col], errors='coerce'),
    'qty': pd.to_numeric(df_raw[qty_col], errors='coerce').fillna(1).astype(int),
    'country': country,
}).dropna(subset=['ts', 'price'])

# The feature functions live in the 'simple' example; put that directory at
# the front of sys.path so the adapters below can import them by module name.
SIMPLE_DIR = REPO.joinpath('examples', 'simple')
sys.path.insert(0, str(SIMPLE_DIR))

# Feature spec for this example, read from the YAML file next to the notebook.
spec_path = EX_DIR / 'features.yml'
spec = FeatureSpec.from_yaml(str(spec_path))

# One adapter per pipeline, each given a 'module:function' target string.
offline_target = 'offline_features:build_features'
online_target = 'online_features:get_features'
offline = PythonFunctionAdapter(offline_target)
online = PythonFunctionAdapter(online_target)

# Run the parity check on the adapted frame and write both report formats
# into the example's artifacts directory.
artifacts_dir = EX_DIR / 'artifacts'
html_report = str(artifacts_dir / 'parity_report.html')
json_report = str(artifacts_dir / 'parity_report.json')

report = run_check(
    spec=spec,
    data=base_df,
    offline=offline,
    online=online,
    sample=200,
    html_out=html_report,
    json_out=json_report,
)

# Last expression of the cell: shows the overall result and the failing features.
report.ok, report.summary['failing_features']
