## SkewSentry Tutorial — NYC Taxi (Hugging Face)

This tutorial shows how to use SkewSentry with a public dataset loaded via the Hugging Face `datasets` library. We adapt the dataset to the schema used by the example (`user_id`, `ts`, `price`, `qty`, `country`), then run the offline/online parity check.

Artifacts will be written under `examples/nyc_taxi/artifacts/`.


In [None]:
# Set up paths and load a small Hugging Face dataset file (CSV or Parquet)
from pathlib import Path
import sys
import pandas as pd
from datasets import load_dataset

from skewsentry.spec import FeatureSpec
from skewsentry.adapters.python_func import PythonFunctionAdapter
from skewsentry.runner import run_check

# Repository root (machine-specific absolute path) and this example's directory.
REPO = Path('/Users/yasserelhaddar/SkewSentry')
EX_DIR = REPO.joinpath('examples', 'nyc_taxi')
# Make sure the output folder for the reports exists (EX_DIR itself must already exist).
EX_DIR.joinpath('artifacts').mkdir(exist_ok=True)

# Replace with a real file URL from a HF dataset (Files tab -> copy link)
DATA_FILE = 'https://huggingface.co/datasets/nyc-taxi-trip-duration/resolve/main/data/sample.parquet'

# Choose the `datasets` builder from the file extension; anything else is rejected.
if DATA_FILE.endswith('.parquet'):
    builder = 'parquet'
elif DATA_FILE.endswith('.csv'):
    builder = 'csv'
else:
    raise ValueError('Set DATA_FILE to a .parquet or .csv URL from the Hub')

# Load only the first 200 rows of the train split and hand them to pandas.
ds = load_dataset(builder, data_files=DATA_FILE, split='train[:200]')
df_raw = ds.to_pandas()

# Adapt to expected columns: user_id, ts, price, qty, country.
# ts/price/qty are mapped from the raw file; user_id and country are synthesized.
import numpy as np
np.random.seed(0)  # fixed seed so the synthetic `country` column is reproducible


def _pick_column(columns, keywords, fallback_pos):
    """Return the first column whose lower-cased name contains any keyword.

    Args:
        columns: Column labels to search, in order (e.g. ``df.columns``).
        keywords: Lower-case substrings to look for in each label.
        fallback_pos: Position in ``columns`` to use when nothing matches.

    Returns:
        The original (un-normalized) label of the chosen column.

    Raises:
        IndexError: If nothing matches and ``fallback_pos`` is out of range.
    """
    for col in columns:
        # str() tolerates non-string labels; lower() makes the match
        # case-insensitive (e.g. 'Fare_Amount', 'Passenger_Count').
        name = str(col).lower()
        if any(keyword in name for keyword in keywords):
            return col
    # Indexed only when no keyword matched, so a narrow frame does not raise
    # while a real match exists.
    return columns[fallback_pos]


# Heuristic column mapping; adjust as needed per your file columns
price_col = _pick_column(df_raw.columns, ('amount', 'fare', 'price'), 0)
qty_col = _pick_column(df_raw.columns, ('passenger', 'qty', 'count'), 1)
ts_col = _pick_column(df_raw.columns, ('time', 'date', 'pickup'), 2)

# Synthetic identifiers: 50 user ids assigned round-robin, random country per row.
user_id = (np.arange(len(df_raw)) % 50).astype(int)
country = np.random.choice(['US', 'UK', 'DE'], size=len(df_raw))

# Unparseable timestamps/prices become NaT/NaN and those rows are dropped;
# a missing or unparseable quantity defaults to 1.
base_df = pd.DataFrame({
    'user_id': user_id,
    'ts': pd.to_datetime(df_raw[ts_col], errors='coerce'),
    'price': pd.to_numeric(df_raw[price_col], errors='coerce'),
    'qty': pd.to_numeric(df_raw[qty_col], errors='coerce').fillna(1).astype(int),
    'country': country,
}).dropna(subset=['ts', 'price'])

# Make the feature modules under examples/simple importable by module name.
SIMPLE_DIR = REPO.joinpath('examples', 'simple')
simple_dir_entry = str(SIMPLE_DIR)
if simple_dir_entry not in sys.path:
    sys.path.insert(0, simple_dir_entry)

# Feature spec for this example plus the two implementations to compare.
spec = FeatureSpec.from_yaml(str(EX_DIR.joinpath('features.yml')))
offline = PythonFunctionAdapter('offline_features:build_features')
online = PythonFunctionAdapter('online_features:get_features')

# Run the parity check and write both report formats into the artifacts folder.
report = run_check(
    spec=spec,
    data=base_df,
    offline=offline,
    online=online,
    sample=200,
    html_out=str(EX_DIR.joinpath('artifacts', 'parity_report.html')),
    json_out=str(EX_DIR.joinpath('artifacts', 'parity_report.json')),
)

# Last expression of the cell: overall status and the failing feature list.
(report.ok, report.summary['failing_features'])


  from .autonotebook import tqdm as notebook_tqdm
`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'nyc_taxi_2014' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.


DatasetNotFoundError: Dataset 'nyc_taxi_2014' doesn't exist on the Hub or cannot be accessed.