# Feast Feature Engineering


## Setup


In [None]:
# Install Feast 0.56.0 (with its postgres and ray extras) and the supporting packages,
# then load the yamlmagic extension that the `%%yaml` Configuration cell below relies on.
%pip install -q "feast[postgres,ray]==0.56.0" pandas pyarrow psycopg2-binary yamlmagic
%load_ext yamlmagic


## Configuration


In [None]:
%%yaml parameters

# The values below are read by the later cells through the `parameters` variable.

# =============================================================================
# Paths (must match training notebook: /shared/data)
# =============================================================================
# feature_repo: directory in which `feast apply` and `feast materialize` are run;
#               the notebook resolves it to an absolute path.
# data_dir:     directory that receives the generated parquet files.
feature_repo: ../feature_repo
data_dir: /shared/data

# =============================================================================
# Data Generation
# =============================================================================
# start_date:  date of the first generated week (the notebook tags it as UTC).
# weeks:       number of consecutive weekly rows generated per (store, dept).
# stores:      store ids run from 1 to this value.
# departments: department ids run from 1 to this value.
# seed:        passed to np.random.seed so the generated data is reproducible.
data:
  start_date: "2010-02-05"
  weeks: 143
  stores: 45
  departments: 99
  seed: 42

# =============================================================================
# Feast Materialize Window
# =============================================================================
# start / end are passed unchanged as the two timestamp arguments of `feast materialize`.
materialize:
  start: "2010-02-05T00:00:00"
  end: "2013-01-01T00:00:00"

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta, timezone
from pathlib import Path

# Feature repository directory, made absolute so that the later `cd` / `cwd=` uses
# do not depend on the working directory at the time they run.
FEATURE_REPO = Path(parameters['feature_repo']).resolve()
# Directory that receives the generated parquet files (used as given, not resolved).
DATA_DIR = Path(parameters['data_dir'])


## Generate Sales Data


In [None]:
# Build a synthetic weekly sales table with one row per (week, store, department),
# then derive lag and rolling-window features per (store, department) series.
# The order of the np.random calls is significant: it determines the generated values.
data_cfg = parameters['data']
np.random.seed(data_cfg['seed'])
base_date = datetime.fromisoformat(data_cfg['start_date']).replace(tzinfo=timezone.utc)
weeks = data_cfg['weeks']
stores = data_cfg['stores']
depts = data_cfg['departments']

HOLIDAY_WEEKS = (6, 36, 47, 51, 52)                # ISO week numbers flagged as holidays
MARKDOWN_CAPS = (5000, 2000, 500, 1000, 3000)      # upper bound of each markdown component
STORE_TYPES = ("A", "B", "C")
STORE_SIZES = {"A": 200000, "B": 150000, "C": 100000}

records = []
for week_idx in range(weeks):
    week_date = base_date + timedelta(weeks=week_idx)
    week_of_year = week_date.isocalendar()[1]
    month = week_date.month
    quarter = (month - 1) // 3 + 1

    # Terms that depend only on the week are computed once per week.
    yearly_wave = np.sin(2 * np.pi * week_of_year / 52)
    seasonal = 1 + 0.3 * yearly_wave
    is_holiday = 1 if week_of_year in HOLIDAY_WEEKS else 0
    holiday_factor = 1.5 if is_holiday else 1.0
    fuel_wave = np.sin(2 * np.pi * week_idx / 52)
    cpi_trend = 210 + week_idx * 0.05
    unemployment_trend = 8 - week_idx * 0.02

    for store_id in range(1, stores + 1):
        store_base = 15000 + store_id * 500
        store_type = STORE_TYPES[store_id % 3]
        store_size = STORE_SIZES[store_type]

        for dept_id in range(1, depts + 1):
            dept_factor = 0.2 + (dept_id % 10) * 0.1
            expected_sales = store_base * dept_factor * seasonal * holiday_factor
            # Gaussian noise may push the value below zero; clamp at 0.
            weekly_sales = max(0, expected_sales + np.random.normal(0, 1000))

            # Roughly 30% of rows carry a markdown; only those rows draw the components.
            has_markdown = 1 if np.random.random() > 0.7 else 0
            if has_markdown:
                total_markdown = sum(np.random.uniform(0, cap) for cap in MARKDOWN_CAPS)
            else:
                total_markdown = 0

            # Noisy external signals, drawn in the same order as the columns below.
            temperature = 50 + 30 * yearly_wave + np.random.normal(0, 5)
            fuel_price = 2.5 + 0.5 * fuel_wave + np.random.normal(0, 0.1)
            cpi = cpi_trend + np.random.normal(0, 1)
            unemployment = max(4, unemployment_trend + np.random.normal(0, 0.3))

            records.append({
                "store": store_id,
                "dept": dept_id,
                "date": week_date,
                "weekly_sales": round(weekly_sales, 2),
                "is_holiday": is_holiday,
                "week_of_year": week_of_year,
                "month": month,
                "quarter": quarter,
                "temperature": round(temperature, 1),
                "fuel_price": round(fuel_price, 2),
                "cpi": round(cpi, 2),
                "unemployment": round(unemployment, 1),
                "total_markdown": round(total_markdown, 2),
                "has_markdown": has_markdown,
                "store_type": store_type,
                "store_size": store_size,
            })

# Order rows by series and time so that shift/rolling operate chronologically.
df = pd.DataFrame(records).sort_values(["store", "dept", "date"]).reset_index(drop=True)

# Lag and rolling features, computed independently within each (store, dept) series.
# NOTE(review): the rolling windows include the current week's weekly_sales; confirm that
# this is intended if weekly_sales is also used as the prediction target.
series_sales = df.groupby(["store", "dept"])["weekly_sales"]
derived = {f"sales_lag_{lag}": series_sales.shift(lag) for lag in (1, 2, 4)}
derived["sales_rolling_mean_4"] = series_sales.transform(lambda s: s.rolling(4, min_periods=1).mean())
derived["sales_rolling_mean_12"] = series_sales.transform(lambda s: s.rolling(12, min_periods=1).mean())
derived["sales_rolling_std_4"] = series_sales.transform(lambda s: s.rolling(4, min_periods=1).std())
df = df.assign(**derived)

len(df)

In [None]:
DATA_DIR.mkdir(parents=True, exist_ok=True)

sales_cols = ["store", "dept", "date", "weekly_sales", "is_holiday", "week_of_year", "month", "quarter",
              "sales_lag_1", "sales_lag_2", "sales_lag_4", "sales_rolling_mean_4", "sales_rolling_mean_12", "sales_rolling_std_4"]
store_cols = ["store", "dept", "date", "temperature", "fuel_price", "cpi", "unemployment",
              "total_markdown", "has_markdown", "store_type", "store_size"]

df[sales_cols].to_parquet(DATA_DIR / "sales_features.parquet", index=False)
df[store_cols].to_parquet(DATA_DIR / "store_features.parquet", index=False)
df.to_parquet(DATA_DIR / "features.parquet", index=False)

!ls -lh {DATA_DIR}/*.parquet

## Feast Apply


In [None]:
%%bash -s "$FEATURE_REPO"
# Run `feast apply` from inside the feature repository.
# $1 is the repository path passed through `-s`; it is quoted so that a path
# containing spaces or glob characters is not word-split before reaching `cd`.
cd "$1" && feast apply

## Feast Materialize


In [None]:
import subprocess

# Run `feast materialize <start> <end>` inside the feature repository using the
# configured window; check=True raises CalledProcessError on a non-zero exit status.
materialize_window = parameters['materialize']
materialize_cmd = [
    "feast", "materialize",
    materialize_window['start'],
    materialize_window['end'],
]
subprocess.run(materialize_cmd, cwd=str(FEATURE_REPO), check=True)


## Query Features


In [None]:
from feast import FeatureStore

# Open the feature repository and list its feature views (shown as the cell output).
store = FeatureStore(repo_path=str(FEATURE_REPO))
store.list_feature_views()


In [None]:
# Online features (low-latency serving): fetch three features for store 1 / dept 1
# and convert the response to a plain dict for display.
online_feature_refs = [
    "sales_features:weekly_sales",
    "sales_features:sales_lag_1",
    "store_features:temperature",
]
online_entity_rows = [{"store": 1, "dept": 1}]

online_response = store.get_online_features(
    features=online_feature_refs,
    entity_rows=online_entity_rows,
)
online = online_response.to_dict()
online


In [None]:
# Historical features (training data): look up feature values for three
# (store, dept) pairs as of 2012-06-01 UTC.
entity_keys = [(1, 1), (1, 2), (2, 1)]
store_ids, dept_ids = zip(*entity_keys)
entity_df = pd.DataFrame({
    "store": list(store_ids),
    "dept": list(dept_ids),
    "event_timestamp": pd.to_datetime(["2012-06-01"] * len(entity_keys), utc=True),
})

historical_feature_refs = [
    "sales_features:weekly_sales",
    "sales_features:sales_lag_1",
    "store_features:store_size",
]
historical_job = store.get_historical_features(
    entity_df=entity_df,
    features=historical_feature_refs,
)
historical = historical_job.to_df()
historical


## Cleanup


In [None]:
# Uncomment to teardown:
# !cd {FEATURE_REPO} && feast teardown
