# In [1]:
import pandas as pd, numpy as np, holidays, pathlib
from sklearn.preprocessing import StandardScaler
# Project data locations, relative to the working directory.
DATA_DIR = pathlib.Path("../data")
RAW = DATA_DIR.joinpath("raw", "sales_data.csv")                 # input CSV
OUT = DATA_DIR.joinpath("processed", "sales_clean.parquet")      # cleaned output

# ----------------- ingest & basic tidy -----------------
# Read the raw CSV (asking pandas to parse "Date" as datetime), remove exact
# duplicate rows, order the rows by date, and rebuild a clean 0..n-1 index.
df = pd.read_csv(RAW, parse_dates=["Date"])
df = df.drop_duplicates()
df = df.sort_values(by="Date")
df = df.reset_index(drop=True)

# Cast the label-like columns to pandas' categorical dtype (example list).
cat_cols = [
    "Month", "Age_Group", "Customer_Gender", "Country",
    "State", "Product_Category", "Sub_Category", "Product",
]
df[cat_cols] = df[cat_cols].astype({name: "category" for name in cat_cols})

# ----------------- simple outlier cap (IQR) -------------
# Cap every numeric column to the fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
num_cols = df.select_dtypes("number").columns
for c in num_cols:
    # Series.quantile skips missing values. np.percentile propagates NaN, so a
    # column with any NaN would get NaN fences and never actually be capped.
    q1, q3 = df[c].quantile([0.25, 0.75])
    iqr = q3 - q1
    if pd.isna(iqr) or iqr == 0:
        # All-NaN column, or at least half the values identical: the fences
        # would collapse to a single point and flatten the whole column.
        continue
    lo, hi = q1 - 1.5*iqr, q3 + 1.5*iqr
    df[c] = df[c].clip(lower=lo, upper=hi)

# ----------------- date-derived features ----------------
# Day of week as an integer, Monday = 0.
df["dow"] = df["Date"].dt.weekday
# ISO-8601 week number.
df["week"] = df["Date"].dt.isocalendar()["week"]
# Calendar month as a "YYYY-MM" string.
df["yearmo"] = df["Date"].dt.to_period(freq="M").astype(str)

# holiday flag (US example – extend list later)
# A holidays calendar created without `years` starts empty and only fills in
# a year when a date from it is looked up, so handing it straight to
# Series.isin flags nothing. Its keys are also datetime.date objects rather
# than datetime64 values. Pre-populate it for the years present in the data
# and compare against midnight-normalized timestamps instead.
holiday_years = df["Date"].dt.year.dropna().astype(int).unique().tolist()
us_holidays = holidays.country_holidays("US", years=holiday_years)
us_holiday_dates = pd.to_datetime(list(us_holidays.keys()))
df["is_holiday_us"] = df["Date"].dt.normalize().isin(us_holiday_dates)

# ----------------- save & register ----------------------
# Make sure the output folder exists first; the write fails on a fresh
# checkout where "processed/" has not been created yet.
OUT.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(OUT, index=False)
