In [2]:
# Instacart Customer Segmentation - Feature Engineering
# Notebook 02: Creating User-Level Features for Clustering
# Optimized for GitHub Codespaces + Kaggle Hub

from __future__ import annotations

import gc
import json
import pickle
import sys
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import RobustScaler

# Silence library warnings for the session and apply the shared plotting theme.
warnings.filterwarnings("ignore")
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

banner_rule = "=" * 60
print(banner_rule)
print("FEATURE ENGINEERING FOR CUSTOMER SEGMENTATION")
print(banner_rule)

# =============================================================================
# 0. SETUP PATHS
# =============================================================================
# When the working directory is the notebooks/ folder, the project root is its parent.
launch_dir = Path.cwd()
project_root = launch_dir.parent if launch_dir.name == "notebooks" else launch_dir

# Make the project root importable (the `src` package is imported from it later).
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

print(f"\nüìÅ Path Configuration:")
print(f"  cwd: {Path.cwd()}")
print(f"  project_root: {project_root}")
print(f"  src exists: {(project_root / 'src').exists()}")

# Artifact directories, all rooted at <project_root>/artifacts.
artifacts_dir = project_root / "artifacts"
features_dir = artifacts_dir / "features"
figures_dir = artifacts_dir / "figures"
models_dir = artifacts_dir / "models"
reports_dir = artifacts_dir / "reports"

output_dirs = {
    "features": features_dir,
    "figures": figures_dir,
    "models": models_dir,
    "reports": reports_dir,
}

# Create every output directory (and missing parents); existing ones are left alone.
for out_dir in output_dirs.values():
    out_dir.mkdir(parents=True, exist_ok=True)

print(f"\nüì¶ Output directories:")
for label, out_dir in output_dirs.items():
    print(f"  {label}: {out_dir}")

# =============================================================================
# 1. LOAD DATA VIA KAGGLE HUB
# =============================================================================
print("\n[1] Loading datasets via kagglehub...")

# Project-local import: it relies on project_root having been inserted into
# sys.path by the path setup earlier in this notebook, so it cannot be moved
# above that step.
from src.data.kaggle_download import download_instacart

# download_instacart() is defined outside this notebook; its return value is
# wrapped in Path and used below as the directory that holds the dataset CSVs.
# NOTE(review): presumably it downloads the dataset or reuses a cached copy — confirm.
data_dir = Path(download_instacart())
print(f"  data_dir: {data_dir}")

def read_csv(name: str, usecols: list[str] | None = None, dtype: dict | None = None) -> pd.DataFrame:
    """Load one CSV file from the dataset directory.

    Args:
        name: File name, resolved relative to the module-level ``data_dir``.
        usecols: Optional subset of columns, forwarded to ``pd.read_csv``.
        dtype: Optional column-to-dtype mapping, forwarded to ``pd.read_csv``.

    Returns:
        The parsed file as a DataFrame.

    Raises:
        FileNotFoundError: If the file does not exist under ``data_dir``.
    """
    csv_path = data_dir / name
    if csv_path.exists():
        return pd.read_csv(csv_path, usecols=usecols, dtype=dtype)
    raise FileNotFoundError(f"Missing file: {csv_path}")

# Explicit narrow dtypes keep the loaded frames small in memory.
dtype_orders = dict(
    order_id=np.int32,
    user_id=np.int32,
    eval_set="category",
    order_number=np.int16,
    order_dow=np.int8,
    order_hour_of_day=np.int8,
    days_since_prior_order=np.float32,
)

dtype_order_products = dict(
    order_id=np.int32,
    product_id=np.int32,
    add_to_cart_order=np.int16,
    reordered=np.int8,
)

dtype_products = dict(
    product_id=np.int32,
    aisle_id=np.int16,
    department_id=np.int16,
)

# Core tables: orders plus the three product-catalog lookup tables.
orders = read_csv("orders.csv", dtype=dtype_orders)
products = read_csv("products.csv", dtype=dtype_products)
aisles = read_csv("aisles.csv", dtype=dict(aisle_id=np.int16))
departments = read_csv("departments.csv", dtype=dict(department_id=np.int16))

print(f"\n‚úì Loaded core datasets:")
print(f"  Orders: {orders.shape}")
print(f"  Products: {products.shape}")
print(f"  Aisles: {aisles.shape}")
print(f"  Departments: {departments.shape}")

# =============================================================================
# 2. LOAD QUALIFIED USERS & FILTER ORDERS
# =============================================================================
print("\n[2] Loading qualified users and filtering...")

# The qualified-user list is read from the features artifact directory; the
# error message below points at the notebook expected to produce it.
qualified_users_path = features_dir / "qualified_users.csv"
if not qualified_users_path.exists():
    missing_artifact_msg = (
        f"Qualified users not found at {qualified_users_path}. "
        "Please run 01_data_exploration.ipynb first!"
    )
    raise FileNotFoundError(missing_artifact_msg)

qualified_users = pd.read_csv(qualified_users_path, dtype={"user_id": np.int32})
print(f"  Qualified users: {len(qualified_users):,}")

# Restrict orders to qualified users up front so every later step handles fewer rows.
is_qualified_order = orders["user_id"].isin(qualified_users["user_id"])
orders_f = orders.loc[is_qualified_order].copy()
print(f"  Filtered orders: {len(orders_f):,} (from {len(orders):,})")

# The unfiltered frame and the mask are no longer needed.
del orders, is_qualified_order
gc.collect()

# =============================================================================
# 3. LOAD ORDER PRODUCTS (CHUNKED & FILTERED)
# =============================================================================
print("\n[3] Loading order products (memory-optimized)...")

# Keep the order IDs of interest as a compact integer array. A Python set of
# boxed integers takes far more memory and must be converted back into an
# array on every `isin` call below.
relevant_order_ids = orders_f["order_id"].unique()
print(f"  Relevant order IDs: {len(relevant_order_ids):,}")

# Stream the large prior file in chunks, keeping only rows of relevant orders.
print("  Loading order_products__prior.csv in chunks...")
chunks = []
chunk_size = 5_000_000
rows_read = 0

for i, chunk in enumerate(pd.read_csv(
    data_dir / "order_products__prior.csv",
    dtype=dtype_order_products,
    chunksize=chunk_size
)):
    rows_read += len(chunk)

    # Boolean-mask indexing already returns a new frame, so no extra .copy()
    # is needed before the source chunk is released.
    chunks.append(chunk[chunk["order_id"].isin(relevant_order_ids)])

    if (i + 1) % 5 == 0:
        # Report the rows actually read instead of assuming every chunk was full.
        print(f"    Processed {rows_read:,} rows...")

    del chunk
    gc.collect()

order_products_prior = pd.concat(chunks, ignore_index=True)
del chunks
gc.collect()

print(f"  ‚úì Prior products: {order_products_prior.shape}")

# The train file is read in one go and filtered the same way.
print("  Loading order_products__train.csv...")
order_products_train = read_csv("order_products__train.csv", dtype=dtype_order_products)
order_products_train = order_products_train[
    order_products_train["order_id"].isin(relevant_order_ids)
].copy()
print(f"  ‚úì Train products: {order_products_train.shape}")

# Combine prior + train into a single order-product table.
op = pd.concat([order_products_prior, order_products_train], ignore_index=True)
del order_products_prior, order_products_train
gc.collect()

print(f"\n‚úì Total order-product pairs: {len(op):,}")

# Attach user_id to every line item so later aggregations can group by user
# without joining against the orders table again.
print("  Merging user_id into order_products...")
op = op.merge(orders_f[["order_id", "user_id"]], on="order_id", how="left")
print(f"  ‚úì Order-products with user_id: {op.shape}")

# =============================================================================
# 4. CREATE PRODUCT CATALOG
# =============================================================================
print("\n[4] Creating product catalog...")

# Attach aisle columns first, then department columns; left joins keep every
# product row even when a lookup key has no match.
products_full = pd.merge(
    pd.merge(products, aisles, on="aisle_id", how="left"),
    departments,
    on="department_id",
    how="left",
)

print(f"  ‚úì Products catalog: {products_full.shape}")

# =============================================================================
# 5. INITIALIZE FEATURE DATAFRAME
# =============================================================================
print("\n[5] Initializing feature engineering...")

# One row per qualified user; every feature below is left-merged onto this frame.
user_features = qualified_users[["user_id"]].copy()
print(f"  Feature matrix initialized: {user_features.shape}")

# =============================================================================
# 6. PRODUCT PREFERENCE FEATURES (OPTIMIZED - MINIMAL SET)
# =============================================================================
print("\n[6] Engineering Product Preference Features...")
print("-" * 60)

# Attach catalog attributes to every purchased line item once; the product
# preference sub-sections all read from this merged frame.
print("  ‚Üí Merging product catalog with order products...")
catalog_cols = ["product_id", "aisle", "department_id", "product_name"]
op_with_products = op.merge(products_full[catalog_cols], on="product_id", how="left")
print(f"    Merged shape: {op_with_products.shape}")

# 6.1 Top 10 aisles
print("\n  ‚Üí Computing top 10 aisle purchase ratios...")

# The ten aisles with the most purchased line items (value_counts sorts descending).
aisle_line_counts = op_with_products["aisle"].value_counts()
top_aisles = aisle_line_counts.index[:10].tolist()
print(f"    Top 10 aisles: {', '.join(top_aisles[:3])}...")

# Only line items from those aisles take part in the aggregation.
top_aisle_rows = op_with_products[op_with_products["aisle"].isin(top_aisles)].copy()

# Numerator: distinct orders per (user, aisle).
orders_per_user_aisle = (
    top_aisle_rows.groupby(["user_id", "aisle"])["order_id"]
    .nunique()
    .reset_index(name="orders_with_aisle")
)

# Denominator: distinct orders per user, taken from the filtered orders table.
orders_per_user = (
    orders_f.groupby("user_id")["order_id"]
    .nunique()
    .reset_index(name="total_orders")
)

# Share of a user's orders that contain at least one item from the aisle.
aisle_ratio_long = orders_per_user_aisle.merge(orders_per_user, on="user_id")
aisle_ratio_long["aisle_ratio"] = (
    aisle_ratio_long["orders_with_aisle"].div(aisle_ratio_long["total_orders"])
).astype(np.float32)

# One column per aisle; (user, aisle) pairs that never occurred become 0.
aisle_wide = aisle_ratio_long.pivot(
    index="user_id",
    columns="aisle",
    values="aisle_ratio"
).fillna(0)

# Column names: lower-case, underscores for spaces, "and" for "&", "aisle_" prefix.
aisle_wide.columns = [
    f"aisle_{name.lower().replace(' ', '_').replace('&', 'and')}"
    for name in aisle_wide.columns
]
aisle_wide = aisle_wide.reset_index()

# Users without any top-aisle purchase get NaN here (left merge).
user_features = user_features.merge(aisle_wide, on="user_id", how="left")

del aisle_line_counts, top_aisle_rows, orders_per_user_aisle, orders_per_user
del aisle_ratio_long, aisle_wide
gc.collect()

n_aisle_features = sum(col.startswith("aisle_") for col in user_features.columns)
print(f"  ‚úì Created {n_aisle_features} aisle features")

# 6.2 Department Diversity
print("\n  ‚Üí Computing department diversity...")

# Number of distinct departments each user has bought from, stored as int8.
dept_counts = op_with_products.groupby("user_id")["department_id"].nunique()
user_dept_diversity = dept_counts.astype(np.int8).reset_index(name="dept_diversity")

user_features = user_features.merge(user_dept_diversity, on="user_id", how="left")
del dept_counts, user_dept_diversity
gc.collect()

# 6.3 Organic Preference
print("\n  ‚Üí Computing organic product preference...")

# Run the keyword match once per catalog product rather than once per purchased
# line item, then flag line items by product ID. The merged product_name column
# was joined on product_id, so the flags are the same as matching that column
# directly, without a string scan over the whole order-product frame.
organic_product_ids = products_full.loc[
    products_full["product_name"].str.contains("organic", case=False, na=False),
    "product_id",
]
op_with_products["is_organic"] = (
    op_with_products["product_id"].isin(organic_product_ids).astype(np.int8)
)

# Share of each user's purchased line items whose product name contains "organic".
user_organic = (
    op_with_products.groupby("user_id")["is_organic"]
    .mean()
    .reset_index(name="organic_ratio")
    .astype({"organic_ratio": np.float32})
)

user_features = user_features.merge(user_organic, on="user_id", how="left")

# 6.4 Fresh vs Packaged Ratio
print("\n  ‚Üí Computing fresh food preference...")

# Same approach: find the products whose aisle name contains "fresh" in the
# catalog, then flag line items by product ID.
fresh_product_ids = products_full.loc[
    products_full["aisle"].str.contains("fresh", case=False, na=False),
    "product_id",
]
op_with_products["is_fresh"] = (
    op_with_products["product_id"].isin(fresh_product_ids).astype(np.int8)
)

# Share of each user's purchased line items that come from a "fresh" aisle.
user_fresh = (
    op_with_products.groupby("user_id")["is_fresh"]
    .mean()
    .reset_index(name="fresh_ratio")
    .astype({"fresh_ratio": np.float32})
)

user_features = user_features.merge(user_fresh, on="user_id", how="left")

# The merged line-item frame is not used after this point.
del user_organic, user_fresh, organic_product_ids, fresh_product_ids, op_with_products
gc.collect()

print(f"  ‚úì Product preference features: {user_features.shape}")

# =============================================================================
# 7. SHOPPING HABIT FEATURES
# =============================================================================
print("\n[7] Engineering Shopping Habit Features...")
print("-" * 60)


def _coefficient_of_variation(values):
    """Return std / mean of ``values``, or 0 when the mean is not greater than 0."""
    return values.std() / values.mean() if values.mean() > 0 else 0


# 7.1 Order Frequency
print("\n  ‚Üí Computing order frequency metrics...")

# Rows without a days_since_prior_order value are excluded before aggregating.
orders_with_gap = orders_f.dropna(subset=["days_since_prior_order"])
user_order_freq = (
    orders_with_gap.groupby("user_id")["days_since_prior_order"]
    .agg(
        avg_days_between_orders="mean",
        std_days_between_orders="std",
        cv_order_frequency=_coefficient_of_variation,
    )
    .reset_index()
    .astype(dict.fromkeys(
        ["avg_days_between_orders", "std_days_between_orders", "cv_order_frequency"],
        np.float32,
    ))
)
del orders_with_gap

user_features = user_features.merge(user_order_freq, on="user_id", how="left")

# 7.2 Basket Size
print("\n  ‚Üí Computing basket size metrics...")

# Line items per order, stored as int16.
basket_size = (
    op.groupby("order_id")
    .size()
    .astype(np.int16)
    .reset_index(name="basket_size")
)

# Orders without any line item in `op` get a NaN basket size from the left
# merge; mean and std skip NaN.
orders_with_basket = orders_f.merge(basket_size, on="order_id", how="left")
user_basket = (
    orders_with_basket.groupby("user_id")["basket_size"]
    .agg(
        avg_basket_size="mean",
        std_basket_size="std",
        cv_basket_size=_coefficient_of_variation,
    )
    .reset_index()
    .astype(dict.fromkeys(
        ["avg_basket_size", "std_basket_size", "cv_basket_size"],
        np.float32,
    ))
)
del orders_with_basket

user_features = user_features.merge(user_basket, on="user_id", how="left")

del basket_size, user_order_freq, user_basket
gc.collect()

# 7.3 Total Orders
print("\n  ‚Üí Computing total order count...")

# Number of rows in the filtered orders table per user, stored as int16.
user_order_count = (
    orders_f.groupby("user_id")["order_id"]
    .count()
    .astype(np.int16)
    .reset_index(name="total_orders")
)

user_features = user_features.merge(user_order_count, on="user_id", how="left")

del user_order_count
gc.collect()

print(f"  ‚úì Shopping habit features: {user_features.shape}")

# =============================================================================
# 8. TEMPORAL PATTERN FEATURES
# =============================================================================
print("\n[8] Engineering Temporal Pattern Features...")
print("-" * 60)

# 8.1 Preferred Hour
print("\n  ‚Üí Computing hour preferences...")

# Mean and standard deviation of the order hour per user, stored as float32.
user_hour = (
    orders_f.groupby("user_id")["order_hour_of_day"]
    .agg(avg_order_hour="mean", std_order_hour="std")
    .reset_index()
    .astype({"avg_order_hour": np.float32, "std_order_hour": np.float32})
)

user_features = user_features.merge(user_hour, on="user_id", how="left")

# 8.2 Weekend Preference
print("\n  ‚Üí Computing weekend preference...")

# Orders whose day-of-week code is 0 or 6 are counted as weekend orders.
# NOTE(review): this assumes 0 and 6 are the weekend codes in order_dow —
# confirm against the dataset's day-of-week encoding.
orders_f["is_weekend"] = orders_f["order_dow"].isin([0, 6]).astype(np.int8)

# Share of each user's orders flagged as weekend, stored as float32.
user_weekend = (
    orders_f.groupby("user_id")["is_weekend"]
    .mean()
    .astype(np.float32)
    .reset_index(name="weekend_ratio")
)

user_features = user_features.merge(user_weekend, on="user_id", how="left")

# 8.3 Day Diversity
print("\n  ‚Üí Computing day diversity...")

# Number of distinct day-of-week codes each user has ordered on, stored as int8.
user_dow_diversity = (
    orders_f.groupby("user_id")["order_dow"]
    .nunique()
    .astype(np.int8)
    .reset_index(name="dow_diversity")
)

user_features = user_features.merge(user_dow_diversity, on="user_id", how="left")

# 8.4 Temporal Entropy
print("\n  ‚Üí Computing temporal consistency...")

def temporal_entropy(hours):
    """Return the entropy of the distribution of order hours.

    Args:
        hours: Sequence (list, array or Series) of order-hour values.

    Returns:
        0.0 when fewer than two values are given; otherwise the entropy
        (as computed by ``scipy.stats.entropy``) of the relative frequency
        of each distinct value, as a Python float.
    """
    n_observations = len(hours)
    if n_observations >= 2:
        hour_shares = pd.Series(hours).value_counts(normalize=True)
        return float(stats.entropy(hour_shares))
    return 0.0

# Entropy of each user's order-hour distribution, stored as float32.
hour_entropy_by_user = (
    orders_f.groupby("user_id")["order_hour_of_day"].apply(temporal_entropy)
)
user_time_entropy = (
    hour_entropy_by_user.astype(np.float32).reset_index(name="time_entropy")
)

user_features = user_features.merge(user_time_entropy, on="user_id", how="left")

# Release the per-user temporal intermediates.
del hour_entropy_by_user
del user_hour, user_weekend, user_dow_diversity, user_time_entropy
gc.collect()

print(f"  ‚úì Temporal features: {user_features.shape}")

# None of the remaining steps read the filtered orders table, so release it.
del orders_f
gc.collect()

# =============================================================================
# 9. LOYALTY INDICATOR FEATURES
# =============================================================================
print("\n[9] Engineering Loyalty Indicator Features...")
print("-" * 60)

# 9.1 Reorder Ratio
print("\n  ‚Üí Computing reorder ratios...")

# Mean of the `reordered` flag over each user's line items, stored as float32.
user_reorder = (
    op.groupby("user_id")["reordered"]
    .mean()
    .astype(np.float32)
    .reset_index(name="reorder_ratio")
)

user_features = user_features.merge(user_reorder, on="user_id", how="left")

# 9.2 Product Variety
print("\n  ‚Üí Computing product variety...")

# Number of distinct products each user has bought, stored as int16.
user_product_variety = (
    op.groupby("user_id")["product_id"]
    .nunique()
    .astype(np.int16)
    .reset_index(name="unique_products")
)

user_features = user_features.merge(user_product_variety, on="user_id", how="left")

# 9.3 Exploration Metric
print("\n  ‚Üí Computing exploration score...")

# Distinct products divided by the user's order count.
user_features["products_per_order"] = (
    user_features["unique_products"].div(user_features["total_orders"])
).astype(np.float32)

# 9.4 Repeat Purchase Rate
print("\n  ‚Üí Computing repeat purchase metrics...")

# Total purchased line items per user, stored as int32.
total_product_instances = (
    op.groupby("user_id")["product_id"]
    .count()
    .astype(np.int32)
    .reset_index(name="total_product_instances")
)

user_features = user_features.merge(total_product_instances, on="user_id", how="left")

# Share of line items beyond the first purchase of each distinct product.
repeat_line_items = (
    user_features["total_product_instances"] - user_features["unique_products"]
)
user_features["repeat_purchase_rate"] = (
    repeat_line_items / user_features["total_product_instances"]
).astype(np.float32)

del user_reorder, user_product_variety, total_product_instances, repeat_line_items
gc.collect()

print(f"  ‚úì Loyalty features: {user_features.shape}")

# The order-product table is not read after this point, so release it.
del op
gc.collect()

# =============================================================================
# 10. HANDLE MISSING VALUES
# =============================================================================
print("\n[10] Handling Missing Values...")
print("-" * 60)

# Per-column count of missing cells, narrowed to the columns that have any.
missing_counts = user_features.isnull().sum()
missing_features = missing_counts[missing_counts > 0]

if missing_features.empty:
    print("  ‚úì No missing values!")
else:
    print(f"  ‚ö†Ô∏è  Features with missing values: {len(missing_features)}")
    for column_name, n_missing in missing_features.items():
        print(f"    {column_name}: {n_missing}")
    # Every missing cell, in every column, is replaced with 0.
    print("  ‚Üí Filling with 0...")
    user_features = user_features.fillna(0)

# =============================================================================
# 11. FEATURE SCALING
# =============================================================================
print("\n[11] Feature Scaling (Critical for Clustering)...")
print("-" * 60)

# Every column except the user identifier is scaled.
feature_cols = [name for name in user_features.columns if name != "user_id"]
print(f"\n  Features to scale: {len(feature_cols)}")

# Fit RobustScaler on the unscaled features and transform them in one step.
print("\n  ‚Üí Applying RobustScaler...")
scaler = RobustScaler()

scaled_values = scaler.fit_transform(user_features[feature_cols])
scaled_df = pd.DataFrame(
    scaled_values,
    columns=feature_cols,
    index=user_features.index
)

# Scaled frame = untouched user_id column + scaled feature columns (same row index).
user_id_frame = user_features[["user_id"]].copy()
user_features_scaled = pd.concat([user_id_frame, scaled_df], axis=1)

# Per-feature mean / std before and after scaling; the min and max across
# features are printed as a quick sanity check.
original_means = user_features[feature_cols].mean()
scaled_means = user_features_scaled[feature_cols].mean()
original_stds = user_features[feature_cols].std()
scaled_stds = user_features_scaled[feature_cols].std()

print("\n  ‚úì Scaling verification:")
print(f"    Original mean range: [{original_means.min():.4f}, {original_means.max():.4f}]")
print(f"    Scaled mean range: [{scaled_means.min():.4f}, {scaled_means.max():.4f}]")
print(f"    Original std range: [{original_stds.min():.4f}, {original_stds.max():.4f}]")
print(f"    Scaled std range: [{scaled_stds.min():.4f}, {scaled_stds.max():.4f}]")

# Persist the fitted scaler with pickle.
scaler_path = models_dir / "feature_scaler.pkl"
with scaler_path.open("wb") as scaler_file:
    pickle.dump(scaler, scaler_file)
print(f"\n  ‚úì Scaler saved to {scaler_path}")

# =============================================================================
# 12. FEATURE VARIANCE ANALYSIS
# =============================================================================
print("\n[12] Feature Variance Analysis...")
print("-" * 60)

# Variance of every scaled feature, largest first.
feature_variance = user_features_scaled[feature_cols].var().sort_values(ascending=False)

variance_listings = (
    ("\n  Top 10 highest variance features:", feature_variance.head(10)),
    ("\n  Bottom 10 lowest variance features:", feature_variance.tail(10)),
)
for heading, listed in variance_listings:
    print(heading)
    for rank, (name, value) in enumerate(listed.items(), 1):
        print(f"    {rank}. {name}: {value:.4f}")

# Features whose scaled variance falls below the threshold are flagged (not removed).
low_var_threshold = 0.01
low_var_features = feature_variance.index[feature_variance < low_var_threshold].tolist()

if low_var_features:
    print(f"\n  ‚ö†Ô∏è  {len(low_var_features)} features have very low variance (< {low_var_threshold})")
else:
    print("\n  ‚úì All features have sufficient variance")

# =============================================================================
# 13. SAVE FEATURES
# =============================================================================
print("\n[13] Saving Features...")
print("-" * 60)

# Unscaled feature table.
unscaled_path = features_dir / "user_features.csv"
user_features.to_csv(unscaled_path, index=False)
print(f"  ‚úì Unscaled features: {unscaled_path.name} {user_features.shape}")

# Scaled feature table (the clustering input).
scaled_path = features_dir / "user_features_scaled.csv"
user_features_scaled.to_csv(scaled_path, index=False)
print(f"  ‚úì Scaled features: {scaled_path.name} {user_features_scaled.shape}")

# Per-feature metadata: scaled variance plus mean / std before and after scaling.
unscaled_block = user_features[feature_cols]
scaled_block = user_features_scaled[feature_cols]
feature_metadata = pd.DataFrame({
    "feature_name": feature_cols,
    "variance_scaled": [feature_variance.get(name, 0) for name in feature_cols],
    "mean_original": unscaled_block.mean().to_numpy(),
    "std_original": unscaled_block.std().to_numpy(),
    "mean_scaled": scaled_block.mean().to_numpy(),
    "std_scaled": scaled_block.std().to_numpy(),
})
del unscaled_block, scaled_block

metadata_path = features_dir / "feature_metadata.csv"
feature_metadata.to_csv(metadata_path, index=False)
print(f"  ‚úì Feature metadata: {metadata_path.name}")

# Plain list of feature names, one per row.
feature_names_path = features_dir / "feature_names.csv"
pd.DataFrame({"feature_name": feature_cols}).to_csv(feature_names_path, index=False)
print(f"  ‚úì Feature names: {feature_names_path.name}")

# =============================================================================
# 14. CREATE FEATURE DESCRIPTION TABLE
# =============================================================================
print("\n[14] Creating Feature Description Table...")
print("-" * 60)

# Split the feature list into the generated aisle columns and everything else.
aisle_cols = [col for col in feature_cols if col.startswith("aisle_")]
other_cols = [col for col in feature_cols if not col.startswith("aisle_")]

feature_categories = {
    "aisle_features": aisle_cols,
    "shopping_features": [
        "avg_days_between_orders", "std_days_between_orders", "cv_order_frequency",
        "avg_basket_size", "std_basket_size", "cv_basket_size", "total_orders"
    ],
    "temporal_features": [
        "avg_order_hour", "std_order_hour", "weekend_ratio",
        "dow_diversity", "time_entropy"
    ],
    "loyalty_features": [
        "reorder_ratio", "unique_products", "products_per_order",
        "total_product_instances", "repeat_purchase_rate"
    ],
}

# Every non-aisle feature not claimed by a named category goes into
# "other_features". Deriving the list instead of hard-coding it keeps features
# such as fresh_ratio from being silently left out of the categories file.
categorized = {name for names in feature_categories.values() for name in names}
feature_categories["other_features"] = [
    col for col in other_cols if col not in categorized
]

# Persist the category -> feature-name mapping as JSON.
categories_path = features_dir / "feature_categories.json"
with open(categories_path, "w", encoding="utf-8") as f:
    json.dump(feature_categories, f, indent=2)
print(f"  ‚úì Feature categories: {categories_path.name}")

# Print summary
print(f"\nüì¶ Feature Categories:")
for cat_name, cat_features in feature_categories.items():
    print(f"  {cat_name}: {len(cat_features)}")
print(f"  Total: {len(feature_cols)}")

# =============================================================================
# 15. VISUALIZATIONS
# =============================================================================
print("\n[15] Creating Visualizations...")
print("-" * 60)

# Select key features to visualize
key_features = [
    "avg_days_between_orders",
    "avg_basket_size",
    "dept_diversity",
    "reorder_ratio",
    "organic_ratio",
    "weekend_ratio",
    "avg_order_hour",
    "unique_products"
]

# Use one filtered list for both figures, so the heatmap cannot raise a
# KeyError on a column that the histogram grid would simply skip.
plot_features = [name for name in key_features if name in user_features.columns]

# Feature distributions: one histogram per available key feature on a 2x4 grid.
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.flatten()

for ax, feature in zip(axes, plot_features):
    ax.hist(
        user_features[feature],
        bins=50,
        edgecolor="black",
        alpha=0.7,
        color="steelblue"
    )
    ax.set_title(
        feature.replace("_", " ").title(),
        fontsize=10,
        fontweight="bold"
    )
    ax.set_xlabel("Value")
    ax.set_ylabel("Frequency")
    ax.grid(axis="y", alpha=0.3)

    # Mark the mean with a dashed vertical line.
    mean_val = user_features[feature].mean()
    ax.axvline(
        mean_val,
        color="red",
        linestyle="--",
        label=f"Mean: {mean_val:.2f}",
        linewidth=2
    )
    ax.legend(fontsize=8)

# Hide grid cells that did not receive a feature.
for ax in axes[len(plot_features):]:
    ax.set_visible(False)

fig.tight_layout()
dist_path = figures_dir / "feature_distributions.png"
fig.savefig(dist_path, dpi=300, bbox_inches="tight")
print(f"  ‚úì Saved: {dist_path.name}")
plt.close(fig)

# Correlation heatmap over the same available key features.
corr_path = figures_dir / "feature_correlations.png"
corr_matrix = user_features[plot_features].corr()

if len(plot_features) > 1:
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(
        corr_matrix,
        annot=True,
        fmt=".2f",
        cmap="coolwarm",
        center=0,
        square=True,
        linewidths=1,
        cbar_kws={"shrink": 0.8},
        ax=ax
    )
    ax.set_title(
        "Feature Correlation Matrix (Key Features)",
        fontsize=14,
        fontweight="bold",
        pad=20
    )
    fig.tight_layout()
    fig.savefig(corr_path, dpi=300, bbox_inches="tight")
    print(f"  ‚úì Saved: {corr_path.name}")
    plt.close(fig)
else:
    # A correlation matrix needs at least two columns to be meaningful.
    print("  Skipped correlation heatmap: fewer than two key features available")

# =============================================================================
# 16. SUMMARY REPORT
# =============================================================================
print("\n[16] Creating Summary Report...")
print("-" * 60)

# Measure the remaining missing cells instead of assuming 0, so the report
# reflects the actual state of the feature table.
remaining_missing = int(user_features.isnull().sum().sum())

summary_report = {
    "total_users": len(user_features),
    "total_features": len(feature_cols),
    "aisle_features": len(aisle_cols),
    "shopping_features": len(feature_categories["shopping_features"]),
    "temporal_features": len(feature_categories["temporal_features"]),
    "loyalty_features": len(feature_categories["loyalty_features"]),
    "other_features": len(feature_categories["other_features"]),
    "missing_values": remaining_missing,
    "scaling_method": "RobustScaler",
    "date_created": pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S")
}

# Written as a two-column CSV: the metric name as index, one "value" column.
summary_path = reports_dir / "feature_engineering_summary.csv"
pd.DataFrame([summary_report]).T.to_csv(summary_path, header=["value"])
print(f"  ‚úì Summary report: {summary_path}")

# =============================================================================
# COMPLETE!
# =============================================================================
print("\n" + "=" * 60)
print("‚úÖ FEATURE ENGINEERING COMPLETE!")
print("=" * 60)
print(f"\nüìä Summary:")
print(f"  Total users: {len(user_features):,}")
print(f"  Total features: {len(feature_cols)}")
print(f"  Memory usage: {user_features_scaled.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"\nüìÅ Output Files:")
print(f"  {unscaled_path}")
print(f"  {scaled_path} ‚Üê USE THIS FOR CLUSTERING")
print(f"  {metadata_path}")
print(f"  {scaler_path}")
print(f"\n‚úÖ Features are properly scaled and ready for clustering!")

FEATURE ENGINEERING FOR CUSTOMER SEGMENTATION

üìÅ Path Configuration:
  cwd: /workspaces/instacart-customer-clustering/notebooks
  project_root: /workspaces/instacart-customer-clustering
  src exists: True

üì¶ Output directories:
  features: /workspaces/instacart-customer-clustering/artifacts/features
  figures: /workspaces/instacart-customer-clustering/artifacts/figures
  models: /workspaces/instacart-customer-clustering/artifacts/models
  reports: /workspaces/instacart-customer-clustering/artifacts/reports

[1] Loading datasets via kagglehub...
  data_dir: /home/codespace/.cache/kagglehub/datasets/psparks/instacart-market-basket-analysis/versions/1

‚úì Loaded core datasets:
  Orders: (3421083, 7)
  Products: (49688, 4)
  Aisles: (134, 2)
  Departments: (21, 2)

[2] Loading qualified users and filtering...
  Qualified users: 182,223
  Filtered orders: 3,325,139 (from 3,421,083)

[3] Loading order products (memory-optimized)...
  Relevant order IDs: 3,325,139
  Loading order_produ