# Stage 09: Feature Engineering

## Overview
This notebook demonstrates the feature engineering process for bike demand prediction, creating meaningful features to improve model performance based on domain knowledge and EDA insights.

## Feature Engineering Strategy
Based on our EDA findings, we'll create features in the following categories:
1. **Temporal Features**: Hour patterns, weekend effects, cyclical time
2. **Weather Features**: Temperature-humidity interactions, comfort indices
3. **Derived Features**: Rolling averages, lag features, rate of change
4. **Interaction Features**: Complex relationships between variables

In [None]:
# Import libraries and feature engineering module
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from feature_engineering import (
    feature_engineering_pipeline,
    create_temporal_features,
    create_weather_features,
    create_derived_features,
    get_feature_importance_summary
)

# Global plot appearance for every figure in this notebook.
# Order matters: the palette is applied after the style sheet is loaded.
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')
plt.rcParams.update({'figure.figsize': (12, 8)})

print("🔧 Feature Engineering Libraries Loaded")

In [None]:
# Read the raw sample data that the later cells build on
data = pd.read_csv('../data/sample-data.csv')

# Quick structural overview, followed by a preview of the first rows
print(
    "📊 Original Dataset:",
    f"Shape: {data.shape}",
    f"Columns: {data.columns.tolist()}",
    "\nFirst 5 rows:",
    sep="\n",
)
data.head()

## 1. Temporal Feature Engineering

### Rationale:
- **Hour Categories**: Bike demand follows daily patterns (morning commute, lunch, evening)
- **Weekend Indicator**: Usage patterns differ between weekdays and weekends
- **Cyclical Encoding**: Hour is cyclical (23 is adjacent to 0); a sin/cos encoding captures this
- **Rush Hours**: Peak demand during commuting hours (7-9 AM, 5-7 PM)

In [None]:
# Build the temporal features on top of the raw data
data_temporal = create_temporal_features(data)

print("⏰ Temporal Features Created:")
temporal_features = [
    'hour_category', 'is_weekend', 'hour_sin', 'hour_cos',
    'is_morning_rush', 'is_evening_rush', 'is_work_hours',
]

# Report the dtype of every temporal feature
for feature in temporal_features:
    feature_dtype = data_temporal[feature].dtype
    print(f"   • {feature}: {feature_dtype}")

# 2x2 grid: one panel per temporal pattern
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_category, ax_weekend), (ax_cyclical, ax_rush) = axes

# Top-left: demand distribution per hour category
sns.boxplot(x='hour_category', y='demand', data=data_temporal, ax=ax_category)
ax_category.set_title('Demand by Hour Category')
ax_category.tick_params(axis='x', rotation=45)

# Top-right: weekday vs weekend demand
sns.boxplot(x='is_weekend', y='demand', data=data_temporal, ax=ax_weekend)
ax_weekend.set_title('Demand: Weekday (0) vs Weekend (1)')

# Bottom-left: sin/cos hour encoding, coloured by demand
scatter = ax_cyclical.scatter(
    data_temporal['hour_sin'],
    data_temporal['hour_cos'],
    c=data_temporal['demand'],
    cmap='viridis',
)
ax_cyclical.set(
    title='Cyclical Hour Encoding (colored by demand)',
    xlabel='Hour Sin',
    ylabel='Hour Cos',
)
plt.colorbar(scatter, ax=ax_cyclical)

# Bottom-right: mean demand per hour with both rush windows shaded
rush_data = data_temporal.groupby('hour')['demand'].mean().reset_index()
ax_rush.plot(rush_data['hour'], rush_data['demand'], marker='o')
rush_windows = [
    (7, 9, 'red', 'Morning Rush'),
    (17, 19, 'blue', 'Evening Rush'),
]
for window_start, window_end, window_color, window_label in rush_windows:
    ax_rush.axvspan(window_start, window_end, alpha=0.3,
                    color=window_color, label=window_label)
ax_rush.set(
    title='Demand by Hour with Rush Hour Highlights',
    xlabel='Hour',
    ylabel='Average Demand',
)
ax_rush.legend()

plt.tight_layout()
plt.show()

## 2. Weather Feature Engineering

### Rationale:
- **Temperature-Humidity Interaction**: Combined effect on comfort
- **Weather Comfort Index**: Optimal biking conditions around 20-25°C, 40-60% humidity
- **Temperature Categories**: Non-linear relationship with demand
- **Weather Extremes**: Very hot/cold or humid conditions deter biking

In [None]:
# Layer the weather features on top of the temporal frame
data_weather = create_weather_features(data_temporal)

print("🌡️ Weather Features Created:")
weather_features = [
    'temp_humidity_interaction', 'weather_comfort_index',
    'temperature_category', 'humidity_category',
    'is_temp_extreme', 'is_humidity_extreme', 'is_ideal_weather',
]

# Only report the features that are actually present in the frame
for feature in weather_features:
    if feature not in data_weather.columns:
        continue
    print(f"   • {feature}: {data_weather[feature].dtype}")

# 2x2 grid: one scatter plus three categorical box plots
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_comfort, ax_temp), (ax_humidity, ax_ideal) = axes

# Top-left: comfort index against demand
ax_comfort.scatter(
    data_weather['weather_comfort_index'],
    data_weather['demand'],
    alpha=0.7,
)
ax_comfort.set(
    title='Demand vs Weather Comfort Index',
    xlabel='Weather Comfort Index',
    ylabel='Demand',
)

# Remaining panels: demand distribution per categorical weather feature
boxplot_panels = [
    (ax_temp, 'temperature_category', 'Demand by Temperature Category'),
    (ax_humidity, 'humidity_category', 'Demand by Humidity Category'),
    (ax_ideal, 'is_ideal_weather', 'Demand: Non-ideal (0) vs Ideal Weather (1)'),
]
for panel_ax, panel_column, panel_title in boxplot_panels:
    sns.boxplot(x=panel_column, y='demand', data=data_weather, ax=panel_ax)
    panel_ax.set_title(panel_title)
ax_temp.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Demand mean / spread / sample size, split by the ideal-weather flag
print("\n📊 Weather Comfort Analysis:")
demand_by_ideal = data_weather.groupby('is_ideal_weather')['demand']
comfort_stats = demand_by_ideal.agg(['mean', 'std', 'count'])
print(comfort_stats)

## 3. Derived Features (Time Series)

### Rationale:
- **Rolling Averages**: Smooth out noise, capture trends
- **Lag Features**: Previous demand influences current demand
- **Rate of Change**: Momentum in weather and demand patterns
- **Moving Statistics**: Capture local patterns and variability

In [None]:
# Add the derived features on top of the weather frame
data_derived = create_derived_features(data_weather)

print("📈 Derived Features Created:")
derived_features = [
    'demand_rolling_3h', 'demand_rolling_std_3h', 'demand_rolling_6h',
    'demand_lag_1', 'demand_lag_2', 'temperature_change',
    'humidity_change', 'demand_change', 'demand_momentum',
]

# Only report the features that are actually present in the frame
for feature in derived_features:
    if feature not in data_derived.columns:
        continue
    print(f"   • {feature}: {data_derived[feature].dtype}")

# 2x2 grid: two line plots (row index on x) and two scatter plots
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_rolling, ax_lag), (ax_change, ax_momentum) = axes
row_index = data_derived.index

# Top-left: raw demand overlaid with its 3-hour rolling average
ax_rolling.plot(row_index, data_derived['demand'],
                label='Actual Demand', alpha=0.7)
ax_rolling.plot(row_index, data_derived['demand_rolling_3h'],
                label='3-Hour Rolling Average', linewidth=2)
ax_rolling.set(
    title='Demand vs Rolling Average',
    xlabel='Time Index',
    ylabel='Demand',
)
ax_rolling.legend()

# Top-right: demand against its one-step lag
ax_lag.scatter(data_derived['demand_lag_1'], data_derived['demand'], alpha=0.7)
ax_lag.set(
    title='Current vs Previous Hour Demand',
    xlabel='Previous Hour Demand',
    ylabel='Current Demand',
)

# Bottom-left: temperature change against demand change
ax_change.scatter(data_derived['temperature_change'],
                  data_derived['demand_change'], alpha=0.7)
ax_change.set(
    title='Temperature Change vs Demand Change',
    xlabel='Temperature Change',
    ylabel='Demand Change',
)

# Bottom-right: momentum series with a zero reference line
ax_momentum.plot(row_index, data_derived['demand_momentum'], alpha=0.7)
ax_momentum.axhline(y=0, color='red', linestyle='--', alpha=0.5)
ax_momentum.set(
    title='Demand Momentum (Acceleration)',
    xlabel='Time Index',
    ylabel='Demand Momentum',
)

plt.tight_layout()
plt.show()

## 4. Complete Feature Engineering Pipeline

Apply all feature engineering steps using the pipeline function.

In [None]:
# Apply complete feature engineering pipeline
engineered_data = feature_engineering_pipeline(data)

print("\n📊 Feature Engineering Summary:")
print(f"   Original features: {data.shape[1]}")
print(f"   Engineered features: {engineered_data.shape[1] - data.shape[1]}")
print(f"   Total features: {engineered_data.shape[1]}")
print(f"   Data points: {engineered_data.shape[0]}")

# Display feature categories
print("\n🏷️ Feature Categories:")
all_features = engineered_data.columns.tolist()

# Take the original column list from the input frame so this count always
# agrees with the "Original features" line printed above.
original_features = data.columns.tolist()

# Substring patterns per category. Order matters: every engineered column is
# assigned to the FIRST category it matches, so the groups are disjoint.
# Previously a name such as 'temp_humidity_interaction' or 'humidity_change'
# landed in two lists and was subtracted twice from the remainder below.
category_patterns = {
    'interaction': ('interaction', 'combo'),
    'derived': ('rolling', 'lag', 'change', 'momentum'),
    'temporal': ('hour_', 'weekend', 'rush', 'work'),
    'weather': ('temp_', 'humidity_', 'weather_', 'ideal'),
}
features_by_category = {name: [] for name in category_patterns}
other_features = []  # engineered columns matching no pattern above
original_names = set(original_features)

for feature in all_features:
    if feature in original_names:
        continue  # already counted as an original column
    for category_name, patterns in category_patterns.items():
        if any(pattern in feature for pattern in patterns):
            features_by_category[category_name].append(feature)
            break
    else:
        other_features.append(feature)

# Keep the per-category list names: the save cell reads them for the metadata.
temporal_features = features_by_category['temporal']
weather_features = features_by_category['weather']
derived_features = features_by_category['derived']
interaction_features = features_by_category['interaction']

print(f"   • Original: {len(original_features)} features")
print(f"   • Temporal: {len(temporal_features)} features")
print(f"   • Weather: {len(weather_features)} features")
print(f"   • Derived: {len(derived_features)} features")
print(f"   • Interaction: {len(interaction_features)} features")
# Remainder = engineered columns that matched no category pattern
print(f"   • One-hot encoded: {len(other_features)} features")

## 5. Feature Importance Analysis

In [None]:
# Calculate feature importance based on correlation with target
importance_df = get_feature_importance_summary(engineered_data)

print("🏆 Top 15 Most Important Features:")
print(importance_df.head(15))

# Visualize feature importance: horizontal bars, strongest feature on top
plt.figure(figsize=(12, 8))
top_features = importance_df.head(20)
bar_positions = range(len(top_features))
plt.barh(bar_positions, top_features['abs_correlation'])
plt.yticks(bar_positions, top_features['feature'])
plt.xlabel('Absolute Correlation with Demand')
plt.title('Top 20 Feature Importance (by Correlation)')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

# Feature importance by category.
# Rules are checked in order and the first match wins, so the most specific
# patterns come first. With the broad weather substrings ('temp', 'humidity')
# checked early, names like 'temp_humidity_interaction' or 'humidity_change'
# were labelled Weather and the Interaction/Derived labels were never reached.
category_rules = (
    ('Interaction', ('interaction',)),
    ('Derived', ('rolling', 'lag', 'change', 'momentum')),
    ('Temporal', ('hour_', 'weekend', 'rush', 'work')),
    ('Weather', ('temp', 'humidity', 'weather', 'ideal')),
)

print("\n📊 Feature Importance by Category:")
top_ten = importance_df.head(10)
# Walk name and score together instead of re-filtering the frame per feature
for feature, corr in zip(top_ten['feature'], top_ten['abs_correlation']):
    category = next(
        (label for label, patterns in category_rules
         if any(pattern in feature for pattern in patterns)),
        'Original/Other',
    )
    print(f"   {category:12} | {feature:25} | {corr:.3f}")

## 6. Feature Engineering Insights and Validation

In [None]:
# Correlation matrix of top features
top_feature_names = importance_df.head(15)['feature'].tolist()
# Add the target only if the importance table does not already list it, so the
# selection below never contains a duplicated 'demand' column.
if 'demand' not in top_feature_names:
    top_feature_names.append('demand')
top_features_data = engineered_data[top_feature_names]

plt.figure(figsize=(12, 10))
correlation_matrix = top_features_data.corr()
# Hide the upper triangle (including the diagonal): it mirrors the lower one
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(correlation_matrix, mask=mask, annot=True, cmap='RdYlBu_r', center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": .8})
plt.title('Correlation Matrix: Top 15 Features + Target')
plt.tight_layout()
plt.show()

# Feature engineering validation
numeric_data = engineered_data.select_dtypes(include=[np.number])
all_numeric = len(numeric_data.columns) == len(engineered_data.columns)
# numeric_only=True keeps this check from raising when a non-numeric column is
# present (recent pandas no longer drops such columns silently); that case is
# already reported by the "All features numeric" line.
has_variance = (engineered_data.var(numeric_only=True) > 0).all()

print("\n✅ Feature Engineering Validation:")
print(f"   • No missing values: {engineered_data.isnull().sum().sum() == 0}")
print(f"   • All features numeric: {all_numeric}")
print(f"   • Feature variance > 0: {has_variance}")
print(f"   • No infinite values: {np.isfinite(numeric_data).all().all()}")

# Data quality check
print("\n🔍 Data Quality Summary:")
print(f"   • Shape: {engineered_data.shape}")
print(f"   • Memory usage: {engineered_data.memory_usage(deep=True).sum() / 1024:.1f} KB")
print(f"   • Duplicate rows: {engineered_data.duplicated().sum()}")

## 7. Save Engineered Features

In [None]:
# Save engineered dataset
import os
from datetime import datetime

# Create processed data directory
os.makedirs('../data/processed', exist_ok=True)

# Output locations, defined once so the files written and the paths reported
# at the end cannot drift apart.
features_path = '../data/processed/engineered_features.csv'
importance_path = '../data/processed/feature_importance.csv'

# Save main engineered dataset
engineered_data.to_csv(features_path, index=False)

# Save feature importance
importance_df.to_csv(importance_path, index=False)

# Save feature metadata.
# Take one timestamp so the file name and the "Date:" line always agree.
saved_at = datetime.now()
timestamp = saved_at.strftime('%Y%m%d_%H%M%S')
metadata_file = f'../data/processed/feature_metadata_{timestamp}.txt'

# Explicit UTF-8: the report contains the non-ASCII bullet '•', which should
# not depend on the platform's default encoding.
with open(metadata_file, 'w', encoding='utf-8') as f:
    f.write("FEATURE ENGINEERING METADATA\n")
    f.write("=" * 40 + "\n\n")
    f.write(f"Date: {saved_at.strftime('%Y-%m-%d %H:%M:%S')}\n")
    f.write(f"Original features: {len(original_features)}\n")
    f.write(f"Engineered features: {engineered_data.shape[1] - len(original_features)}\n")
    f.write(f"Total features: {engineered_data.shape[1]}\n")
    f.write(f"Data points: {engineered_data.shape[0]}\n\n")

    f.write("FEATURE CATEGORIES:\n")
    f.write(f"• Temporal: {len(temporal_features)}\n")
    f.write(f"• Weather: {len(weather_features)}\n")
    f.write(f"• Derived: {len(derived_features)}\n")
    f.write(f"• Interaction: {len(interaction_features)}\n\n")

    f.write("TOP 10 FEATURES:\n")
    for _, row in importance_df.head(10).iterrows():
        # int(): the 'd' format code rejects floats, and the rank column may
        # be float-typed depending on how it was computed upstream.
        rank = int(row['importance_rank'])
        f.write(f"{rank:2d}. {row['feature']:30s} ({row['abs_correlation']:.3f})\n")

print("💾 Files Saved:")
print(f"   • Engineered features: {features_path}")
print(f"   • Feature importance: {importance_path}")
print(f"   • Metadata: {metadata_file}")

# NOTE(review): assumes importance_df is sorted strongest-first and non-empty — confirm
best = importance_df.iloc[0]
print("\n🎯 Feature Engineering Complete!")
print(f"   Ready for model training with {engineered_data.shape[1]} features")
print(f"   Top predictor: {best['feature']} (r={best['abs_correlation']:.3f})")