# Feature Analysis and Engineering
## Mall Movement Tracking Dataset

This notebook performs comprehensive feature engineering and analysis:
- Feature engineering pipeline
- Feature selection
- Feature importance analysis
- Before/after comparison


In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
import sys
import os

# Add project root to path so the project-local packages imported below resolve.
# The working directory is either the project root itself or its `notebooks`
# subfolder; in the latter case step up one level.
project_root = Path().resolve()
if project_root.name == 'notebooks':
    project_root = project_root.parent

# Keep the root at the front of sys.path (so project modules win name clashes)
# while dropping any earlier copies, so re-running this cell does not pile up
# duplicate entries.
_root_entry = str(project_root)
sys.path[:] = [entry for entry in sys.path if entry != _root_entry]
sys.path.insert(0, _root_entry)

# Import modules
from streamlit_app.utils.data_loader import load_processed_data
from features.feature_engineering import FeatureEngineer

# Plot defaults: seaborn "whitegrid" theme and a 12x6 default figure size.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Suppress every warning category for the remainder of the session.
warnings.simplefilter('ignore')

# Confirm that setup finished and show which directory was put on sys.path.
print("Libraries imported successfully!")
print(f"Project root: {project_root}")
print(f"Python path includes: {str(project_root)}")


Libraries imported successfully!
Project root: C:\Users\hp\Desktop\mall-movement-tracking ml
Python path includes: C:\Users\hp\Desktop\mall-movement-tracking ml


In [2]:
# Read the processed dataset through the project's loader.
df_original = load_processed_data()

# Banner plus the overall dimensions of the frame.
print("Original Dataset:", "=" * 60, sep="\n")
print(f"Shape: {df_original.shape}")
print(f"Columns: {len(df_original.columns)}")

# Numbered listing of every column together with its dtype.
print(f"\nColumn names:")
for position, column_name in enumerate(df_original.columns, start=1):
    column_dtype = df_original[column_name].dtype
    print(f"  {position:2d}. {column_name} ({column_dtype})")


Original Dataset:
Shape: (15839, 80)
Columns: 80

Column names:
   1. USERID (int64)
   2. TIMESTAMP (int64)
   3. WAP011 (int64)
   4. WAP012 (int64)
   5. WAP051 (int64)
   6. WAP052 (int64)
   7. WAP059 (int64)
   8. WAP060 (int64)
   9. WAP061 (int64)
  10. WAP062 (int64)
  11. WAP063 (int64)
  12. WAP064 (int64)
  13. WAP065 (int64)
  14. WAP066 (int64)
  15. WAP069 (int64)
  16. WAP070 (int64)
  17. WAP073 (int64)
  18. WAP074 (int64)
  19. WAP077 (int64)
  20. WAP078 (int64)
  21. WAP082 (int64)
  22. WAP083 (int64)
  23. WAP084 (int64)
  24. WAP085 (int64)
  25. WAP087 (int64)
  26. WAP096 (int64)
  27. WAP097 (int64)
  28. WAP098 (int64)
  29. WAP099 (int64)
  30. WAP117 (int64)
  31. WAP118 (int64)
  32. WAP121 (int64)
  33. WAP122 (int64)
  34. WAP127 (int64)
  35. WAP128 (int64)
  36. WAP131 (int64)
  37. WAP132 (int64)
  38. WAP144 (int64)
  39. WAP145 (int64)
  40. WAP155 (int64)
  41. WAP156 (int64)
  42. WAP161 (int64)
  43. WAP162 (int64)
  44. WAP248 (int64)
  45. WAP

In [None]:
# Create the feature-engineering helper that the following cells reuse.
fe = FeatureEngineer()

# Let the helper group the columns of the original frame; the result is
# indexed below by the keys 'datetime', 'categorical', 'numeric' and 'id'.
column_types = fe.detect_column_types(df_original)

# Banner, then one line per detected group.
print("Detected Column Types:", "=" * 60, sep="\n")
print(f"Datetime columns: {column_types['datetime']}")
print(f"Categorical columns: {column_types['categorical']}")
print(f"Numeric columns: {column_types['numeric']}")
print(f"ID columns: {column_types['id']}")


In [None]:
# Step 1: Handle Missing Values
print("Step 1: Handling Missing Values...")

# Count the gaps in the untouched frame first, so the "before" figure is
# measured on the original data.
missing_before = df_original.isnull().sum().sum()

# Hand the helper a copy so it cannot modify df_original.
df_step1 = fe.handle_missing_values(df_original.copy(), strategy='auto')
missing_after = df_step1.isnull().sum().sum()

print(f"  Missing values before: {missing_before:,}")
print(f"  Missing values after: {missing_after:,}")
# The check mark was previously stored as mojibake ("âœ“"); restored to U+2713.
print("  ✓ Missing values handled")


In [None]:
# Correlation analysis for numeric features
if len(numeric_features) > 1:
    # Limit the matrix to the first 15 numeric features (list order, not a
    # ranking) so the annotated heatmap stays legible. Slicing is a no-op when
    # there are 15 or fewer.
    top_features = numeric_features[:15]
    corr_matrix = df_engineered[top_features].corr()

    # Plot correlation heatmap; the mask hides the upper triangle and the
    # diagonal so every pair is shown once.
    plt.figure(figsize=(14, 10))
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
    sns.heatmap(corr_matrix, mask=mask, annot=True, fmt='.2f', cmap='coolwarm',
                center=0, square=True, linewidths=1, cbar_kws={"shrink": 0.8})
    plt.title('Correlation Matrix of Engineered Features (Top 15)', fontsize=16, pad=20)
    plt.tight_layout()
    plt.show()

    # Find highly correlated pairs
    print("\nHighly Correlated Feature Pairs (|correlation| > 0.7):")
    print("="*60)
    high_corr_pairs = []
    n_corr_cols = len(corr_matrix.columns)
    for i in range(n_corr_cols):
        for j in range(i + 1, n_corr_cols):
            corr_val = corr_matrix.iloc[i, j]
            # NaN correlations compare False here and are skipped.
            if abs(corr_val) > 0.7:
                high_corr_pairs.append((corr_matrix.columns[i], corr_matrix.columns[j], corr_val))

    # Rank by strength so the truncated listing below really shows the
    # strongest pairs instead of whichever came first in column order.
    high_corr_pairs.sort(key=lambda pair: abs(pair[2]), reverse=True)

    if high_corr_pairs:
        for col1, col2, corr in high_corr_pairs[:10]:  # Show top 10
            print(f"{col1:30s} <-> {col2:30s} : {corr:6.3f}")
    else:
        print("No highly correlated pairs found.")
else:
    print("Need at least 2 numeric features for correlation analysis.")


## 9. Feature Importance (Using Random Forest)


In [None]:
# Try to identify target column for feature importance: the first column whose
# name contains "zone" or "location" (case-insensitive). str() keeps the
# lookup working when a column label is not a string.
zone_cols = [
    col for col in df_engineered.columns
    if 'zone' in str(col).lower() or 'location' in str(col).lower()
]
target_col = zone_cols[0] if zone_cols else None

# Never feed the target to the model as a predictor: a numeric target that is
# also listed in numeric_features would leak the label and dominate the ranking.
# NOTE(review): other columns derived from the target (e.g. encodings of it)
# would leak too — confirm none are present in numeric_features.
feature_cols = [col for col in numeric_features if col != target_col]

if target_col is not None and feature_cols:
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.preprocessing import LabelEncoder

    # Prepare data: keep only rows that have a label, and fill missing
    # predictor values with 0.
    labelled_rows = df_engineered[target_col].notna()
    X = df_engineered.loc[labelled_rows, feature_cols].fillna(0)
    y = df_engineered.loc[labelled_rows, target_col]

    # Encode any non-numeric target (object, string or category dtype).
    if not pd.api.types.is_numeric_dtype(y):
        le = LabelEncoder()
        y = le.fit_transform(y)

    # Train random forest for feature importance
    print("Training Random Forest for Feature Importance...")
    rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    rf.fit(X, y)

    # Get feature importance, most important first
    feature_importance = pd.DataFrame({
        'Feature': feature_cols,
        'Importance': rf.feature_importances_
    }).sort_values('Importance', ascending=False)

    print(f"\nTop 20 Most Important Features:")
    print("="*60)
    display(feature_importance.head(20))

    # Visualize feature importance as a horizontal bar chart; the y-axis is
    # inverted so the most important feature sits at the top.
    plt.figure(figsize=(12, 8))
    top_features = feature_importance.head(20)
    plt.barh(range(len(top_features)), top_features['Importance'].values, color='steelblue')
    plt.yticks(range(len(top_features)), top_features['Feature'].values)
    plt.xlabel('Importance')
    plt.title('Top 20 Feature Importance (Random Forest)')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()
elif target_col is not None:
    print("No numeric features available besides the target. Skipping feature importance analysis.")
else:
    print("Target column not found. Skipping feature importance analysis.")


In [None]:
# Save engineered features under <project_root>/data/processed.
# Use the project_root resolved in the setup cell rather than the parent of the
# working directory, which is only correct when the kernel starts in `notebooks`.
output_path = project_root / "data" / "processed" / "engineered_features.csv"
# Create the target folder on first run so to_csv does not fail on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)
df_engineered.to_csv(output_path, index=False)

print(f"Engineered features saved to: {output_path}")
print(f"Shape: {df_engineered.shape}")
print(f"Columns: {len(df_engineered.columns)}")
