# Modeling Experiments
## Mall Movement Tracking - ML Model Training and Comparison

This notebook performs comprehensive ML model training experiments:
- Classification models (Random Forest, Decision Tree, XGBoost)
- Clustering models (K-Means, DBSCAN)
- Forecasting models (ARIMA, Prophet)
- Model comparison and evaluation


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
import sys
import os
import joblib
import json
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, classification_report, 
                             confusion_matrix, roc_auc_score, silhouette_score,
                             mean_squared_error, mean_absolute_error)

# Locate the project root: when the working directory is the `notebooks`
# folder the root is its parent, otherwise the working directory itself.
_working_dir = Path().resolve()
project_root = _working_dir.parent if _working_dir.name == 'notebooks' else _working_dir
sys.path.insert(0, str(project_root))  # lets the project imports below resolve

# Project-local modules (must come after the sys.path change above)
from streamlit_app.utils.data_loader import load_processed_data
from features.feature_engineering import FeatureEngineer

# Plot defaults for the whole notebook; all warnings are silenced from here on
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})
warnings.filterwarnings('ignore')

print(
    "Libraries imported successfully!",
    f"Project root: {project_root}",
    sep="\n",
)


## 1. Load and Prepare Data


In [None]:
# Step 1: read the processed dataset through the project loader
print("Loading processed data...")
df_original = load_processed_data()
print(f"Original data shape: {df_original.shape}")

# Step 2: run the project's feature-engineering pipeline on it
print("\nApplying feature engineering...")
fe = FeatureEngineer()
df = fe.engineer_features(df_original)
added_feature_count = df.shape[1] - df_original.shape[1]
print(f"Engineered data shape: {df.shape}")
print(f"New features created: {added_feature_count}")

# Step 3: quick overview of the engineered frame
row_count, column_count = df.shape
missing_total = df.isna().sum().sum()
numeric_column_count = df.select_dtypes(include=[np.number]).shape[1]
print("\nData Summary:")
print(f"  - Rows: {row_count:,}")
print(f"  - Columns: {column_count}")
print(f"  - Missing values: {missing_total:,}")
print(f"  - Numeric columns: {numeric_column_count}")


## 2. Classification Models - Zone Prediction


In [None]:
# Prepare data for classification: pick a target column, build the numeric
# feature matrix, encode the target when needed and create an 80/20 split.
print("=" * 60)
print("CLASSIFICATION MODELS")
print("=" * 60)

# Auto-detect the target: the first column whose name contains a zone-like
# keyword, falling back to the last column when nothing matches.
# NOTE(review): other zone-like columns stay in the feature set; confirm none
# of them is a re-encoding of the target (possible label leakage).
zone_cols = [
    col for col in df.columns
    if any(keyword in str(col).lower() for keyword in ('zone', 'location', 'space'))
]
target_col = zone_cols[0] if zone_cols else df.columns[-1]

print(f"\nTarget column: {target_col}")
print(f"Unique values: {df[target_col].nunique()}")

# Select features: every numeric column except the target itself.
# The previous `df[col].dtype in [np.number]` test compared a concrete dtype
# with the abstract np.number class, which does not reliably match dtypes such
# as int64, so numeric features were silently dropped. select_dtypes is the
# same mechanism the data summary cell uses to count numeric columns.
exclude_cols = ['target', target_col]
feature_cols = [
    col for col in df.select_dtypes(include=[np.number]).columns
    if col not in exclude_cols
]
if not feature_cols:
    raise ValueError("No numeric feature columns available for classification")

print(f"\nFeature columns: {len(feature_cols)}")

# Prepare X and y (missing feature values are filled with 0)
X = df[feature_cols].fillna(0)
y = df[target_col]

# Encode every non-numeric target (object, category, string dtypes), not only
# `object`; `le` is only defined when encoding happened, which the save cell
# checks for before persisting the encoder.
if not pd.api.types.is_numeric_dtype(y):
    le = LabelEncoder()
    y = le.fit_transform(y)
    print(f"Target encoded: {len(le.classes_)} classes")

# Split data (fixed seed so the experiment is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nTrain set: {X_train.shape}")
print(f"Test set: {X_test.shape}")


In [None]:
# Random Forest: fit on the training split, then score on the held-out split
print("\n" + "-" * 60, "Training Random Forest...", "-" * 60, sep="\n")

rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1).fit(X_train, y_train)

rf_pred = rf_model.predict(X_test)
rf_pred_proba = rf_model.predict_proba(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)

# Binary targets are scored on the second probability column; targets with
# more than two classes use one-vs-rest averaging over all columns.
if len(np.unique(y_test)) <= 2:
    rf_roc_auc = roc_auc_score(y_test, rf_pred_proba[:, 1])
else:
    rf_roc_auc = roc_auc_score(y_test, rf_pred_proba, multi_class='ovr')

print(f"Random Forest Accuracy: {rf_accuracy:.4f}")
print(f"Random Forest ROC-AUC: {rf_roc_auc:.4f}")

# Rank the features by the importance the fitted forest reports
rf_importance = pd.DataFrame(
    {'feature': feature_cols, 'importance': rf_model.feature_importances_}
)
rf_importance = rf_importance.sort_values('importance', ascending=False)

print("\nTop 10 Important Features:")
display(rf_importance.head(10))


In [None]:
# Decision Tree: single-tree model, scored on accuracy only
print("\n" + "-" * 60, "Training Decision Tree...", "-" * 60, sep="\n")

dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

dt_pred = dt_model.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_pred)

print(f"Decision Tree Accuracy: {dt_accuracy:.4f}")


In [None]:
# XGBoost: fit on the training split, then score on the held-out split
print("\n" + "-" * 60, "Training XGBoost...", "-" * 60, sep="\n")

xgb_model = XGBClassifier(random_state=42, n_jobs=-1)
xgb_model.fit(X_train, y_train)

xgb_pred = xgb_model.predict(X_test)
xgb_pred_proba = xgb_model.predict_proba(X_test)
xgb_accuracy = accuracy_score(y_test, xgb_pred)

# Binary targets are scored on the second probability column; targets with
# more than two classes use one-vs-rest averaging over all columns.
if len(np.unique(y_test)) <= 2:
    xgb_roc_auc = roc_auc_score(y_test, xgb_pred_proba[:, 1])
else:
    xgb_roc_auc = roc_auc_score(y_test, xgb_pred_proba, multi_class='ovr')

print(f"XGBoost Accuracy: {xgb_accuracy:.4f}")
print(f"XGBoost ROC-AUC: {xgb_roc_auc:.4f}")


In [None]:
# Collect the classification metrics into one table (one row per model).
# The decision tree has no ROC-AUC, so its cell is NaN.
classification_results = pd.DataFrame(
    [
        ('Random Forest', rf_accuracy, rf_roc_auc),
        ('Decision Tree', dt_accuracy, np.nan),
        ('XGBoost', xgb_accuracy, xgb_roc_auc),
    ],
    columns=['Model', 'Accuracy', 'ROC-AUC'],
)

print("\n" + "=" * 60, "CLASSIFICATION RESULTS SUMMARY", "=" * 60, sep="\n")
display(classification_results)

# Side-by-side bar charts: accuracy for every model, ROC-AUC only for the
# models that have one.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
roc_data = classification_results[classification_results['ROC-AUC'].notna()]

panels = [
    (axes[0], classification_results, 'Accuracy', ['steelblue', 'coral', 'lightgreen']),
    (axes[1], roc_data, 'ROC-AUC', ['steelblue', 'lightgreen']),
]
for ax, frame, metric, palette in panels:
    ax.bar(frame['Model'], frame[metric], color=palette, alpha=0.7)
    ax.set_ylabel(metric)
    ax.set_title(f'Classification Model {metric} Comparison')
    ax.set_ylim([0, 1])
    # Write each value just above its bar
    for i, v in enumerate(frame[metric]):
        ax.text(i, v + 0.02, f'{v:.3f}', ha='center', fontweight='bold')

plt.tight_layout()
plt.show()


## 3. Clustering Models - Customer Segmentation


In [None]:
# Prepare data for clustering: take all numeric columns, fill gaps with 0 and
# standardise them.
print("=" * 60)
print("CLUSTERING MODELS")
print("=" * 60)

# Select numeric features.
# The previous `df[col].dtype in [np.number]` test compared a concrete dtype
# with the abstract np.number class, which does not reliably match dtypes such
# as int64, so numeric columns were silently dropped. select_dtypes is the
# same mechanism the data summary cell uses to count numeric columns.
clustering_features = df.select_dtypes(include=[np.number]).columns.tolist()
if not clustering_features:
    raise ValueError("No numeric columns available for clustering")
X_cluster = df[clustering_features].fillna(0)

print(f"\nFeatures for clustering: {len(clustering_features)}")
print(f"Data shape: {X_cluster.shape}")

# Scale features; the fitted scaler is persisted later with the models
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_cluster)

print("Features scaled using StandardScaler")


In [None]:
# K-Means with a fixed number of clusters on the scaled features
print("\n" + "-" * 60, "Training K-Means...", "-" * 60, sep="\n")

n_clusters = 5
kmeans_model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
kmeans_labels = kmeans_model.fit_predict(X_scaled)

kmeans_silhouette = silhouette_score(X_scaled, kmeans_labels)

print(f"K-Means Clusters: {n_clusters}")
print(f"K-Means Silhouette Score: {kmeans_silhouette:.4f}")
print("Cluster distribution:")
# One entry per cluster id 0..n_clusters-1, including clusters with no members
total_points = len(kmeans_labels)
for i, count in enumerate(np.bincount(kmeans_labels, minlength=n_clusters)):
    print(f"  Cluster {i}: {count:,} customers ({count/total_points*100:.1f}%)")


In [None]:
# Train DBSCAN on the scaled features and report clusters, noise and silhouette
print("\n" + "-" * 60)
print("Training DBSCAN...")
print("-" * 60)

dbscan_model = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan_model.fit_predict(X_scaled)

# DBSCAN marks noise with the label -1; everything else is a real cluster id
clustered_mask = dbscan_labels != -1
n_clusters_dbscan = int(np.unique(dbscan_labels[clustered_mask]).size)
n_noise = int(np.sum(~clustered_mask))

print(f"DBSCAN Clusters found: {n_clusters_dbscan}")
print(f"Noise points: {n_noise:,} ({n_noise/len(dbscan_labels)*100:.1f}%)")

# Score only the clustered points: passing the -1 labels to silhouette_score
# would treat all noise points as one extra cluster and distort the score.
# silhouette_score also needs fewer distinct labels than samples.
if n_clusters_dbscan > 1 and int(clustered_mask.sum()) > n_clusters_dbscan:
    dbscan_silhouette = silhouette_score(
        X_scaled[clustered_mask], dbscan_labels[clustered_mask]
    )
    print(f"DBSCAN Silhouette Score: {dbscan_silhouette:.4f}")
else:
    # -1 is the "not available" sentinel the summary and save cells test for
    dbscan_silhouette = -1
    print("DBSCAN Silhouette Score: N/A (too few clusters)")


In [None]:
# Collect the clustering metrics into one table; a DBSCAN silhouette of -1
# (the "not available" marker) is shown as NaN.
dbscan_score_for_table = np.nan if dbscan_silhouette == -1 else dbscan_silhouette
clustering_results = pd.DataFrame({
    'Model': ['K-Means', 'DBSCAN'],
    'Clusters': [n_clusters, n_clusters_dbscan],
    'Silhouette Score': [kmeans_silhouette, dbscan_score_for_table],
    'Noise Points': [0, n_noise],
})

print("\n" + "=" * 60, "CLUSTERING RESULTS SUMMARY", "=" * 60, sep="\n")
display(clustering_results)

# Two panels: cluster sizes for K-Means (left) and DBSCAN (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
kmeans_ax, dbscan_ax = axes

# Left panel: one bar per K-Means cluster
kmeans_counts = pd.Series(kmeans_labels).value_counts().sort_index()
kmeans_positions = range(len(kmeans_counts))
kmeans_ax.bar(kmeans_positions, kmeans_counts.values, color='steelblue', alpha=0.7)
kmeans_ax.set_xlabel('Cluster')
kmeans_ax.set_ylabel('Number of Customers')
kmeans_ax.set_title(f'K-Means Cluster Distribution (Silhouette: {kmeans_silhouette:.3f})')
kmeans_ax.set_xticks(kmeans_positions)
kmeans_ax.set_xticklabels([f'C{i}' for i in kmeans_counts.index])

# Right panel: one bar per DBSCAN cluster, plus a trailing gray "Noise" bar
# when the label -1 is present.
dbscan_counts = pd.Series(dbscan_labels).value_counts().sort_index()
cluster_counts = dbscan_counts[dbscan_counts.index != -1]
values = list(cluster_counts.values)
labels = [f'C{i}' for i in cluster_counts.index]
colors = ['coral'] * len(cluster_counts)
if -1 in dbscan_counts.index:
    values.append(dbscan_counts[-1])
    labels.append('Noise')
    colors.append('gray')
x_pos = list(range(len(values)))

dbscan_ax.bar(x_pos, values, color=colors, alpha=0.7)
dbscan_ax.set_xlabel('Cluster')
dbscan_ax.set_ylabel('Number of Customers')
dbscan_ax.set_title(f'DBSCAN Cluster Distribution (Clusters: {n_clusters_dbscan}, Noise: {n_noise})')
dbscan_ax.set_xticks(x_pos)
dbscan_ax.set_xticklabels(labels, rotation=45)

plt.tight_layout()
plt.show()


## 4. Forecasting Models - Traffic Prediction


In [None]:
# Prepare time series data for forecasting: find a datetime column and a
# numeric value column, then aggregate the values per calendar day into a
# two-column frame (`ds`, `y`). `ts_df` stays None when no series can be built.
print("=" * 60)
print("FORECASTING MODELS")
print("=" * 60)

ts_df = None

# Detect a datetime column, tz-naive or tz-aware
datetime_cols = list(df.select_dtypes(include=['datetime64', 'datetimetz']).columns)
if not datetime_cols:
    # No datetime dtype present: try to parse a conventionally named column.
    # Unparseable entries become NaT instead of raising.
    for col in ['TIMESTAMP', 'timestamp', 'date', 'time']:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce')
            datetime_cols = [col]
            break

if datetime_cols:
    datetime_col = datetime_cols[0]
    print(f"\nDatetime column: {datetime_col}")

    # Use the first numeric column as the value to forecast
    # NOTE(review): confirm the first numeric column is the intended traffic measure
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    value_col = numeric_cols[0] if len(numeric_cols) > 0 else None

    # Compare against None explicitly: a falsy column label such as 0 is valid
    if value_col is not None:
        print(f"Value column: {value_col}")

        # The column is already datetime here (native dtype or parsed above),
        # so no second conversion is needed before sorting.
        df_sorted = df.sort_values(datetime_col)

        # Aggregate by calendar day; rows with NaT are dropped by groupby
        daily_totals = df_sorted.groupby(df_sorted[datetime_col].dt.date)[value_col].sum().reset_index()
        daily_totals.columns = ['ds', 'y']
        daily_totals['ds'] = pd.to_datetime(daily_totals['ds'])

        if daily_totals.empty:
            print("No valid timestamps found for forecasting")
        else:
            ts_df = daily_totals
            print(f"\nTime series shape: {ts_df.shape}")
            print(f"Date range: {ts_df['ds'].min()} to {ts_df['ds'].max()}")
            print(f"Total days: {(ts_df['ds'].max() - ts_df['ds'].min()).days}")
    else:
        print("No suitable value column found for forecasting")
else:
    print("No datetime column found for forecasting")


In [None]:
# ARIMA(1, 1, 1) on the daily series; needs more than 50 observations.
# On any failure the metrics are NaN and `arima_success` is False.
arima_has_data = ts_df is not None and len(ts_df) > 50
if not arima_has_data:
    print("Insufficient data for ARIMA forecasting")
    arima_success = False
    arima_rmse = arima_mae = np.nan
else:
    try:
        from statsmodels.tsa.arima.model import ARIMA

        print("\n" + "-" * 60, "Training ARIMA...", "-" * 60, sep="\n")

        # Chronological split: first 80% to fit, remainder to evaluate
        train_size = int(len(ts_df) * 0.8)
        train, test = np.split(ts_df['y'].values, [train_size])

        arima_model = ARIMA(train, order=(1, 1, 1))
        arima_fitted = arima_model.fit()

        # Forecast exactly as many steps as there are held-out points
        forecast = arima_fitted.forecast(steps=len(test))

        arima_rmse = np.sqrt(mean_squared_error(test, forecast))
        arima_mae = mean_absolute_error(test, forecast)

        print(f"ARIMA RMSE: {arima_rmse:.4f}")
        print(f"ARIMA MAE: {arima_mae:.4f}")

        arima_success = True
    except Exception as e:
        # Covers a missing statsmodels install as well as fitting errors
        print(f"ARIMA training failed: {e}")
        arima_success = False
        arima_rmse = arima_mae = np.nan


In [None]:
# Train Prophet model (if time series data available); needs more than 50
# observations. On any failure the metrics are NaN and `prophet_success` is False.
if ts_df is not None and len(ts_df) > 50:
    try:
        from prophet import Prophet

        print("\n" + "-" * 60)
        print("Training Prophet...")
        print("-" * 60)

        # Chronological split: first 80% to fit, remainder to evaluate
        train_size = int(len(ts_df) * 0.8)
        train = ts_df[:train_size].copy()
        test = ts_df[train_size:].copy()

        # Fit Prophet on the (`ds`, `y`) training frame
        prophet_model = Prophet()
        prophet_model.fit(train)

        # Predict at the actual held-out dates. Taking the tail of
        # make_future_dataframe(periods=len(test)) assumes consecutive days,
        # so it misaligns with the test rows when the daily series has gaps.
        forecast_df = prophet_model.predict(test[['ds']])

        forecasted = forecast_df['yhat'].values
        actual = test['y'].values

        # Calculate metrics
        prophet_rmse = np.sqrt(mean_squared_error(actual, forecasted))
        prophet_mae = mean_absolute_error(actual, forecasted)

        print(f"Prophet RMSE: {prophet_rmse:.4f}")
        print(f"Prophet MAE: {prophet_mae:.4f}")

        prophet_success = True
    except Exception as e:
        # Covers a missing prophet install as well as fitting errors
        print(f"Prophet training failed: {e}")
        prophet_success = False
        prophet_rmse = np.nan
        prophet_mae = np.nan
else:
    print("Insufficient data for Prophet forecasting")
    prophet_success = False
    prophet_rmse = np.nan
    prophet_mae = np.nan


In [None]:
# Tabulate and plot the forecasting errors; skipped when no series was built
if ts_df is None:
    print("Forecasting models not trained (insufficient time series data)")
else:
    forecasting_results = pd.DataFrame(
        [
            ('ARIMA', arima_rmse, arima_mae),
            ('Prophet', prophet_rmse, prophet_mae),
        ],
        columns=['Model', 'RMSE', 'MAE'],
    )

    print("\n" + "=" * 60, "FORECASTING RESULTS SUMMARY", "=" * 60, sep="\n")
    display(forecasting_results)

    # One panel per error metric; models whose metric is NaN are left out,
    # and a panel stays empty when no model produced that metric.
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    for ax, metric in zip(axes, ('RMSE', 'MAE')):
        metric_data = forecasting_results[forecasting_results[metric].notna()]
        if metric_data.empty:
            continue
        ax.bar(metric_data['Model'], metric_data[metric],
               color=['steelblue', 'coral'], alpha=0.7)
        ax.set_ylabel(metric)
        ax.set_title(f'Forecasting Model {metric} Comparison')
        # Place each label 2% of the tallest bar above its own bar
        label_offset = max(metric_data[metric]) * 0.02
        for i, v in enumerate(metric_data[metric]):
            ax.text(i, v + label_offset, f'{v:.2f}',
                    ha='center', fontweight='bold')

    plt.tight_layout()
    plt.show()


## 5. Save Models and Results


In [None]:
# Output locations under the project root
MODEL_DIR = project_root / "models"
RESULTS_DIR = project_root / "results"

# Create each base directory followed by its per-task subdirectories
directory_layout = (
    (MODEL_DIR, ("classification", "clustering", "forecasting", "preprocessing")),
    (RESULTS_DIR, ("classification", "clustering", "forecasting")),
)
for base_dir, subdir_names in directory_layout:
    base_dir.mkdir(exist_ok=True)
    for subdir_name in subdir_names:
        (base_dir / subdir_name).mkdir(exist_ok=True)

print("Saving models and results...")

# Classification models, plus the label encoder when the target was encoded
classification_dir = MODEL_DIR / "classification"
for file_name, fitted_model in (
    ("zone_rf.pkl", rf_model),
    ("baseline_dt.pkl", dt_model),
    ("zone_xgb.pkl", xgb_model),
):
    joblib.dump(fitted_model, classification_dir / file_name)
if 'le' in locals():
    joblib.dump(le, MODEL_DIR / "preprocessing" / "encoder.pkl")
print("✓ Classification models saved")

# Clustering models and the scaler they were fitted with
joblib.dump(kmeans_model, MODEL_DIR / "clustering" / "kmeans.pkl")
joblib.dump(dbscan_model, MODEL_DIR / "clustering" / "dbscan.pkl")
joblib.dump(scaler, MODEL_DIR / "preprocessing" / "scaler.pkl")
print("✓ Clustering models saved")

# Forecasting models are saved only when their training cell succeeded
if 'arima_success' in locals() and arima_success:
    joblib.dump(arima_fitted, MODEL_DIR / "forecasting" / "arima.pkl")
    print("✓ ARIMA model saved")
if 'prophet_success' in locals() and prophet_success:
    joblib.dump(prophet_model, MODEL_DIR / "forecasting" / "prophet_model.pkl")
    print("✓ Prophet model saved")

# Metrics are cast to plain Python numbers so they serialise to JSON
classification_metrics = {
    'random_forest': {'accuracy': float(rf_accuracy), 'roc_auc': float(rf_roc_auc)},
    'decision_tree': {'accuracy': float(dt_accuracy)},
    'xgboost': {'accuracy': float(xgb_accuracy), 'roc_auc': float(xgb_roc_auc)},
}

# A DBSCAN silhouette of -1 marks "not available" and is stored as null
dbscan_score_for_json = None if dbscan_silhouette == -1 else float(dbscan_silhouette)
clustering_metrics = {
    'kmeans': {
        'n_clusters': int(n_clusters),
        'silhouette_score': float(kmeans_silhouette),
    },
    'dbscan': {
        'n_clusters': int(n_clusters_dbscan),
        'silhouette_score': dbscan_score_for_json,
        'n_noise': int(n_noise),
    },
}

(RESULTS_DIR / "classification" / "metrics.json").write_text(
    json.dumps(classification_metrics, indent=2)
)
print("✓ Classification metrics saved")

(RESULTS_DIR / "clustering" / "silhouette_score.json").write_text(
    json.dumps(clustering_metrics, indent=2)
)
print("✓ Clustering metrics saved")

if ts_df is not None:
    # Keep only the forecasting models that trained successfully
    forecasting_candidates = (
        ('arima', arima_success, arima_rmse, arima_mae),
        ('prophet', prophet_success, prophet_rmse, prophet_mae),
    )
    forecasting_metrics = {
        model_name: {'rmse': float(rmse_value), 'mae': float(mae_value)}
        for model_name, trained, rmse_value, mae_value in forecasting_candidates
        if trained
    }

    if forecasting_metrics:
        (RESULTS_DIR / "forecasting" / "rmse.json").write_text(
            json.dumps(forecasting_metrics, indent=2)
        )
        print("✓ Forecasting metrics saved")

print("\n" + "=" * 60, "ALL MODELS AND RESULTS SAVED!", "=" * 60, sep="\n")


In [None]:
# Final recap: print the headline metric of every model trained in this notebook
print("=" * 60)
print("MODELING EXPERIMENTS SUMMARY")
print("=" * 60)

print("\n1. Classification Models:")
print(f"   - Random Forest: Accuracy = {rf_accuracy:.4f}, ROC-AUC = {rf_roc_auc:.4f}")
print(f"   - Decision Tree: Accuracy = {dt_accuracy:.4f}")
print(f"   - XGBoost: Accuracy = {xgb_accuracy:.4f}, ROC-AUC = {xgb_roc_auc:.4f}")

# The best classifier is the one with the highest test accuracy
best_classification = max([
    ('Random Forest', rf_accuracy),
    ('Decision Tree', dt_accuracy),
    ('XGBoost', xgb_accuracy)
], key=lambda x: x[1])
print(f"   → Best Model: {best_classification[0]} ({best_classification[1]:.4f})")

print("\n2. Clustering Models:")
print(f"   - K-Means: {n_clusters} clusters, Silhouette = {kmeans_silhouette:.4f}")
# Build the DBSCAN score text first: a conditional placed after the ':' of an
# f-string field becomes part of the format spec and raises
# "Invalid format specifier" at runtime. -1 marks "not available".
dbscan_silhouette_text = f"{dbscan_silhouette:.4f}" if dbscan_silhouette != -1 else "N/A"
print(f"   - DBSCAN: {n_clusters_dbscan} clusters, Silhouette = {dbscan_silhouette_text}")

# Forecasting results are listed only for models that trained successfully
if ts_df is not None:
    print("\n3. Forecasting Models:")
    if arima_success:
        print(f"   - ARIMA: RMSE = {arima_rmse:.4f}, MAE = {arima_mae:.4f}")
    if prophet_success:
        print(f"   - Prophet: RMSE = {prophet_rmse:.4f}, MAE = {prophet_mae:.4f}")

print("\n4. Next Steps:")
print("   - Review model performance metrics")
print("   - Tune hyperparameters for better performance")
print("   - Use models in Streamlit dashboard")
print("   - Deploy models via API")
print("   - Monitor model performance over time")

print("\n" + "=" * 60)
print("Experiments Complete!")
print("=" * 60)
