# Exploratory Data Analysis (EDA)
## Mall Movement Tracking Dataset

This notebook performs comprehensive exploratory data analysis on the processed mall movement tracking dataset.


In [None]:
# Import libraries (standard library first, then third-party)
import sys
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Add parent directory to path so the project-local `streamlit_app` package
# imported below can be resolved.
# NOTE(review): this assumes the kernel's working directory sits one level
# below the project root - confirm for your launch setup.
project_root = str(Path().resolve().parent)
if project_root not in sys.path:
    # Guarded so that re-running this cell does not append duplicate entries.
    sys.path.append(project_root)

# Import data loader (must stay after the sys.path update above)
from streamlit_app.utils.data_loader import load_processed_data

# Set visualization style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
# Suppresses every warning category for the rest of the kernel session;
# comment this line out when debugging unexpected results.
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")


## 1. Load Data


In [1]:
# Load the processed dataset via the project's loader.
# Requires the import cell at the top of the notebook to have been run first,
# otherwise `load_processed_data` is undefined in the kernel.
df = load_processed_data()

# Assemble the size/memory report line by line, then emit it in one call.
summary_lines = [
    f"Dataset shape: {df.shape}",
    f"Number of rows: {len(df):,}",
    f"Number of columns: {len(df.columns)}",
    # Leading "\n" leaves a blank line before the memory figure (bytes -> MB).
    f"\nMemory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB",
]
print("\n".join(summary_lines))


NameError: name 'load_processed_data' is not defined

In [None]:
# Visualize domain-specific patterns in a 2x2 figure:
#   top-left     - 15 most frequent values of the zone column
#   top-right    - histogram of record counts per user
#   bottom-left  - heatmap of record counts for the 10 most frequent zones
#                  against the 10 most frequent users
#   bottom-right - histogram of record counts per zone
# `df`, `zone_cols` and `user_cols` are not defined in this cell; they must
# already exist in the kernel namespace from earlier cells.
if zone_cols and user_cols:
    zone_col = zone_cols[0]
    user_col = user_cols[0]

    # Count records once per column and reuse the results for every panel
    # instead of recomputing value_counts() for each one.
    zone_visit_counts = df[zone_col].value_counts()
    user_activity = df[user_col].value_counts()

    fig, axes = plt.subplots(2, 2, figsize=(16, 10))

    # Zone popularity
    zone_counts = zone_visit_counts.head(15)
    axes[0, 0].barh(range(len(zone_counts)), zone_counts.values, color='steelblue')
    axes[0, 0].set_yticks(range(len(zone_counts)))
    axes[0, 0].set_yticklabels(zone_counts.index)
    axes[0, 0].set_xlabel('Visit Count')
    axes[0, 0].set_title('Top 15 Most Popular Zones')
    # value_counts() sorts descending; invert so the largest bar is on top.
    axes[0, 0].invert_yaxis()

    # User activity distribution
    axes[0, 1].hist(user_activity.values, bins=50, color='coral', edgecolor='black')
    axes[0, 1].set_xlabel('Number of Records per User')
    axes[0, 1].set_ylabel('Number of Users')
    axes[0, 1].set_title('User Activity Distribution')
    axes[0, 1].grid(True, alpha=0.3)

    # Zone-user heatmap (top zones and users)
    top_zones = zone_visit_counts.head(10).index
    top_users = user_activity.head(10).index
    df_filtered = df[df[zone_col].isin(top_zones) & df[user_col].isin(top_users)]
    heatmap_data = df_filtered.groupby([zone_col, user_col]).size().unstack(fill_value=0)
    if heatmap_data.empty:
        # The top users may have no records in the top zones (or df may be
        # empty); drawing a heatmap of an empty frame would raise and abort
        # the whole figure, so show a placeholder note instead.
        axes[1, 0].text(
            0.5, 0.5, 'No overlapping records',
            ha='center', va='center', transform=axes[1, 0].transAxes,
        )
    else:
        sns.heatmap(heatmap_data, ax=axes[1, 0], cmap='YlOrRd', cbar_kws={'label': 'Visit Count'})
    axes[1, 0].set_title('Zone-User Interaction Heatmap (Top 10)')
    axes[1, 0].set_xlabel('User')
    axes[1, 0].set_ylabel('Zone')

    # Visits per zone distribution
    visits_per_zone = df.groupby(zone_col).size()
    axes[1, 1].hist(visits_per_zone.values, bins=30, color='lightgreen', edgecolor='black')
    axes[1, 1].set_xlabel('Number of Visits per Zone')
    axes[1, 1].set_ylabel('Number of Zones')
    axes[1, 1].set_title('Distribution of Visits per Zone')
    axes[1, 1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()
else:
    # Make the skip visible instead of silently producing no output.
    print("Skipping domain-specific plots: zone_cols and/or user_cols is empty.")
