# Stage 08: Exploratory Data Analysis (EDA)

## Overview
This notebook performs comprehensive exploratory data analysis on the bike demand dataset to understand:
- Dataset structure and distributions
- Relationships between variables
- Data patterns and outliers
- Statistical summaries and insights

## Dataset Description
- **Source**: `../data/sample-data.csv` — bike demand data with hourly observations
- **Target Variable**: `demand` - Number of bike rentals
- **Features**: `hour`, `temperature`, `humidity`, `day_of_week`
- **Time Period**: Hourly observations spanning multiple days

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
# Suppress every warning category so library notices do not clutter cell output.
# NOTE(review): this is a blanket filter; it also hides warnings that may matter.
warnings.filterwarnings(action='ignore')

# Plot appearance, applied in this order: matplotlib style sheet, seaborn
# colour palette, then the default figure size.
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')
plt.rcParams.update({'figure.figsize': (12, 8)})

print("📊 EDA Libraries Loaded Successfully")

In [None]:
# Read the sample CSV (path is relative to the notebook's working directory)
# into the `data` frame that every later cell works on.
data = pd.read_csv('../data/sample-data.csv')

# Report the frame's shape and column names in a single call, one item per line.
print(
    "📈 Dataset Loaded",
    f"Shape: {data.shape}",
    f"Columns: {data.columns.tolist()}",
    "\nFirst 5 rows:",
    sep="\n",
)
# Last expression of the cell: the notebook renders the first five rows.
data.head(5)

## 1. Dataset Structure and Basic Information

In [None]:
# Structural overview of `data`: size, memory footprint, dtypes, missing values.
obs_count, col_count = data.shape
# deep=True also counts the memory held by object (e.g. string) values.
mem_kb = data.memory_usage(deep=True).sum() / 1024

print("📋 Dataset Information")
print("=" * 50)
print(f"Number of observations: {obs_count}")
print(f"Number of features: {col_count}")
print(f"Memory usage: {mem_kb:.2f} KB")

print("\n📊 Data Types:")
print(data.dtypes)

# Per-column count of null entries, followed by the grand total.
missing_values = data.isnull().sum()
print("\n🔍 Missing Values:")
print(missing_values)
print(f"Total missing values: {missing_values.sum()}")

## 2. Statistical Summaries

In [None]:
# Descriptive statistics as produced by DataFrame.describe()
print("📈 Descriptive Statistics")
print("=" * 50)
summary_stats = data.describe()
print(summary_stats)

# Shape and spread measures that describe() does not report, computed per
# numeric column.
print("\n📊 Additional Statistics:")
numeric_frame = data.select_dtypes(include=[np.number])
col_max = numeric_frame.max()
col_min = numeric_frame.min()
upper_quartile = numeric_frame.quantile(0.75)
lower_quartile = numeric_frame.quantile(0.25)
additional_stats = pd.DataFrame({
    'Skewness': numeric_frame.skew(),
    'Kurtosis': numeric_frame.kurtosis(),
    'Range': col_max - col_min,
    'IQR': upper_quartile - lower_quartile,
})
print(additional_stats)

## 3. Distribution Analysis

In [None]:
# Distribution plots for all numeric variables: one histogram + KDE per column,
# laid out two per row, each annotated with mean / median / std / skew.
numeric_cols = data.select_dtypes(include=[np.number]).columns
n_cols = len(numeric_cols)
# Two plots per row; keep at least one row so plt.subplots() always receives a
# valid grid, even if there are no numeric columns.
n_rows = max(1, (n_cols + 1) // 2)

# squeeze=False makes plt.subplots() return a 2-D array of Axes for every grid
# size, so flatten() gives a flat sequence of Axes for one row as well as many.
fig, axes = plt.subplots(n_rows, 2, figsize=(15, 4*n_rows), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, numeric_cols):
    series = data[col]

    # Histogram with KDE
    sns.histplot(series, kde=True, ax=ax, alpha=0.7)
    ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

    # Add statistics text (position is in axes-fraction coordinates)
    mean_val = series.mean()
    median_val = series.median()
    std_val = series.std()
    skew_val = series.skew()

    stats_text = f'Mean: {mean_val:.2f}\nMedian: {median_val:.2f}\nStd: {std_val:.2f}\nSkew: {skew_val:.2f}'
    ax.text(0.7, 0.7, stats_text, transform=ax.transAxes,
            bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

# Remove the grid cells that received no plot (slicing by n_cols is safe even
# when the loop above never ran).
for unused_ax in axes[n_cols:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.suptitle('📊 Variable Distributions', fontsize=16, fontweight='bold', y=1.02)
plt.show()

## 4. Box Plots for Outlier Detection

In [None]:
# Box plots for outlier detection: one box plot per numeric column, annotated
# with the number of points outside the 1.5*IQR fences.
# The grid is sized from the number of numeric columns (two plots per row) so
# that every column gets its own Axes regardless of how many there are.
box_count = len(numeric_cols)
box_rows = max(1, (box_count + 1) // 2)

# squeeze=False guarantees a 2-D array of Axes, so flatten() is always valid.
fig, axes = plt.subplots(box_rows, 2, figsize=(15, 5*box_rows), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, numeric_cols):
    sns.boxplot(y=data[col], ax=ax)
    ax.set_title(f'Box Plot: {col}', fontsize=12, fontweight='bold')
    ax.set_ylabel(col)

    # Calculate and display outlier statistics (1.5*IQR rule)
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = data.loc[(data[col] < lower_bound) | (data[col] > upper_bound), col]

    ax.text(0.02, 0.98, f'Outliers: {len(outliers)}', transform=ax.transAxes,
            bbox=dict(boxstyle='round', facecolor='yellow', alpha=0.7),
            verticalalignment='top')

# Remove the grid cells that received no plot (odd number of columns).
for unused_ax in axes[box_count:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.suptitle('📦 Box Plots for Outlier Detection', fontsize=16, fontweight='bold', y=1.02)
plt.show()

## 5. Correlation Analysis

In [None]:
# Correlation matrix
# Restrict to numeric columns: correlation is only defined for numeric data,
# and pandas >= 2.0 raises when DataFrame.corr() meets a text or categorical
# column (for example a derived label column added to `data` by another cell).
correlation_matrix = data.select_dtypes(include=[np.number]).corr()

# Create correlation heatmap; the mask hides the upper triangle (including the
# diagonal) because the matrix is symmetric.
plt.figure(figsize=(10, 8))
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(correlation_matrix, mask=mask, annot=True, cmap='RdYlBu_r', center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": .8})
plt.title('🔗 Correlation Matrix', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# Print strongest correlations: absolute correlation with the target, largest
# first, skipping the target's correlation with itself.
print("🔗 Strongest Correlations with Demand:")
demand_corr = correlation_matrix['demand'].abs().sort_values(ascending=False)
for var, corr in demand_corr.items():
    if var != 'demand':
        print(f"{var}: {corr:.3f}")

## 6. Scatter Plot Analysis

In [None]:
# Scatter plots of features vs demand: one panel per numeric feature, each with
# a fitted straight trend line and the correlation coefficient in the title.
feature_cols = [col for col in numeric_cols if col != 'demand']
n_features = len(feature_cols)
# Two plots per row; keep at least one row so plt.subplots() always receives a
# valid grid.
n_rows = max(1, (n_features + 1) // 2)

# squeeze=False makes plt.subplots() return a 2-D array of Axes for every grid
# size, so flatten() gives a flat sequence of Axes for one row as well as many.
fig, axes = plt.subplots(n_rows, 2, figsize=(15, 4*n_rows), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, feature_cols):
    sns.scatterplot(data=data, x=col, y='demand', ax=ax, alpha=0.7)

    # Add trend line, fitted only on rows where both values are present
    # (np.polyfit cannot work with NaN); a line needs at least two points.
    pair = data[[col, 'demand']].dropna()
    if len(pair) >= 2:
        z = np.polyfit(pair[col], pair['demand'], 1)
        p = np.poly1d(z)
        ax.plot(pair[col], p(pair[col]), "r--", alpha=0.8)

    # Calculate and display correlation
    corr = data[col].corr(data['demand'])
    ax.set_title(f'{col} vs Demand (r={corr:.3f})', fontsize=12, fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Demand')

# Remove the grid cells that received no plot (slicing by n_features is safe
# even when the loop above never ran).
for unused_ax in axes[n_features:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.suptitle('📈 Feature vs Demand Relationships', fontsize=16, fontweight='bold', y=1.02)
plt.show()

## 7. Time Series Analysis

In [None]:
# Time series analysis by hour: a 2x2 figure with demand by hour, demand by day
# of week, demand and temperature against row order, and humidity by demand
# quartile.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Demand by hour: mean line with a +/- one standard deviation band
hourly_demand = data.groupby('hour')['demand'].agg(['mean', 'std', 'count']).reset_index()
axes[0,0].plot(hourly_demand['hour'], hourly_demand['mean'], marker='o', linewidth=2)
axes[0,0].fill_between(hourly_demand['hour'], 
                       hourly_demand['mean'] - hourly_demand['std'],
                       hourly_demand['mean'] + hourly_demand['std'], alpha=0.3)
axes[0,0].set_title('🕐 Average Demand by Hour', fontsize=12, fontweight='bold')
axes[0,0].set_xlabel('Hour')
axes[0,0].set_ylabel('Average Demand')
axes[0,0].grid(True, alpha=0.3)

# Demand by day of week: mean bars with standard deviation error bars
daily_demand = data.groupby('day_of_week')['demand'].agg(['mean', 'std', 'count']).reset_index()
axes[0,1].bar(daily_demand['day_of_week'], daily_demand['mean'], 
              yerr=daily_demand['std'], capsize=5, alpha=0.7)
axes[0,1].set_title('📅 Average Demand by Day of Week', fontsize=12, fontweight='bold')
axes[0,1].set_xlabel('Day of Week')
axes[0,1].set_ylabel('Average Demand')
axes[0,1].grid(True, alpha=0.3)

# Temperature vs Demand over time: the x axis is the row index of `data`;
# temperature is drawn on a second y axis sharing that x axis.
axes[1,0].scatter(data.index, data['demand'], alpha=0.6, label='Demand', s=30)
ax2 = axes[1,0].twinx()
ax2.plot(data.index, data['temperature'], color='red', alpha=0.7, label='Temperature')
axes[1,0].set_title('🌡️ Demand and Temperature Over Time', fontsize=12, fontweight='bold')
axes[1,0].set_xlabel('Observation Index')
axes[1,0].set_ylabel('Demand', color='blue')
ax2.set_ylabel('Temperature', color='red')
axes[1,0].grid(True, alpha=0.3)

# Humidity distribution by demand quartiles.
# The quartile labels are kept in a standalone Series instead of being written
# into `data`: adding a categorical column to the shared frame would change the
# column counts reported elsewhere and break whole-frame numeric operations
# such as data.corr().
demand_quartile = pd.qcut(data['demand'], 4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
sns.boxplot(x=demand_quartile, y=data['humidity'], ax=axes[1,1])
axes[1,1].set_title('💧 Humidity by Demand Quartiles', fontsize=12, fontweight='bold')
axes[1,1].set_xlabel('Demand Quartile')
axes[1,1].set_ylabel('Humidity')

plt.tight_layout()
plt.suptitle('⏰ Time Series and Temporal Patterns', fontsize=16, fontweight='bold', y=1.02)
plt.show()

## 8. Key Insights and Findings

In [None]:
# Generate key insights: a printed summary whose statements are all computed
# from `data`, so the text cannot disagree with the dataset actually loaded.
print("🔍 KEY INSIGHTS FROM EXPLORATORY DATA ANALYSIS")
print("=" * 60)

# 1. Dataset characteristics
print("\n📊 Dataset Characteristics:")
print(f"• Total observations: {len(data)}")
print(f"• Features: {len(data.columns)} ({', '.join(data.columns)})")
missing_total = int(data.isnull().sum().sum())
# Share of all cells that are missing; guard against an empty frame.
missing_pct = missing_total / data.size * 100 if data.size else 0.0
print(f"• Missing values: {missing_total} ({missing_pct:.1f}%)")
if missing_total == 0:
    print("• Data quality: Clean dataset with no missing values")
else:
    print("• Data quality: Missing values present - handle them before modeling")

# 2. Target variable analysis
print("\n🎯 Target Variable (Demand) Analysis:")
demand_stats = data['demand'].describe()
print(f"• Range: {demand_stats['min']:.0f} - {demand_stats['max']:.0f}")
print(f"• Mean: {demand_stats['mean']:.1f}, Median: {demand_stats['50%']:.1f}")
print(f"• Standard deviation: {demand_stats['std']:.1f}")
demand_skew = data['demand'].skew()
# Label the skewness by magnitude (common rule of thumb: 0.5 and 1.0 cut-offs).
if pd.isna(demand_skew):
    skew_label = "undefined"
elif abs(demand_skew) < 0.5:
    skew_label = "approximately symmetric"
elif abs(demand_skew) < 1.0:
    skew_label = "moderately skewed"
else:
    skew_label = "highly skewed"
print(f"• Skewness: {demand_skew:.3f} ({skew_label})")

# 3. Feature relationships
print("\n🔗 Feature Relationships:")
# Numeric columns only: DataFrame.corr() raises on text/categorical columns in
# pandas >= 2.0. The series still contains 'demand' itself (correlation 1.0).
corr_with_demand = (
    data.select_dtypes(include=[np.number])
    .corr()['demand']
    .abs()
    .sort_values(ascending=False)
)
for feature, corr in corr_with_demand.items():
    if feature != 'demand':
        strength = "Strong" if corr > 0.7 else "Moderate" if corr > 0.4 else "Weak"
        print(f"• {feature}: {corr:.3f} ({strength} correlation)")

# 4. Temporal patterns (each group mean is computed once and reused)
print("\n⏰ Temporal Patterns:")
hourly_mean = data.groupby('hour')['demand'].mean()
peak_hour = hourly_mean.idxmax()
peak_demand = hourly_mean.max()
print(f"• Peak demand hour: {peak_hour}:00 (avg demand: {peak_demand:.0f})")

weekday_mean = data.groupby('day_of_week')['demand'].mean()
best_day = weekday_mean.idxmax()
best_day_demand = weekday_mean.max()
print(f"• Best performing day: Day {best_day} (avg demand: {best_day_demand:.0f})")

# 5. Outlier analysis (values outside the 1.5*IQR fences, per numeric column)
print("\n📦 Outlier Analysis:")
for col in numeric_cols:
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    outliers = data[(data[col] < Q1 - 1.5*IQR) | (data[col] > Q3 + 1.5*IQR)]
    print(f"• {col}: {len(outliers)} outliers ({len(outliers)/len(data)*100:.1f}%)")

## 9. Recommendations for Feature Engineering

In [None]:
# Static feature-engineering recommendations: a banner followed by five
# numbered sections of three bullets each, printed one line at a time in order.
recommendation_lines = (
    "\n🛠️ RECOMMENDATIONS FOR FEATURE ENGINEERING",
    "=" * 60,

    "\n1. 🕐 Temporal Features:",
    "   • Create hour categories (morning, afternoon, evening, night)",
    "   • Add weekend/weekday indicator",
    "   • Consider cyclical encoding for hour (sin/cos transformation)",

    "\n2. 🌡️ Weather Interaction Features:",
    "   • Temperature-humidity interaction term",
    "   • Weather comfort index combining temp and humidity",
    "   • Temperature categories (cold, mild, warm, hot)",

    "\n3. 📈 Derived Features:",
    "   • Rolling averages for demand (3-hour, 6-hour windows)",
    "   • Lag features (previous hour demand)",
    "   • Rate of change in temperature and humidity",

    "\n4. 🎯 Target Engineering:",
    "   • Consider log transformation if needed for normality",
    "   • Create demand categories for classification tasks",
    "   • Demand per capita or normalized metrics",

    "\n5. 🔧 Preprocessing Steps:",
    "   • Standard scaling for continuous variables",
    "   • One-hot encoding for categorical variables",
    "   • Handle any future missing values with median imputation",
)

for line in recommendation_lines:
    print(line)

## 10. Save Results and Summary

In [None]:
# Save EDA summary to file: writes the descriptive statistics and correlation
# matrix as CSV and a timestamped plain-text summary of the key findings, all
# under ../data/processed.
import os
from datetime import datetime

# Create results directory if it doesn't exist
os.makedirs('../data/processed', exist_ok=True)

# Save statistical summary
summary_stats.to_csv('../data/processed/eda_statistical_summary.csv')
correlation_matrix.to_csv('../data/processed/eda_correlation_matrix.csv')

# Save insights summary. The clock is read once so the file name and the
# "Analysis Date" line inside the file refer to the same instant.
analysis_time = datetime.now()
timestamp = analysis_time.strftime('%Y%m%d_%H%M%S')
insights_file = f'../data/processed/eda_insights_{timestamp}.txt'

# Strongest predictor: the largest absolute correlation with demand, with the
# target itself excluded explicitly rather than assumed to sit at position 0.
predictors = corr_with_demand.drop(labels='demand', errors='ignore').dropna()
if predictors.empty:
    strongest_line = "• Strongest predictor: n/a\n"
else:
    strongest_line = f"• Strongest predictor: {predictors.idxmax()} (r={predictors.max():.3f})\n"

# Data-quality and outlier findings are computed from `data` so the saved
# summary reflects the dataset actually analysed.
missing_count = int(data.isnull().sum().sum())
if missing_count == 0:
    quality_line = "• Data quality: Clean, no missing values\n"
else:
    quality_line = f"• Data quality: {missing_count} missing values\n"

numeric_values = data.select_dtypes(include=[np.number])
first_quartile = numeric_values.quantile(0.25)
third_quartile = numeric_values.quantile(0.75)
quartile_range = third_quartile - first_quartile
# Count values outside the 1.5*IQR fences, column by column.
outlier_counts = (
    (numeric_values < first_quartile - 1.5 * quartile_range)
    | (numeric_values > third_quartile + 1.5 * quartile_range)
).sum()
outlier_line = (
    f"• Outliers (1.5*IQR rule): {int(outlier_counts.sum())} values "
    f"in {int((outlier_counts > 0).sum())} of {len(outlier_counts)} numeric variables\n"
)

# Explicit UTF-8: the summary contains non-ASCII bullet characters, and the
# platform default encoding is not guaranteed to be able to represent them.
with open(insights_file, 'w', encoding='utf-8') as f:
    f.write("EXPLORATORY DATA ANALYSIS SUMMARY\n")
    f.write("=" * 50 + "\n\n")
    f.write(f"Analysis Date: {analysis_time.strftime('%Y-%m-%d %H:%M:%S')}\n")
    f.write(f"Dataset: Bike Demand Data\n")
    f.write(f"Observations: {len(data)}\n")
    f.write(f"Features: {len(data.columns)}\n\n")

    f.write("KEY FINDINGS:\n")
    f.write(strongest_line)
    f.write(f"• Peak demand hour: {peak_hour}:00\n")
    f.write(quality_line)
    f.write(outlier_line)

print(f"✅ EDA Results Saved:")
print(f"   • Statistical summary: ../data/processed/eda_statistical_summary.csv")
print(f"   • Correlation matrix: ../data/processed/eda_correlation_matrix.csv")
print(f"   • Insights summary: {insights_file}")
print(f"\n🎯 EDA Complete! Ready for Feature Engineering stage.")