# Data Exploration & Metrics - NYC Mobility & Weather Analytics

This notebook explores the transformed data and metrics to answer key questions:
- How does weather impact mobility patterns?
- What are the peak hours for different transportation modes?
- How do yellow taxis compare to CitiBikes?
- What are the most popular pickup/dropoff locations?

**Data Coverage:**
- **Time Period:** September - November 2024 (3 months)
- **Trip Records:** ~12.5M trips
- **Transportation Modes:** Yellow Taxi, FHV (Uber/Lyft), CitiBike
- **Weather Data:** Hourly temperature, precipitation, wind

---
## Setup

In [None]:
import duckdb
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime
import numpy as np

# Set style for visualizations
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.2f}'.format)

# Project paths
PROJECT_ROOT = Path.cwd().parent
DB_PATH = PROJECT_ROOT / "data" / "nyc_mobility.duckdb"

print(f"üìç Database: {DB_PATH}")
print(f"{'‚úÖ' if DB_PATH.exists() else '‚ùå'} Database exists")
if DB_PATH.exists():
    print(f"üìä Size: {DB_PATH.stat().st_size / 1024**3:.2f} GB")

# Connect to DuckDB
conn = duckdb.connect(str(DB_PATH), read_only=True)
print("\n‚úÖ Connected to DuckDB")

---
## 1. Data Overview

Let's start by understanding what data we have available.

In [None]:
# List all available tables, schema by schema, together with their row counts.
print("üìã Available Data Tables\n")
print("="*80)

schemas = ['raw_data', 'core', 'core_core']


def _quote_ident(name):
    """Return `name` as a double-quoted SQL identifier, doubling embedded quotes."""
    return '"' + name.replace('"', '""') + '"'


for schema in schemas:
    print(f"\n{schema.upper()} Schema:")

    # Step 1: fetch the table names. The schema is passed as a bound parameter
    # rather than interpolated into the SQL text.
    table_rows = conn.execute(
        """
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = ?
        ORDER BY table_name
        """,
        [schema],
    ).fetchall()

    # Step 2: count each table with its own query. SQL cannot resolve a table
    # name that is computed per row, so the count cannot be expressed as a
    # correlated subquery over information_schema.
    for (table,) in table_rows:
        qualified_name = f"{_quote_ident(schema)}.{_quote_ident(table)}"
        count = conn.execute(f"SELECT COUNT(*) FROM {qualified_name}").fetchone()[0]
        print(f"  {table:40} {count:>15,} rows")

### Data Quality Summary

In [None]:
# Overall data quality metrics for the trip fact table: volume, key
# uniqueness, weather/location coverage and the covered time span.
#
# COUNT(col) counts only non-NULL values, which is what the coverage metrics
# need. Unlike SUM over a CASE expression it yields an integer column
# (presumably the SUM came back as float64 - confirm against DuckDB's type
# mapping), so the values take the thousands-separator branch below.
# NULLIF keeps the percentages NULL instead of dividing by zero on an empty table.
quality_metrics = conn.execute("""
    SELECT
        COUNT(*) as total_trips,
        COUNT(DISTINCT trip_key) as unique_keys,
        COUNT(*) - COUNT(DISTINCT trip_key) as duplicate_keys,
        COUNT(weather_key) as trips_with_weather,
        ROUND(100.0 * COUNT(weather_key) / NULLIF(COUNT(*), 0), 4) as weather_coverage_pct,
        COUNT(location_key) as trips_with_location,
        ROUND(100.0 * COUNT(location_key) / NULLIF(COUNT(*), 0), 4) as location_coverage_pct,
        MIN(pickup_datetime) as earliest_trip,
        MAX(pickup_datetime) as latest_trip
    FROM core_core.fct_trips
""").fetchdf()

print("üìä Data Quality Summary\n")
print("="*80)

# One line per metric: percentages with four decimals, integer counts with
# thousands separators, everything else (timestamps) as-is.
for col in quality_metrics.columns:
    val = quality_metrics[col].iloc[0]
    if 'pct' in col:
        print(f"{col:30} {val:.4f}%")
    elif isinstance(val, (int, np.integer)):
        print(f"{col:30} {val:,}")
    else:
        print(f"{col:30} {val}")

# Calculate time span
earliest = quality_metrics['earliest_trip'].iloc[0]
latest = quality_metrics['latest_trip'].iloc[0]
if pd.isna(earliest) or pd.isna(latest):
    # MIN/MAX are NULL when the fact table has no rows; strftime would fail on NaT.
    print("\nNo trips found - the covered time span cannot be computed")
else:
    days = (latest - earliest).days
    print(f"\nData spans {days} days ({earliest.strftime('%Y-%m-%d')} to {latest.strftime('%Y-%m-%d')})")

---
## 2. Transportation Mode Analysis

Compare different modes of transportation (Yellow Taxi, FHV, CitiBike)

In [None]:
# Per-mode summary: trip volume plus average distance, duration and fare.
mode_stats = conn.execute("""
    SELECT
        trip_type,
        COUNT(*) as trip_count,
        AVG(trip_distance_miles) as avg_distance,
        AVG(trip_duration_minutes) as avg_duration,
        AVG(fare_amount) as avg_fare,
        SUM(fare_amount) as total_revenue
    FROM core_core.fct_trips
    GROUP BY trip_type
    ORDER BY trip_count DESC
""").fetchdf()

print("üöï Transportation Mode Statistics\n")
display(mode_stats)

# Two panels side by side: raw volume (left) and per-trip averages (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
count_ax, avg_ax = axes
mode_labels = mode_stats['trip_type']

# Left panel: one bar per mode, annotated with the count in millions
count_ax.bar(mode_labels, mode_stats['trip_count'])
count_ax.set_title('Trip Count by Transportation Mode', fontsize=14, fontweight='bold')
count_ax.set_ylabel('Number of Trips')
count_ax.tick_params(axis='x', rotation=45)
for position, trips in enumerate(mode_stats['trip_count']):
    count_ax.text(position, trips, f'{trips/1e6:.1f}M', ha='center', va='bottom')

# Right panel: three grouped bars per mode, shifted left / centred / right
positions = np.arange(len(mode_stats))
bar_width = 0.25
grouped_bars = [
    (positions - bar_width, 'avg_distance', 'Distance (mi)'),
    (positions, 'avg_duration', 'Duration (min)'),
    (positions + bar_width, 'avg_fare', 'Fare ($)'),
]
for bar_positions, column, label in grouped_bars:
    avg_ax.bar(bar_positions, mode_stats[column], bar_width, label=label)
avg_ax.set_title('Average Metrics by Mode', fontsize=14, fontweight='bold')
avg_ax.set_xticks(positions)
avg_ax.set_xticklabels(mode_labels, rotation=45)
avg_ax.legend()

plt.tight_layout()
plt.show()

---
## 3. Temporal Patterns

Analyze how mobility patterns change over time

In [None]:
# Trip counts per hour of day and transportation mode.
hourly_pattern = conn.execute("""
    SELECT
        t.hour_of_day,
        ft.trip_type,
        COUNT(*) as trip_count
    FROM core_core.fct_trips ft
    JOIN core_core.dim_time t ON ft.time_key = t.time_key
    GROUP BY t.hour_of_day, ft.trip_type
    ORDER BY t.hour_of_day, ft.trip_type
""").fetchdf()

# Wide layout (rows = hour, columns = mode) so each mode becomes one line
hourly_pivot = hourly_pattern.pivot(
    index='hour_of_day',
    columns='trip_type',
    values='trip_count',
)

# One line per mode across the hours of the day
fig, ax = plt.subplots(figsize=(14, 6))
hourly_pivot.plot(ax=ax, linewidth=2)
ax.set_title('Hourly Trip Patterns by Transportation Mode', fontsize=14, fontweight='bold')
ax.set(xlabel='Hour of Day', ylabel='Number of Trips')
ax.grid(True, alpha=0.3)
ax.legend(title='Mode', loc='best')
plt.tight_layout()
plt.show()

# Report the busiest hour of each mode together with its trip count
print("\nüïê Peak Hours by Mode:\n")
for mode, hourly_counts in hourly_pivot.items():
    print(f"{mode:15} Peak at {hourly_counts.idxmax():02d}:00 with {hourly_counts.max():,} trips")

In [None]:
# Trip counts per weekday and transportation mode.
dow_pattern = conn.execute("""
    SELECT
        d.day_of_week_name,
        d.day_of_week,
        ft.trip_type,
        COUNT(*) as trip_count
    FROM core_core.fct_trips ft
    JOIN core_core.dim_date d ON ft.date_key = d.date_key
    GROUP BY d.day_of_week_name, d.day_of_week, ft.trip_type
    ORDER BY d.day_of_week, ft.trip_type
""").fetchdf()

# Calendar order for the x-axis (pivot would otherwise sort the names alphabetically)
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Wide layout: rows = weekday name, columns = mode
dow_counts = dow_pattern.pivot(
    index='day_of_week_name',
    columns='trip_type',
    values='trip_count',
)
# Keep only the weekday names that actually occur, in calendar order
days_present = [day for day in day_order if day in dow_counts.index]
dow_pivot = dow_counts.reindex(days_present)

fig, ax = plt.subplots(figsize=(12, 6))
dow_pivot.plot.bar(ax=ax)
ax.set_title('Trip Patterns by Day of Week', fontsize=14, fontweight='bold')
ax.set(xlabel='Day of Week', ylabel='Number of Trips')
ax.tick_params(axis='x', rotation=45)
ax.legend(title='Mode')
plt.tight_layout()
plt.show()

In [None]:
# Trip counts per calendar date and transportation mode.
daily_trend = conn.execute("""
    SELECT
        d.date_actual,
        ft.trip_type,
        COUNT(*) as trip_count
    FROM core_core.fct_trips ft
    JOIN core_core.dim_date d ON ft.date_key = d.date_key
    GROUP BY d.date_actual, ft.trip_type
    ORDER BY d.date_actual, ft.trip_type
""").fetchdf()

# Wide layout: rows = date, columns = mode, so each mode plots as one series
daily_pivot = daily_trend.pivot(
    index='date_actual',
    columns='trip_type',
    values='trip_count',
)

fig, ax = plt.subplots(figsize=(14, 6))
daily_pivot.plot(ax=ax, alpha=0.7)
ax.set_title('Daily Trip Volume Trend', fontsize=14, fontweight='bold')
ax.set(xlabel='Date', ylabel='Number of Trips')
ax.legend(title='Mode')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

---
## 4. Weather Impact Analysis

How does weather affect mobility patterns?

In [None]:
# Trip counts by weather conditions
# Groups every weather-matched trip by temperature band, rain flag and trip
# type, returning the trip count and mean trip duration per combination.
# The result is kept in `weather_impact`, which the export cell writes to CSV.
#
# NOTE(review): the band labels assume temperature_2m is stored in Fahrenheit -
#   confirm against the weather ingestion.
# NOTE(review): neither CASE has a NULL branch, so a NULL temperature falls
#   into the ELSE band ('Warm') and a NULL precipitation into 'No Rain'.
# NOTE(review): the inner JOIN already drops rows whose weather_key is NULL,
#   so the WHERE clause is redundant (harmless).
weather_impact = conn.execute("""
    SELECT
        CASE
            WHEN w.temperature_2m < 50 THEN 'Cold (<50¬∞F)'
            WHEN w.temperature_2m < 70 THEN 'Mild (50-70¬∞F)'
            ELSE 'Warm (>70¬∞F)'
        END as temp_range,
        CASE
            WHEN w.precipitation > 0 THEN 'Rain'
            ELSE 'No Rain'
        END as precipitation_status,
        ft.trip_type,
        COUNT(*) as trip_count,
        AVG(ft.trip_duration_minutes) as avg_duration
    FROM core_core.fct_trips ft
    JOIN core_core.dim_weather w ON ft.weather_key = w.weather_key
    WHERE ft.weather_key IS NOT NULL
    GROUP BY temp_range, precipitation_status, ft.trip_type
    ORDER BY temp_range, precipitation_status, ft.trip_type
""").fetchdf()

print("üå§Ô∏è Weather Impact on Trips\n")
print("="*80)
# Only the first 20 combinations are displayed; the full frame stays in memory.
display(weather_impact.head(20))

In [None]:
# Trip volume and mean distance per 5-degree temperature bucket.
temp_analysis = conn.execute("""
    SELECT
        ROUND(w.temperature_2m / 5) * 5 as temp_bucket,
        COUNT(*) as trip_count,
        AVG(ft.trip_distance_miles) as avg_distance
    FROM core_core.fct_trips ft
    JOIN core_core.dim_weather w ON ft.weather_key = w.weather_key
    WHERE ft.weather_key IS NOT NULL
    GROUP BY temp_bucket
    ORDER BY temp_bucket
""").fetchdf()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Both panels share the x-axis variable; they differ only in the plotted
# column, the texts and the marker colour (left panel uses the default colour).
scatter_panels = [
    (axes[0], 'trip_count', 'Trip Volume vs Temperature', 'Number of Trips', {}),
    (axes[1], 'avg_distance', 'Average Distance vs Temperature', 'Average Distance (miles)', {'color': 'orange'}),
]
for panel, column, title, y_label, marker_style in scatter_panels:
    panel.scatter(temp_analysis['temp_bucket'], temp_analysis[column], alpha=0.6, s=100, **marker_style)
    panel.set_title(title, fontsize=14, fontweight='bold')
    panel.set_xlabel('Temperature (¬∞F)')
    panel.set_ylabel(y_label)
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Rain impact by mode
# Compares trip volume under rain vs. no rain for each transportation mode.
#
# Raw trip totals are not comparable between the two conditions because rainy
# weather periods are normally far rarer than dry ones, so the impact figure
# is computed on trips per weather period: each condition's trip count is
# divided by the number of distinct weather_key values it covers.
# NOTE(review): assumes each dim_weather row is one equally long observation
#   period (the notebook intro describes the weather data as hourly) - confirm.
# NOTE(review): periods in which a mode had no trips at all are not counted
#   in that mode's denominator.
rain_impact = conn.execute("""
    SELECT
        ft.trip_type,
        CASE WHEN w.precipitation > 0 THEN 'Rain' ELSE 'No Rain' END as weather_condition,
        COUNT(*) as trip_count,
        COUNT(DISTINCT ft.weather_key) as weather_periods,
        AVG(ft.fare_amount) as avg_fare
    FROM core_core.fct_trips ft
    JOIN core_core.dim_weather w ON ft.weather_key = w.weather_key
    WHERE ft.weather_key IS NOT NULL
    GROUP BY ft.trip_type, weather_condition
    ORDER BY ft.trip_type, weather_condition
""").fetchdf()

conditions = ['No Rain', 'Rain']

# Reindexing guarantees both condition columns exist even when the data holds
# only one of them, so the lookups below cannot raise KeyError.
rain_pivot = (
    rain_impact
    .pivot(index='trip_type', columns='weather_condition', values='trip_count')
    .reindex(columns=conditions)
    .fillna(0)
)
periods_pivot = (
    rain_impact
    .pivot(index='trip_type', columns='weather_condition', values='weather_periods')
    .reindex(columns=conditions)
)

# Trips per weather period; NaN where a condition has no periods at all
trips_per_period = rain_pivot / periods_pivot

# Calculate percentage change in trips per weather period (rain relative to no rain)
rain_pivot['Rain_Impact_%'] = (
    (trips_per_period['Rain'] - trips_per_period['No Rain'])
    / trips_per_period['No Rain'] * 100
)

print("üåßÔ∏è Rain Impact by Transportation Mode\n")
print("="*80)
display(rain_pivot)

# Visualize the raw trip totals per condition
fig, ax = plt.subplots(figsize=(10, 6))
rain_pivot[conditions].plot(kind='bar', ax=ax)
ax.set_title('Trip Volume: Rain vs No Rain', fontsize=14, fontweight='bold')
ax.set_ylabel('Number of Trips')
ax.set_xlabel('Transportation Mode')
ax.tick_params(axis='x', rotation=45)
ax.legend(conditions)
plt.tight_layout()
plt.show()

---
## 5. Location Analysis

Top pickup and dropoff locations

In [None]:
# The 15 busiest zones by trip count, with their average fare.
top_pickups = conn.execute("""
    SELECT
        l.zone_name,
        l.borough,
        COUNT(*) as trip_count,
        AVG(ft.fare_amount) as avg_fare
    FROM core_core.fct_trips ft
    JOIN core_core.dim_location l ON ft.location_key = l.location_key
    WHERE ft.location_key IS NOT NULL
    GROUP BY l.zone_name, l.borough
    ORDER BY trip_count DESC
    LIMIT 15
""").fetchdf()

print("üìç Top 15 Pickup Locations\n")
print("="*80)
display(top_pickups)

# Horizontal bar chart, busiest zone on top, bars coloured by borough
borough_palette = {'Manhattan': '#FF6B6B', 'Brooklyn': '#4ECDC4'}
fallback_color = '#95E1D3'  # every borough other than Manhattan / Brooklyn
bar_colors = [borough_palette.get(borough, fallback_color) for borough in top_pickups['borough']]
zone_labels = [
    f"{zone} ({borough})"
    for zone, borough in zip(top_pickups['zone_name'], top_pickups['borough'])
]
bar_positions = range(len(top_pickups))

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(bar_positions, top_pickups['trip_count'], color=bar_colors)
ax.set_yticks(bar_positions)
ax.set_yticklabels(zone_labels, fontsize=9)
ax.set_xlabel('Number of Trips')
ax.set_title('Top 15 Pickup Locations', fontsize=14, fontweight='bold')
ax.invert_yaxis()  # first (largest) row at the top
plt.tight_layout()
plt.show()

In [None]:
# Trip volume, mean distance and mean fare per borough (NULL boroughs excluded).
borough_stats = conn.execute("""
    SELECT
        l.borough,
        COUNT(*) as trip_count,
        AVG(ft.trip_distance_miles) as avg_distance,
        AVG(ft.fare_amount) as avg_fare
    FROM core_core.fct_trips ft
    JOIN core_core.dim_location l ON ft.location_key = l.location_key
    WHERE ft.location_key IS NOT NULL AND l.borough IS NOT NULL
    GROUP BY l.borough
    ORDER BY trip_count DESC
""").fetchdf()

print("üèôÔ∏è Trips by Borough\n")
display(borough_stats)

# Left: share of trips per borough; right: average fare per borough
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
share_ax, fare_ax = axes
borough_names = borough_stats['borough']

share_ax.pie(
    borough_stats['trip_count'],
    labels=borough_names,
    autopct='%1.1f%%',
    startangle=90,
)
share_ax.set_title('Trip Distribution by Borough', fontsize=14, fontweight='bold')

fare_ax.bar(borough_names, borough_stats['avg_fare'])
fare_ax.set_title('Average Fare by Borough', fontsize=14, fontweight='bold')
fare_ax.set_ylabel('Average Fare ($)')
fare_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

---
## 6. MetricFlow Metrics

Query the semantic layer metrics defined in dbt

In [None]:
# List available metrics
# Asks the MetricFlow CLI (run through poetry inside the project's dbt
# directory) for the defined metrics. This step is optional: any failure is
# reported and the notebook carries on with direct fact-table queries.
import subprocess

print("üìä Querying MetricFlow Metrics\n")
print("="*80)

# List metrics
launch_error = None
try:
    result = subprocess.run(
        ["poetry", "run", "mf", "list", "metrics"],
        cwd=PROJECT_ROOT / "dbt",
        capture_output=True,
        text=True
    )
except OSError as exc:
    # Raised when the `poetry` executable or the dbt directory does not exist;
    # treat it like any other "MetricFlow unavailable" outcome instead of
    # aborting the notebook run.
    result = None
    launch_error = exc

if result is not None and result.returncode == 0:
    print("Available Metrics:")
    print(result.stdout)
else:
    print("‚ö†Ô∏è Could not list metrics. MetricFlow may not be configured.")
    # Surface the underlying reason so the failure can be diagnosed
    if launch_error is not None:
        print(f"Reason: {launch_error}")
    elif result.stderr and result.stderr.strip():
        print(result.stderr.strip())
    print("You can still query the fact tables directly!")

In [None]:
# Query core trip metrics directly from fact table
# Per trip type: trip count, summed distance and fare, mean distance /
# duration / fare, and the mean of the per-trip fare-per-mile ratio.
# Results are ordered by trip count, largest first.
#
# NOTE(review): the WHERE clause restricts every column - including
#   total_trips and total_revenue - to trips with a positive distance, so
#   rows with zero, negative or NULL distance are not counted here. Confirm
#   that this is intended for the "total" columns.
# NOTE(review): with that filter in place the NULLIF(..., 0) guard can never
#   trigger; it only matters if the WHERE clause is removed.
trip_metrics = conn.execute("""
    SELECT
        trip_type,
        COUNT(*) as total_trips,
        SUM(trip_distance_miles) as total_distance_miles,
        SUM(fare_amount) as total_revenue,
        AVG(trip_distance_miles) as avg_distance,
        AVG(trip_duration_minutes) as avg_duration,
        AVG(fare_amount) as avg_fare,
        AVG(fare_amount / NULLIF(trip_distance_miles, 0)) as avg_fare_per_mile
    FROM core_core.fct_trips
    WHERE trip_distance_miles > 0
    GROUP BY trip_type
    ORDER BY total_trips DESC
""").fetchdf()

print("üéØ Core Trip Metrics by Mode\n")
print("="*80)
display(trip_metrics)

In [None]:
# Hourly mobility metrics
# Rolls the pre-aggregated rows of fct_hourly_mobility up to one row per hour
# of day: total trips, mean trip distance and mean temperature.
#
# avg_distance is a trip-weighted mean. Averaging the per-row averages
# directly would give a row with a handful of trips the same weight as a row
# with thousands. Rows whose avg_trip_distance is NULL are left out of both
# numerator and denominator; NULLIF avoids a division by zero.
# NOTE(review): assumes total_trips on each row is the number of trips behind
#   that row's avg_trip_distance - confirm against the model definition.
# avg_temp stays an unweighted mean: temperature describes the period, not
#   the individual trips.
hourly_mobility = conn.execute("""
    SELECT
        hour_of_day,
        SUM(total_trips) as total_trips,
        SUM(avg_trip_distance * total_trips)
            / NULLIF(SUM(CASE WHEN avg_trip_distance IS NOT NULL THEN total_trips END), 0) as avg_distance,
        AVG(avg_temperature) as avg_temp
    FROM core_core.fct_hourly_mobility
    GROUP BY hour_of_day
    ORDER BY hour_of_day
""").fetchdf()

print("‚è∞ Hourly Mobility Metrics\n")
print("="*80)
display(hourly_mobility)

# Visualize hourly mobility: volume on top, distance and temperature below
fig, axes = plt.subplots(2, 1, figsize=(14, 10))
hours = hourly_mobility['hour_of_day']

# Trip volume (line with a shaded area underneath)
axes[0].plot(hours, hourly_mobility['total_trips'],
             marker='o', linewidth=2, markersize=6)
axes[0].set_title('Total Trips by Hour', fontsize=14, fontweight='bold')
axes[0].set_ylabel('Number of Trips')
axes[0].grid(True, alpha=0.3)
axes[0].fill_between(hours, hourly_mobility['total_trips'], alpha=0.3)

# Temperature and distance share the x-axis but need separate y-axes
ax2 = axes[1].twinx()
axes[1].plot(hours, hourly_mobility['avg_distance'],
             marker='s', linewidth=2, markersize=6, color='green', label='Avg Distance')
ax2.plot(hours, hourly_mobility['avg_temp'],
         marker='^', linewidth=2, markersize=6, color='orange', label='Avg Temp')
axes[1].set_title('Average Distance and Temperature by Hour', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Hour of Day')
axes[1].set_ylabel('Average Distance (miles)', color='green')
ax2.set_ylabel('Average Temperature (¬∞F)', color='orange')
axes[1].grid(True, alpha=0.3)
axes[1].legend(loc='upper left')
ax2.legend(loc='upper right')

plt.tight_layout()
plt.show()

---
## 7. Advanced Analysis: Weather & Mode Share

How does weather affect the choice of transportation?

In [None]:
# Share of trips taken by each mode, computed separately for rainy and dry weather.
mode_share_weather = conn.execute("""
    WITH trip_counts AS (
        SELECT
            CASE
                WHEN w.precipitation > 0 THEN 'Rain'
                ELSE 'No Rain'
            END as weather_condition,
            ft.trip_type,
            COUNT(*) as trip_count
        FROM core_core.fct_trips ft
        JOIN core_core.dim_weather w ON ft.weather_key = w.weather_key
        WHERE ft.weather_key IS NOT NULL
        GROUP BY weather_condition, ft.trip_type
    ),
    totals AS (
        SELECT
            weather_condition,
            SUM(trip_count) as total_trips
        FROM trip_counts
        GROUP BY weather_condition
    )
    SELECT
        tc.weather_condition,
        tc.trip_type,
        tc.trip_count,
        ROUND(100.0 * tc.trip_count / t.total_trips, 2) as mode_share_pct
    FROM trip_counts tc
    JOIN totals t ON tc.weather_condition = t.weather_condition
    ORDER BY tc.weather_condition, tc.trip_count DESC
""").fetchdf()

print("üöó Mode Share by Weather Condition\n")
print("=" * 80)
display(mode_share_weather)

# Wide layout (rows = mode, columns = weather condition) for a grouped bar chart
mode_share_pivot = mode_share_weather.pivot(index='trip_type', columns='weather_condition', values='mode_share_pct')

fig, ax = plt.subplots(figsize=(10, 6))
mode_share_pivot.plot.bar(ax=ax)
ax.set_title('Mode Share: Rain vs No Rain', fontsize=14, fontweight='bold')
ax.set(ylabel='Mode Share (%)', xlabel='Transportation Mode')
ax.tick_params(axis='x', rotation=45)
ax.legend(title='Weather')
ax.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()

---
## 8. Custom Analysis Examples

Template queries for your own analysis

In [None]:
# Example: Rush hour analysis
# Labels each trip as morning rush, evening rush or off-peak from the hour of
# day of its time dimension row, then reports trip count, mean duration and
# mean fare per period and trip type.
# BETWEEN is inclusive on both ends, so the morning bucket holds hours 7, 8
# and 9 and the evening bucket hours 17, 18 and 19.
# The inner JOIN leaves out trips whose time_key has no match in dim_time.
rush_hour_analysis = conn.execute("""
    SELECT
        CASE
            WHEN t.hour_of_day BETWEEN 7 AND 9 THEN 'Morning Rush (7-9 AM)'
            WHEN t.hour_of_day BETWEEN 17 AND 19 THEN 'Evening Rush (5-7 PM)'
            ELSE 'Off-Peak'
        END as time_period,
        ft.trip_type,
        COUNT(*) as trip_count,
        AVG(ft.trip_duration_minutes) as avg_duration,
        AVG(ft.fare_amount) as avg_fare
    FROM core_core.fct_trips ft
    JOIN core_core.dim_time t ON ft.time_key = t.time_key
    GROUP BY time_period, ft.trip_type
    ORDER BY time_period, trip_count DESC
""").fetchdf()

print("üö¶ Rush Hour vs Off-Peak Analysis\n")
print("="*80)
display(rush_hour_analysis)

In [None]:
# Example: trip volume, mean distance and mean duration, weekend vs weekday, per mode.
weekend_analysis = conn.execute("""
    SELECT
        CASE WHEN d.is_weekend THEN 'Weekend' ELSE 'Weekday' END as day_type,
        ft.trip_type,
        COUNT(*) as trip_count,
        AVG(ft.trip_distance_miles) as avg_distance,
        AVG(ft.trip_duration_minutes) as avg_duration
    FROM core_core.fct_trips ft
    JOIN core_core.dim_date d ON ft.date_key = d.date_key
    GROUP BY day_type, ft.trip_type
    ORDER BY day_type, trip_count DESC
""").fetchdf()

print("üìÖ Weekend vs Weekday Patterns\n")
print("=" * 80)
display(weekend_analysis)

# Wide layout (rows = mode, columns = day type) for a grouped bar chart
weekend_pivot = weekend_analysis.pivot(index='trip_type', columns='day_type', values='trip_count')

fig, ax = plt.subplots(figsize=(10, 6))
weekend_pivot.plot.bar(ax=ax)
ax.set_title('Trip Volume: Weekend vs Weekday', fontsize=14, fontweight='bold')
ax.set(ylabel='Number of Trips', xlabel='Transportation Mode')
ax.tick_params(axis='x', rotation=45)
ax.legend(title='Day Type')
plt.tight_layout()
plt.show()

---
## 9. Data Quality Deep Dive

In [None]:
# Detailed data quality checks
# Builds a two-column (metric, value) report from five independent SELECTs
# glued together with UNION ALL. `value` is text: the first row is the plain
# trip count, every other row is "<count> (<percent>%)" where the percentage
# is that count relative to all rows of the fact table.
# Checks covered: weather key present, location key present, distance <= 0,
# duration <= 0.
# NOTE(review): UNION ALL does not by itself promise an output order - add an
#   explicit ordering column if the row order matters.
quality_checks = conn.execute("""
    SELECT
        'Total Trips' as metric,
        COUNT(*)::VARCHAR as value
    FROM core_core.fct_trips
    
    UNION ALL
    
    SELECT
        'Trips with Weather Data' as metric,
        COUNT(*)::VARCHAR || ' (' || ROUND(100.0 * COUNT(*) / (SELECT COUNT(*) FROM core_core.fct_trips), 2)::VARCHAR || '%)' as value
    FROM core_core.fct_trips
    WHERE weather_key IS NOT NULL
    
    UNION ALL
    
    SELECT
        'Trips with Location Data' as metric,
        COUNT(*)::VARCHAR || ' (' || ROUND(100.0 * COUNT(*) / (SELECT COUNT(*) FROM core_core.fct_trips), 2)::VARCHAR || '%)' as value
    FROM core_core.fct_trips
    WHERE location_key IS NOT NULL
    
    UNION ALL
    
    SELECT
        'Trips with Invalid Distance (<=0)' as metric,
        COUNT(*)::VARCHAR || ' (' || ROUND(100.0 * COUNT(*) / (SELECT COUNT(*) FROM core_core.fct_trips), 2)::VARCHAR || '%)' as value
    FROM core_core.fct_trips
    WHERE trip_distance_miles <= 0
    
    UNION ALL
    
    SELECT
        'Trips with Invalid Duration (<=0)' as metric,
        COUNT(*)::VARCHAR || ' (' || ROUND(100.0 * COUNT(*) / (SELECT COUNT(*) FROM core_core.fct_trips), 2)::VARCHAR || '%)' as value
    FROM core_core.fct_trips
    WHERE trip_duration_minutes <= 0
""").fetchdf()

print("üîç Data Quality Deep Dive\n")
print("="*80)
display(quality_checks)

---
## 10. Export Analysis Results

Save key findings for reporting

In [None]:
# Write the key result tables to <project root>/outputs as CSV files.
export_dir = PROJECT_ROOT / "outputs"
export_dir.mkdir(exist_ok=True)

# (file name, DataFrame, write the index?) - only the hourly pivot keeps its
# index, because the hour of day lives there rather than in a column.
export_plan = [
    ("mode_statistics.csv", mode_stats, False),       # mode statistics
    ("hourly_patterns.csv", hourly_pivot, True),      # hourly patterns
    ("weather_impact.csv", weather_impact, False),    # weather impact
    ("top_pickup_locations.csv", top_pickups, False), # top locations
]
for file_name, frame, keep_index in export_plan:
    frame.to_csv(export_dir / file_name, index=keep_index)

print(f"‚úÖ Analysis results exported to: {export_dir}")
print("\nExported files:")
# Lists every CSV found in the folder, not only the ones written above
for csv_path in export_dir.glob("*.csv"):
    print(f"  - {csv_path.name}")

---
## Summary

This notebook explored:
- ✅ ~12.5M trip records across 3 transportation modes
- ✅ Temporal patterns (hourly, daily, weekly)
- ✅ Weather impact on mobility
- ✅ Location-based analysis
- ✅ Mode share dynamics
- ✅ Data quality validation

**Next Steps:**
- Customize queries for specific business questions
- Build dashboards using visualization tools (Streamlit, Tableau, etc.)
- Run additional analyses on specific time periods or locations
- Integrate findings into reports or presentations

In [None]:
# Close database connection
# Releases the read-only DuckDB handle opened in the setup cell. Any query
# cell re-run after this point needs the setup cell executed again first.
conn.close()
print("‚úÖ Database connection closed")