In [0]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

# Plot styling: stock matplotlib style sheet plus seaborn's "husl" palette
plt.style.use('default')
sns.set_palette("husl")

# ============================================================================
# DATA LOADING AND PREPARATION
# ============================================================================

# Source table in the Databricks workspace (`spark` is the notebook's session)
df = spark.table("workspace.default.uber_rides_enhanced")

# Materialise the Spark DataFrame as a pandas frame for plotting.
# This collects every row to the driver, so it only suits smaller datasets;
# for larger ones keep working with the Spark DataFrame instead.
uber_df = df.toPandas()

# Make sure both timestamp columns are pandas datetimes
for ts_col in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    uber_df[ts_col] = pd.to_datetime(uber_df[ts_col])

pickup_ts = uber_df['tpep_pickup_datetime']
dropoff_ts = uber_df['tpep_dropoff_datetime']

# Derived columns: trip length in minutes and calendar parts of the pickup time
uber_df['trip_duration'] = (dropoff_ts - pickup_ts).dt.total_seconds() / 60
uber_df['pickup_hour'] = pickup_ts.dt.hour
uber_df['pickup_day'] = pickup_ts.dt.day_name()
uber_df['pickup_month'] = pickup_ts.dt.month_name()

print(f"Dataset shape: {uber_df.shape}")
print(f"Date range: {uber_df['tpep_pickup_datetime'].min()} to {uber_df['tpep_pickup_datetime'].max()}")

Dataset shape: (98574, 34)
Date range: 2016-03-01 00:00:00 to 2016-03-10 14:27:09


In [0]:
# ============================================================================
# CHART 1: TRIP DISTANCE DISTRIBUTION
# ============================================================================

def create_trip_distance_chart():
    """Build a histogram of the ``trip_distance`` column of ``uber_df``.

    Reads the module-level ``uber_df`` frame; takes no arguments.

    Returns:
        The plotly figure holding a single histogram trace (``nbinsx=50``).
        The figure is returned without being displayed.
    """
    histogram = go.Histogram(
        x=uber_df['trip_distance'],
        nbinsx=50,
        name='Trip Distance',
        marker_color='skyblue',
        opacity=0.7,
    )

    layout_options = dict(
        title='Distribution of Trip Distances',
        xaxis_title='Trip Distance (miles)',
        yaxis_title='Frequency',
        showlegend=False,
        template='plotly_white',
        height=400,
    )

    fig = go.Figure()
    fig.add_trace(histogram)
    fig.update_layout(**layout_options)
    return fig

distance_chart = create_trip_distance_chart()
distance_chart.show()

In [0]:
# ============================================================================
# CHART 2: RIDES BY HOUR OF DAY
# ============================================================================

def create_hourly_rides_chart():
    """Build a line chart of ride counts per pickup hour.

    Counts the rows of the module-level ``uber_df`` for each value of
    ``pickup_hour`` and plots the counts as a line with markers.

    The counts are reindexed onto all 24 hours (0-23) with a fill value of 0.
    Without that step an hour with no pickups would be absent from the
    grouped result and the line would be drawn straight across the gap
    instead of dropping to zero.

    Returns:
        The plotly figure holding one ``lines+markers`` scatter trace.
        The figure is returned without being displayed.
    """
    hourly_rides = (
        uber_df.groupby('pickup_hour').size()
        .reindex(range(24), fill_value=0)   # hours with no rides -> 0
        .rename_axis('pickup_hour')         # keep the column name stable
        .reset_index(name='ride_count')
    )

    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=hourly_rides['pickup_hour'],
        y=hourly_rides['ride_count'],
        mode='lines+markers',
        name='Rides per Hour',
        line=dict(color='orange', width=3),
        marker=dict(size=8)
    ))

    fig.update_layout(
        title='Uber Rides by Hour of Day',
        xaxis_title='Hour of Day',
        yaxis_title='Number of Rides',
        template='plotly_white',
        height=400
    )

    return fig

hourly_chart = create_hourly_rides_chart()
hourly_chart.show()

In [0]:
# ============================================================================
# CHART 3: PASSENGER COUNT ANALYSIS
# ============================================================================

def create_passenger_count_chart():
    """Build a bar chart of how many rides carried each passenger count.

    Tallies the ``passenger_count`` column of the module-level ``uber_df``
    and orders the bars by passenger count. Each bar is labelled with its
    ride total.

    Returns:
        The plotly figure holding a single bar trace.
        The figure is returned without being displayed.
    """
    # value_counts() orders by frequency; sort_index() puts bars in count order
    rides_per_count = uber_df['passenger_count'].value_counts().sort_index()
    counts = rides_per_count.index
    totals = rides_per_count.values

    bars = go.Bar(
        x=counts,
        y=totals,
        name='Passenger Count',
        marker_color='lightcoral',
        text=totals,
        textposition='auto',
    )

    fig = go.Figure()
    fig.add_trace(bars)
    fig.update_layout(
        title='Distribution of Passenger Count',
        xaxis_title='Number of Passengers',
        yaxis_title='Number of Rides',
        template='plotly_white',
        height=400,
    )
    return fig

passenger_chart = create_passenger_count_chart()
passenger_chart.show()

In [0]:
# ============================================================================
# CHART 4: TRIP DURATION VS DISTANCE SCATTER PLOT
# ============================================================================

def create_duration_distance_scatter():
    """Build a scatter plot of trip duration against trip distance.

    Uses the module-level ``uber_df``. Rows are kept only when the trip
    duration is strictly between 0 and 120 minutes and the trip distance is
    below 50 miles. Points are coloured and sized by ``passenger_count`` and
    show ``pickup_hour`` on hover.

    Returns:
        The plotly express figure. It is returned without being displayed.
    """
    duration = uber_df['trip_duration']
    distance = uber_df['trip_distance']

    # Drop outliers so a few extreme trips don't stretch the axes
    plausible_trip = (
        (duration > 0)
        & (duration < 120)   # under 2 hours
        & (distance < 50)    # under 50 miles
    )
    filtered_df = uber_df[plausible_trip]

    axis_labels = {
        'trip_distance': 'Trip Distance (miles)',
        'trip_duration': 'Trip Duration (minutes)',
        'passenger_count': 'Passengers'
    }

    fig = px.scatter(
        filtered_df,
        x='trip_distance',
        y='trip_duration',
        color='passenger_count',
        size='passenger_count',
        hover_data=['pickup_hour'],
        title='Trip Duration vs Distance',
        labels=axis_labels,
    )
    fig.update_layout(template='plotly_white', height=500)
    return fig

scatter_chart = create_duration_distance_scatter()
scatter_chart.show()

In [0]:
# ============================================================================
# CHART 5: VENDOR COMPARISON
# ============================================================================

def create_vendor_comparison():
    """Build a three-panel bar chart comparing per-vendor averages.

    Groups the module-level ``uber_df`` by ``VendorID`` and takes the mean of
    trip distance, trip duration and passenger count, rounded to two
    decimals. Each metric gets its own subplot in a single row.

    Returns:
        The plotly figure with one bar trace per subplot.
        The figure is returned without being displayed.
    """
    # (source column, trace name, bar colour), in left-to-right panel order
    panels = [
        ('trip_distance', 'Distance', 'blue'),
        ('trip_duration', 'Duration', 'red'),
        ('passenger_count', 'Passengers', 'green'),
    ]

    mean_of_each = {column: 'mean' for column, _, _ in panels}
    vendor_stats = uber_df.groupby('VendorID').agg(mean_of_each).round(2)
    vendors = vendor_stats.index

    fig = make_subplots(
        rows=1, cols=3,
        subplot_titles=('Avg Trip Distance', 'Avg Trip Duration', 'Avg Passengers'),
        specs=[[{"secondary_y": False}, {"secondary_y": False}, {"secondary_y": False}]]
    )

    for position, (column, label, colour) in enumerate(panels, start=1):
        bars = go.Bar(x=vendors, y=vendor_stats[column], name=label, marker_color=colour)
        fig.add_trace(bars, row=1, col=position)

    fig.update_layout(
        title='Vendor Performance Comparison',
        template='plotly_white',
        height=400,
        showlegend=False
    )

    return fig

vendor_chart = create_vendor_comparison()
vendor_chart.show()

In [0]:
# ============================================================================
# CHART 6: PICKUP HEATMAP BY DAY AND HOUR
# ============================================================================

def create_pickup_heatmap():
    """Build a day-of-week by hour-of-day heatmap of pickup counts.

    Counts the rows of the module-level ``uber_df`` for every
    (``pickup_day``, ``pickup_hour``) pair and pivots them into a grid with
    one row per weekday (Monday first) and one column per hour 0-23.

    The grid is reindexed onto the full weekday list and all 24 hours
    *before* missing cells are filled with 0. Filling first and reindexing
    afterwards would reintroduce NaN rows for any weekday absent from the
    data, which the heatmap renders as blank cells rather than zero pickups.

    Returns:
        The plotly figure holding a single heatmap trace.
        The figure is returned without being displayed.
    """
    day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

    # Long-format counts -> wide grid (days x hours)
    heatmap_data = uber_df.groupby(['pickup_day', 'pickup_hour']).size().reset_index(name='ride_count')
    heatmap_pivot = heatmap_data.pivot(index='pickup_day', columns='pickup_hour', values='ride_count')

    # Fix the axes first, then fill: every missing day/hour combination,
    # including whole missing days or hours, becomes an explicit 0.
    heatmap_pivot = heatmap_pivot.reindex(index=day_order, columns=range(24)).fillna(0)

    fig = go.Figure(data=go.Heatmap(
        z=heatmap_pivot.values,
        x=heatmap_pivot.columns,
        y=heatmap_pivot.index,
        colorscale='Viridis',
        showscale=True
    ))

    fig.update_layout(
        title='Pickup Patterns: Day of Week vs Hour',
        xaxis_title='Hour of Day',
        yaxis_title='Day of Week',
        template='plotly_white',
        height=400
    )

    return fig

heatmap_chart = create_pickup_heatmap()
heatmap_chart.show()

In [0]:
# ============================================================================
# COMPREHENSIVE DASHBOARD SUMMARY
# ============================================================================

def create_summary_dashboard():
    """Print a text summary of headline metrics for ``uber_df``.

    Reports the ride count, mean and total trip distance, mean trip
    duration, and the most frequent pickup hour and passenger count (first
    mode of each column). Output goes to stdout; nothing is returned.
    """
    rule = "=" * 60
    distances = uber_df['trip_distance']

    # Headline numbers, computed before anything is printed
    total_rides = len(uber_df)
    avg_distance = distances.mean()
    avg_duration = uber_df['trip_duration'].mean()
    total_distance = distances.sum()

    print(rule)
    print("UBER RIDES ANALYTICS DASHBOARD SUMMARY")
    print(rule)
    print(f"Total Rides: {total_rides:,}")
    print(f"Average Trip Distance: {avg_distance:.2f} miles")
    print(f"Average Trip Duration: {avg_duration:.2f} minutes")
    print(f"Total Distance Covered: {total_distance:,.2f} miles")
    # mode() can return several values on a tie; [0] takes the first one
    print(f"Most Popular Pickup Hour: {uber_df['pickup_hour'].mode()[0]}:00")
    print(f"Most Common Passenger Count: {uber_df['passenger_count'].mode()[0]}")
    print(rule)

create_summary_dashboard()

UBER RIDES ANALYTICS DASHBOARD SUMMARY
Total Rides: 98,574
Average Trip Distance: 3.05 miles
Average Trip Duration: 16.98 minutes
Total Distance Covered: 301,033.23 miles
Most Popular Pickup Hour: 8:00
Most Common Passenger Count: 1


In [0]:
# ============================================================================
# SPARK SQL QUERIES FOR ADVANCED ANALYTICS
# ============================================================================

section_rule = "=" * 60
print("\n" + section_rule)
print("ADVANCED ANALYTICS WITH SPARK SQL")
print(section_rule)

# Expose the Spark DataFrame to SQL under the name `uber_rides`
df.createOrReplaceTempView("uber_rides")

# --- Query 1: the ten trips with the largest trip_distance -----------------
longest_trips_sql = """
    SELECT VendorID, trip_distance, passenger_count, 
           tpep_pickup_datetime, tpep_dropoff_datetime
    FROM uber_rides 
    ORDER BY trip_distance DESC 
    LIMIT 10
"""
print("\nTop 10 Longest Trips:")
longest_trips = spark.sql(longest_trips_sql)
longest_trips.show()

# --- Query 2: mean distance and trip count per (vendor, passenger count) ---
avg_by_vendor_passenger_sql = """
    SELECT VendorID, passenger_count, 
           ROUND(AVG(trip_distance), 2) as avg_distance,
           COUNT(*) as trip_count
    FROM uber_rides 
    GROUP BY VendorID, passenger_count
    ORDER BY VendorID, passenger_count
"""
print("\nAverage Trip Distance by Vendor and Passenger Count:")
avg_by_vendor_passenger = spark.sql(avg_by_vendor_passenger_sql)
avg_by_vendor_passenger.show()

# --- Query 3: busiest pickup areas ------------------------------------------
# Areas are approximated by rounding latitude/longitude to two decimals and
# counting pickups per rounded coordinate pair.
busiest_locations_sql = """
    SELECT ROUND(pickup_latitude, 2) as lat_area, 
           ROUND(pickup_longitude, 2) as lon_area,
           COUNT(*) as pickup_count
    FROM uber_rides 
    GROUP BY ROUND(pickup_latitude, 2), ROUND(pickup_longitude, 2)
    ORDER BY pickup_count DESC
    LIMIT 10
"""
print("\nBusiest Pickup Areas (Rounded Coordinates):")
busiest_locations = spark.sql(busiest_locations_sql)
busiest_locations.show()


ADVANCED ANALYTICS WITH SPARK SQL

Top 10 Longest Trips:
+--------+-------------+---------------+--------------------+---------------------+
|VendorID|trip_distance|passenger_count|tpep_pickup_datetime|tpep_dropoff_datetime|
+--------+-------------+---------------+--------------------+---------------------+
|       1|        184.4|              2| 2016-03-01 06:14:14|  2016-03-01 07:07:41|
|       1|        160.8|              1| 2016-03-01 01:02:58|  2016-03-01 03:54:52|
|       2|        55.01|              1| 2016-03-10 13:59:29|  2016-03-10 16:25:37|
|       2|        51.27|              2| 2016-03-10 10:18:12|  2016-03-10 18:32:00|
|       2|        49.56|              1| 2016-03-10 11:37:01|  2016-03-10 13:54:55|
|       2|        48.18|              1| 2016-03-10 08:59:54|  2016-03-10 12:11:54|
|       2|         47.0|              1| 2016-03-10 13:33:43|  2016-03-10 14:35:05|
|       2|        45.68|              4| 2016-03-10 07:57:54|  2016-03-10 09:24:29|
|       2|        

In [0]:
# ============================================================================
# CREATING DATABRICKS DASHBOARD WIDGETS
# ============================================================================

print("\n" + "=" * 60)
print("DATABRICKS DASHBOARD SETUP")
print("=" * 60)

# Create widgets for interactive filtering
dbutils.widgets.dropdown("vendor_filter", "All", ["All", "1", "2"], "Select Vendor")
dbutils.widgets.text("min_distance", "0", "Minimum Trip Distance")
dbutils.widgets.multiselect("passenger_count_filter", "All", ["All", "1", "2", "3", "4", "5", "6"], "Passenger Count")

# Get widget values
vendor_filter = dbutils.widgets.get("vendor_filter")

# The text widget accepts arbitrary input and trip distances are fractional
# miles, so parse as float (int() would reject "2.5"). A blank box is treated
# as "no minimum" instead of crashing the cell.
min_distance_raw = dbutils.widgets.get("min_distance").strip()
try:
    min_distance = float(min_distance_raw) if min_distance_raw else 0.0
except ValueError:
    raise ValueError(
        f"Minimum Trip Distance must be a number, got {min_distance_raw!r}"
    ) from None

# NOTE(review): per the Databricks docs a multiselect value comes back as one
# comma-separated string; split it before using it as a filter. TODO confirm.
passenger_filter = dbutils.widgets.get("passenger_count_filter")

print("Applied Filters:")
print(f"- Vendor: {vendor_filter}")
# :g keeps whole numbers compact ("0" rather than "0.0")
print(f"- Minimum Distance: {min_distance:g}")
print(f"- Passenger Count: {passenger_filter}")


DATABRICKS DASHBOARD SETUP
Applied Filters:
- Vendor: All
- Minimum Distance: 0
- Passenger Count: All


In [0]:
# ============================================================================
# EXPORT FUNCTIONS FOR DASHBOARD CREATION
# ============================================================================

def save_charts_for_dashboard(output_dir="/dbfs/tmp"):
    """
    Save all charts as HTML files that can be embedded in Databricks dashboards

    Rebuilds each of the six charts by calling its ``create_*`` function and
    writes it to ``<output_dir>/<name>_chart.html``, printing one line per
    file saved.

    Args:
        output_dir: Directory to write the HTML files into. Defaults to
            ``/dbfs/tmp``, the location this function originally hard-coded.
            The directory is created if it does not exist yet.

    Raises:
        OSError: If the directory cannot be created or a file cannot be
            written.
    """
    import os  # local import so this cell does not depend on the import cell

    charts = {
        'trip_distance': create_trip_distance_chart(),
        'hourly_rides': create_hourly_rides_chart(),
        'passenger_count': create_passenger_count_chart(),
        'duration_distance': create_duration_distance_scatter(),
        'vendor_comparison': create_vendor_comparison(),
        'pickup_heatmap': create_pickup_heatmap()
    }

    # write_html does not create parent directories, so make sure it exists
    os.makedirs(output_dir, exist_ok=True)

    for name, chart in charts.items():
        path = os.path.join(output_dir, f"{name}_chart.html")
        chart.write_html(path)
        print(f"Saved {name} chart to {path}")

# Uncomment to save charts
# save_charts_for_dashboard()

# Closing banner followed by the manual follow-up steps
closing_rule = "=" * 60
next_steps = (
    "1. Create a new Databricks Dashboard",
    "2. Add these visualizations as dashboard tiles",
    "3. Configure filters using the widgets created above",
    "4. Share the dashboard with your team",
)

print("\n" + closing_rule)
print("DASHBOARD CREATION COMPLETE!")
print(closing_rule)
print("Next Steps:")
for step in next_steps:
    print(step)
print(closing_rule)


DASHBOARD CREATION COMPLETE!
Next Steps:
1. Create a new Databricks Dashboard
2. Add these visualizations as dashboard tiles
3. Configure filters using the widgets created above
4. Share the dashboard with your team
