## 1. Time Series Analysis with Window Functions

In [0]:
-- Advanced time series analysis using Databricks SQL
-- Returns one row per pickup date with daily revenue, a trailing moving
-- average, growth versus the previous row, revenue rank/percentile, a running
-- revenue total, and a percentile-based performance tier.
WITH daily_metrics AS (
    -- Collapse individual trips into one row per calendar pickup date.
    SELECT 
        DATE(tpep_pickup_datetime) as ride_date,
        COUNT(*) as daily_trips,
        SUM(total_amount) as daily_revenue,
        AVG(fare_amount) as daily_avg_fare,
        AVG(tip_percentage) as daily_avg_tip_pct
    FROM uber_rides_enhanced
    GROUP BY DATE(tpep_pickup_datetime)
),
time_series_analysis AS (
    -- Layer window-function metrics on top of the daily aggregates.
    SELECT 
        ride_date,
        daily_trips,
        daily_revenue,
        daily_avg_fare,
        
        -- Moving average over the current row and the 6 rows before it.
        -- NOTE(review): the frame counts rows, not calendar days, so it only
        -- spans exactly 7 days when every date is present in daily_metrics.
        -- If gaps are possible, consider a RANGE frame on ride_date instead.
        AVG(daily_revenue) OVER (
            ORDER BY ride_date 
            ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
        ) as revenue_7day_ma,
        
        -- Revenue growth rate versus the previous row in date order
        -- (the previous *available* date, which may not be the day before).
        LAG(daily_revenue, 1) OVER (ORDER BY ride_date) as prev_day_revenue,
        -- NULLIF guards the denominator: a prior date with zero revenue would
        -- otherwise raise a divide-by-zero error under ANSI mode. Growth is
        -- NULL for the first row and whenever the previous revenue is 0.
        ROUND(
            (daily_revenue - LAG(daily_revenue, 1) OVER (ORDER BY ride_date)) / 
            NULLIF(LAG(daily_revenue, 1) OVER (ORDER BY ride_date), 0) * 100, 2
        ) as revenue_growth_pct,
        
        -- Rank days by performance (1 = best; ties share a rank and leave gaps)
        RANK() OVER (ORDER BY daily_revenue DESC) as revenue_rank,
        RANK() OVER (ORDER BY daily_trips DESC) as volume_rank,
        
        -- Percentile analysis: 0.0 for the lowest-revenue day, 1.0 for the highest
        PERCENT_RANK() OVER (ORDER BY daily_revenue) as revenue_percentile,
        
        -- Cumulative metrics: running total in date order. The explicit ROWS
        -- frame keeps the total strictly row-by-row.
        SUM(daily_revenue) OVER (
            ORDER BY ride_date 
            ROWS UNBOUNDED PRECEDING
        ) as cumulative_revenue
        
    FROM daily_metrics
)
SELECT 
    ride_date,
    daily_trips,
    ROUND(daily_revenue, 0) as daily_revenue,
    ROUND(revenue_7day_ma, 0) as revenue_7day_moving_avg,
    revenue_growth_pct,
    revenue_rank,
    ROUND(revenue_percentile * 100, 1) as revenue_percentile_score,
    ROUND(cumulative_revenue, 0) as cumulative_revenue,
    
    -- Performance classification: the first matching threshold wins, so the
    -- branches must stay ordered from highest percentile to lowest.
    CASE 
        WHEN revenue_percentile >= 0.9 THEN 'Top 10% Day'
        WHEN revenue_percentile >= 0.75 THEN 'Top 25% Day'
        WHEN revenue_percentile >= 0.5 THEN 'Above Average Day'
        WHEN revenue_percentile >= 0.25 THEN 'Average Day'
        ELSE 'Below Average Day'
    END as day_performance_tier
    
FROM time_series_analysis
ORDER BY ride_date;

ride_date,daily_trips,daily_revenue,revenue_7day_moving_avg,revenue_growth_pct,revenue_rank,revenue_percentile_score,cumulative_revenue,day_performance_tier
2016-03-01,22673,393959.0,393959.0,,2,0.0,393959.0,Below Average Day
2016-03-10,75901,1217607.0,805783.0,209.07,1,100.0,1611566.0,Top 10% Day


## 2. Geographic Clustering Analysis

In [0]:
-- Geographic clustering with statistical analysis
-- Buckets pickups into cells of latitude/longitude rounded to 2 decimals,
-- keeps cells with at least 50 trips, classifies each cell by where its
-- revenue falls among all retained cells, and returns the top 25 by revenue.
WITH location_clusters AS (
    -- One row per rounded (latitude, longitude) cell.
    SELECT 
        ROUND(pickup_latitude, 2) as lat_cluster,
        ROUND(pickup_longitude, 2) as lng_cluster,
        COUNT(*) as trip_count,
        SUM(total_amount) as cluster_revenue,
        AVG(fare_amount) as avg_fare,
        AVG(tip_percentage) as avg_tip_pct,
        AVG(trip_distance) as avg_distance,
        STDDEV(fare_amount) as fare_std_dev,
        
        -- Time-based patterns within clusters: the CASE yields NULL for
        -- non-matching rows, which COUNT ignores.
        COUNT(CASE WHEN is_rush_hour = 1 THEN 1 END) as rush_hour_trips,
        COUNT(CASE WHEN is_weekend = 1 THEN 1 END) as weekend_trips,
        
        -- Advanced metrics
        PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY fare_amount) as median_fare,
        PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY fare_amount) as p95_fare
        
    FROM uber_rides_enhanced
    GROUP BY ROUND(pickup_latitude, 2), ROUND(pickup_longitude, 2)
    HAVING COUNT(*) >= 50  -- Focus on significant clusters
),
cluster_statistics AS (
    SELECT *,
        -- Statistical classification: thresholds are revenue percentiles
        -- taken across every retained cluster (empty OVER () = whole set).
        -- The first matching branch wins, so keep them in descending order.
        CASE 
            WHEN cluster_revenue >= PERCENTILE_CONT(0.9) WITHIN GROUP (ORDER BY cluster_revenue) OVER () THEN 'Premium Zone'
            WHEN cluster_revenue >= PERCENTILE_CONT(0.7) WITHIN GROUP (ORDER BY cluster_revenue) OVER () THEN 'High-Value Zone'
            WHEN cluster_revenue >= PERCENTILE_CONT(0.3) WITHIN GROUP (ORDER BY cluster_revenue) OVER () THEN 'Standard Zone'
            ELSE 'Opportunity Zone'
        END as zone_classification,
        
        -- Rush hour dependency (share of trips, in percent). trip_count is
        -- at least 50 because of the HAVING filter, so it is never zero.
        ROUND(rush_hour_trips * 100.0 / trip_count, 1) as rush_hour_dependency_pct,
        
        -- Weekend activity (share of trips, in percent)
        ROUND(weekend_trips * 100.0 / trip_count, 1) as weekend_activity_pct,
        
        -- Price volatility: fare standard deviation as a percent of the mean
        -- fare. NULLIF avoids a divide-by-zero error (raised under ANSI mode)
        -- when a cluster's average fare is 0; the result is NULL in that case.
        ROUND(fare_std_dev / NULLIF(avg_fare, 0) * 100, 1) as fare_volatility_pct
        
    FROM location_clusters
)
SELECT 
    lat_cluster,
    lng_cluster,
    trip_count,
    ROUND(cluster_revenue, 0) as revenue,
    ROUND(avg_fare, 2) as avg_fare,
    ROUND(median_fare, 2) as median_fare,
    ROUND(avg_tip_pct, 1) as avg_tip_pct,
    zone_classification,
    rush_hour_dependency_pct,
    weekend_activity_pct,
    fare_volatility_pct,
    
    -- Strategic insights: checked in priority order (rush hour, then weekend,
    -- then price stability). A NULL fare_volatility_pct fails the comparison
    -- and falls through to 'Mixed-Use Zone'.
    CASE 
        WHEN rush_hour_dependency_pct >= 40 THEN 'Business District Pattern'
        WHEN weekend_activity_pct >= 30 THEN 'Entertainment District Pattern'
        WHEN fare_volatility_pct <= 15 THEN 'Stable Pricing Zone'
        ELSE 'Mixed-Use Zone'
    END as zone_pattern
    
FROM cluster_statistics
-- The cluster key breaks revenue ties so LIMIT returns the same 25 rows on
-- every run.
ORDER BY cluster_revenue DESC, lat_cluster, lng_cluster
LIMIT 25;

lat_cluster,lng_cluster,trip_count,revenue,avg_fare,median_fare,avg_tip_pct,zone_classification,rush_hour_dependency_pct,weekend_activity_pct,fare_volatility_pct,zone_pattern
40.75,-73.99,6335,89254.0,11.44,9.0,13.8,Premium Zone,31.4,0.0,71.5,Mixed-Use Zone
40.76,-73.98,5359,88579.0,13.12,9.5,15.1,Premium Zone,20.5,0.0,87.4,Mixed-Use Zone
40.76,-73.97,6071,87746.0,11.61,9.0,14.6,Premium Zone,24.8,0.0,77.6,Mixed-Use Zone
40.75,-73.98,5430,77956.0,11.4,8.5,15.2,Premium Zone,32.3,0.0,78.8,Mixed-Use Zone
40.76,-73.99,4767,72742.0,12.38,9.0,12.9,Premium Zone,28.7,0.0,83.8,Mixed-Use Zone
40.74,-73.99,4171,58472.0,11.19,9.0,15.9,Premium Zone,27.8,0.0,70.1,Mixed-Use Zone
40.77,-73.87,1178,51602.0,33.65,34.0,15.4,Premium Zone,28.7,0.0,36.8,Mixed-Use Zone
40.77,-73.96,3977,50944.0,10.4,8.5,13.5,Premium Zone,41.2,0.0,69.0,Business District Pattern
40.77,-73.86,956,46081.0,36.47,36.5,17.8,Premium Zone,31.9,0.0,26.8,Mixed-Use Zone
40.77,-73.98,2931,42810.0,11.83,9.5,14.0,High-Value Zone,30.4,0.0,75.7,Mixed-Use Zone
