In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import contextily as ctx
import geopandas as gpd
from pyathena import connect
# Taxi-zone geometries, read once when this cell runs; the plotting code in
# traffic_analysis() looks this frame up as the module-level name `gdf`.
gdf = gpd.read_file(filename="./taxi_zones.zip")

def traffic_analysis(date, borough=None, brand=None, n_clusters=3):
    """Report and plot one day of ride-hail traffic queried from Athena.

    Loads the yearly trip table for ``brand`` from the ``fireflychick``
    database, keeps the trips picked up on ``date`` (optionally restricted to
    ``borough``), prints the day's total driver pay and miles, draws the
    taxi-zone map and an hourly trip-count bar chart, prints the most common
    pickup and drop-off zones, and clusters the hourly counts with K-means.

    Parameters
    ----------
    date : str or datetime-like
        Day to analyse, e.g. ``'2021-12-05'``; anything ``pd.to_datetime``
        accepts. Its year selects which yearly table is queried.
    borough : str, optional
        Keep only trips whose ``puborough`` or ``doborough`` equals this
        value. ``None`` analyses every trip and draws the full zone map.
    brand : str, optional
        ``'Uber'`` or ``'Lyft'`` selects that brand's table; any other value
        (including ``None``) selects the combined ``uber+lyft`` table.
    n_clusters : int, default 3
        Number of K-means clusters fitted to the (hour, trip count) points.

    Returns
    -------
    None
        All results are printed or shown as figures.

    Notes
    -----
    Relies on names defined outside this function: the ``gdf`` taxi-zone
    GeoDataFrame and the ``AWS_ACCESS_KEY`` / ``AWS_SECRET_KEY`` /
    ``AWS_REGION`` settings (not defined in this cell; they must exist in the
    kernel before the function is called).
    """
    year = pd.to_datetime(date).year

    def format_hour_ampm(hour):
        """Render an hour in 0-23 as a 12-hour clock label such as '3 PM'."""
        if hour == 0:
            return '12 AM'
        elif hour < 12:
            return f'{hour} AM'
        elif hour == 12:
            return '12 PM'
        else:
            return f'{hour - 12} PM'

    def plot(df, date, borough=None, n_clusters=3):
        """Print the daily summary and draw every chart for ``date``."""
        target_date = pd.to_datetime(date).date()
        pickup_times = pd.to_datetime(df['pickup_datetime'])
        on_target_date = pickup_times.dt.date == target_date

        # Boolean .loc yields a new frame and assign() returns another one, so
        # the caller's frame is left untouched and no SettingWithCopyWarning
        # is triggered by writing the parsed timestamps.
        filtered_df = df.loc[on_target_date].assign(
            pickup_datetime=pickup_times.loc[on_target_date]
        )
        # df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'])

        daily_aggregate_driver_pay = filtered_df['driver_pay'].sum()
        daily_aggregate_miles = filtered_df['trip_miles'].sum()

        print(f"Daily Aggregate Revenue on {date}: ${daily_aggregate_driver_pay:.2f}")
        print(f"Daily Aggregate Miles Covered on {date}: {daily_aggregate_miles:.2f} miles")

        if borough is None:
            # City-wide view: every taxi zone over a basemap.
            fig, ax = plt.subplots(figsize=(20, 20))
            gdf.plot(ax=ax, alpha=0.4, edgecolor='k')
            ctx.add_basemap(ax, crs=gdf.crs.to_string(), source=ctx.providers.CartoDB.Positron)
            plt.show()

        if borough:
            touches_borough = (filtered_df['puborough'] == borough) | (filtered_df['doborough'] == borough)
            filtered_df = filtered_df[touches_borough]

        if borough and gdf is not None:
            borough_zones = gdf[gdf['borough'] == borough]
            fig, ax = plt.subplots(figsize=(10, 10))
            borough_zones.plot(ax=ax, alpha=0.5, edgecolor='k')
            ctx.add_basemap(ax, crs=borough_zones.crs.to_string(), source=ctx.providers.CartoDB.Positron)
            ax.set_title(f"{borough} Taxi Zones")
            plt.show()

        # With no trips left, idxmax() and the clustering below would raise;
        # report it and stop instead.
        if filtered_df.empty:
            where = f" in {borough}" if borough else ""
            print(f"No trips found on {date}{where}; nothing to plot.")
            return

        # Group on a derived Series rather than adding an 'hour' column, so
        # the (possibly sliced) frame is never written to.
        pickup_hour = filtered_df['pickup_datetime'].dt.hour.rename('hour')
        hourly_traffic = filtered_df.groupby(pickup_hour).size().reset_index(name='trip_counts')

        # The bar chart shows all 24 hours, with zero for hours that had no
        # trips, so bar positions always correspond to the hour of the day.
        hourly_all_hours = (
            hourly_traffic.set_index('hour')['trip_counts']
            .reindex(range(24), fill_value=0)
            .rename_axis('hour')
            .reset_index()
        )

        location = f"in {borough}" if borough else "in New York City"
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.barplot(data=hourly_all_hours, x='hour', y='trip_counts', color='skyblue', ax=ax)
        ax.set_title(f'Hourly Traffic Volume on {date} {location}')
        ax.set_xlabel('Hour of the Day')
        ax.set_ylabel('Number of Trips')
        ax.grid(axis='y')
        plt.show()

        most_common_pu_zone = filtered_df['puzone'].value_counts().idxmax()
        most_common_do_zone = filtered_df['dozone'].value_counts().idxmax()
        print(f"Most common pickup zone: {most_common_pu_zone}")
        print(f"Most common drop-off zone: {most_common_do_zone}")

        day_name = pd.to_datetime(date).day_name()

        # KMeans cannot fit more clusters than there are samples (one sample
        # per hour that had at least one trip).
        if len(hourly_traffic) < n_clusters:
            print(
                f"Only {len(hourly_traffic)} hour(s) with trips on {date}; "
                f"at least {n_clusters} are needed to form {n_clusters} clusters."
            )
            return

        # Standardise so hour (0-23) and trip count contribute on the same scale.
        scaler = StandardScaler()
        scaled_features = scaler.fit_transform(hourly_traffic[['hour', 'trip_counts']])

        kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
        kmeans.fit(scaled_features)
        hourly_traffic['cluster'] = kmeans.labels_

        hourly_traffic['hour_ampm'] = hourly_traffic['hour'].apply(format_hour_ampm)

        # Keep the original three colours when they suffice; otherwise build a
        # palette with one colour per cluster so any n_clusters can be drawn.
        custom_palette = ['#d62728', '#2ca02c', '#1f77b4']
        n_found = hourly_traffic['cluster'].nunique()
        if n_found <= len(custom_palette):
            cluster_palette = custom_palette[:n_found]
        else:
            cluster_palette = sns.color_palette('husl', n_found)

        fig, ax = plt.subplots(figsize=(12, 7))
        sns.scatterplot(
            x='hour_ampm', y='trip_counts', data=hourly_traffic,
            hue='cluster', palette=cluster_palette, s=100, ax=ax,
        )
        ax.set_title(f"Peak Traffic Clusters on {date} {day_name}")
        ax.set_xlabel('Time of Day')
        ax.set_ylabel('Number of Trips')
        ax.tick_params(axis='x', rotation=45)
        ax.legend(title='Cluster')
        ax.grid(True)
        plt.show()

        # Map the centres back from standardised units to (hour, trips).
        cluster_centers = scaler.inverse_transform(kmeans.cluster_centers_)
        for i, center in enumerate(cluster_centers):
            print(f"Cluster {i} on {date} {day_name}: Peak around {format_hour_ampm(int(center[0]))} with {int(center[1])} trips")

    if brand == 'Uber':
        table_name = f"uber_{year}"
    elif brand == 'Lyft':
        table_name = f"lyft_{year}"
    else:
        table_name = f"uber+lyft_{year}"

    # Double-quote the identifier: the combined table's name contains '+',
    # which the SQL parser would otherwise read as an arithmetic operator.
    # NOTE(review): this still pulls the whole yearly table; pushing the date
    # filter into the query would be far cheaper, but the column's storage
    # type is not visible here, so it is left as is.
    query = f'SELECT * FROM fireflychick."{table_name}"'

    # Open a single connection and always release it, even if the query fails.
    conn = connect(aws_access_key_id=AWS_ACCESS_KEY,
                   aws_secret_access_key=AWS_SECRET_KEY,
                   s3_staging_dir='s3://fireflychick/Results/',
                   region_name=AWS_REGION)
    try:
        df = pd.read_sql_query(query, conn)
    finally:
        conn.close()

    if borough:
        df = df[(df['puborough'] == borough) | (df['doborough'] == borough)]

    plot(df, date, borough, n_clusters)

DriverError: '/vsizip/./taxi_zones.zip' does not exist in the file system, and is not recognized as a supported dataset name.

In [None]:
# Example run: Uber trips touching Brooklyn on 2021-12-05, grouped into 3 clusters.
# date format is YYYY-MM-DD
traffic_analysis(date='2021-12-05', borough='Brooklyn', brand='Uber', n_clusters=3)