In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import pyarrow.parquet as pq
import geojson
import plotly.express as px
import plotly.io as pio
import json

In [None]:
def load_trips(year=2022, data_dir='data'):
    """Load twelve monthly yellow-taxi parquet files into one DataFrame.

    Reads ``{data_dir}/yellow_tripdata_{year}-MM.parquet`` for MM = 01..12 with
    ``pq.read_table`` and stacks the resulting frames in month order.

    Parameters
    ----------
    year : int, default 2022
        Year that appears in the parquet file names.
    data_dir : str, default 'data'
        Directory that holds the monthly parquet files.

    Returns
    -------
    pandas.DataFrame
        All months concatenated, with a fresh 0..n-1 index.
    """
    # Collect the monthly frames first and concatenate once: calling
    # pd.concat inside the loop re-copies everything accumulated so far
    # on every iteration.
    monthly_frames = [
        pq.read_table(f'{data_dir}/yellow_tripdata_{year}-{month:02}.parquet').to_pandas()
        for month in range(1, 13)
    ]
    return pd.concat(monthly_frames, ignore_index=True)

# The combined CSV is a cache of the monthly parquet files. Reuse it when it
# exists; otherwise build it once with load_trips() so the notebook still runs
# top to bottom on a fresh checkout instead of failing on a missing file.
COMBINED_CSV = 'data/yellow_tripdata_combined.csv'
try:
    df = pd.read_csv(COMBINED_CSV)
except FileNotFoundError:
    df = load_trips()
    # index=False: the 0..n-1 index carries no information and would come
    # back as an extra unnamed column on the next read.
    df.to_csv(COMBINED_CSV, index=False)

In [None]:
# Parse the pickup/dropoff timestamp columns into datetimes stored under shorter names.
for parsed_name, raw_name in (
    ("pickup_at", "tpep_pickup_datetime"),
    ("dropoff_at", "tpep_dropoff_datetime"),
):
    df[parsed_name] = pd.to_datetime(df[raw_name])
df

In [None]:
# Load the taxi-zone GeoJSON used for the zone names and the map below.
# The encoding is given explicitly because open() otherwise falls back to the
# platform default, which is not UTF-8 everywhere.
with open("data/nyc-taxi-zones.geojson", encoding="utf-8") as f:
    taxi_zones = json.load(f)

In [None]:
# Count trips per pickup location ID.
pickups = df.groupby("PULocationID").size().reset_index(name="Pickups")

# Pull each zone's name and location ID out of the GeoJSON feature properties
# in a single pass over the features.
zones = pd.DataFrame(
    [
        (feature["properties"]["zone"], feature["properties"]["location_id"])
        for feature in taxi_zones["features"]
    ],
    columns=["zone", "location_id"],
)
# Cast to int so the IDs can be matched against PULocationID
# (presumably the GeoJSON stores them as strings; verify against the file).
zones["location_id"] = zones["location_id"].astype(int)
# Keep one row per location ID: a repeated ID in the GeoJSON would otherwise
# duplicate the matching pickup row in the merge below.
zones = zones.drop_duplicates(subset="location_id")

# Add the zone name to the pickup counts. This is an inner join, so pickup IDs
# without a matching zone in the GeoJSON are dropped.
pickups = pd.merge(pickups, zones, left_on="PULocationID", right_on="location_id")

# TODO: make timestamps for pickups (not implemented yet)
pickups


In [None]:
# Choropleth of pickup counts per taxi zone, centred on New York City.
pickup_counts = pickups["Pickups"]
nyc_center = {"lat": 40.7128, "lon": -74.0060}

fig = px.choropleth_mapbox(
    pickups,
    geojson=taxi_zones,
    # Match each row's PULocationID to the feature's properties.location_id.
    featureidkey="properties.location_id",
    locations="PULocationID",
    color="Pickups",
    hover_name="zone",
    labels={'Pickups':'Number of pickups', 'PULocationID':'Pickup location ID'},
    # Colour scale spans the observed pickup counts.
    range_color=(pickup_counts.min(), pickup_counts.max()),
    color_continuous_scale="Viridis",
    opacity=0.5,
    mapbox_style="carto-positron",
    center=nyc_center,
    zoom=9,
)

# Remove the figure margins.
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))

# The figure is written to an HTML file rather than shown inline (fig.show()).
pio.write_html(fig, file="map_plot.html")

In [None]:
# Bar chart of the number of trips starting in each hour of the day.
pickup_hour = df["pickup_at"].dt.hour
pickups_per_hour = df.groupby(pickup_hour).size()
pickups_per_hour.plot.bar(title="Pickups by hour", xlabel="Hour", ylabel="Number of pickups")

In [None]:
# Histogram of tip amounts restricted to the 0-20 range (presumably USD), in 50 bins.
tips = df["tip_amount"]
tips.plot.hist(
    bins=50,
    range=(0, 20),
    title="Tip amount",
    xlabel="Tip amount",
    ylabel="Number of pickups",
)

In [None]:
# Work on the first 10000 trips only; the cells below reuse this sample.
first_trips = df.iloc[:10000]

# Distance against fare for the sample.
# TODO: overlay a trend line (not drawn yet).
first_trips.plot.scatter(
    x="trip_distance",
    y="fare_amount",
    title="Distance vs. fare",
    xlabel="Distance",
    ylabel="Fare",
)

# (Call the same plot on df instead of first_trips to include every trip.)

In [None]:
# Some fares in the sample are negative, which is unexpected.
# Possible explanations are disputes or refunds (not verified here).
has_negative_fare = first_trips["fare_amount"].lt(0)
first_trips.loc[has_negative_fare]

In [None]:
# Horizontal boxplots of fare amount and tip amount for the sampled trips.
# The readable names are applied by renaming the columns: DataFrame.boxplot
# labels each box with its column name, so a `labels=` keyword forwarded to
# matplotlib does not end up on the axis.
fare_and_tip = first_trips[["fare_amount", "tip_amount"]].rename(
    columns={"fare_amount": "Fare amount", "tip_amount": "Tip amount"}
)
fare_and_tip.boxplot(
    grid=False,
    showfliers=True,   # draw outliers
    showmeans=True,    # mark the mean ...
    meanline=True,     # ... as a line across the box
    vert=False,        # horizontal boxes
)

In [None]:
# Largest tip among the sampled trips.
largest_tip = first_trips["tip_amount"].max()
largest_tip