# NYC Yellow Taxi EDA Starter Notebook

This notebook explores NYC yellow taxi trip data to understand rider behavior, pricing dynamics, and operational patterns. It focuses on data understanding and visualization to guide future modeling or dashboard efforts.

In [None]:
import pathlib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# Plot defaults shared by every figure in the notebook.
sns.set_theme(style="whitegrid", context="talk")
plt.rcParams["figure.figsize"] = (12, 6)

# Data locations are resolved relative to the kernel's current working directory.
BASE = pathlib.Path.cwd()
RAW_PATH = BASE / "data" / "raw" / "nyc_taxi" / "yellow_tripdata_2024-01.parquet"
SAMPLE_PATH = BASE / "data" / "sample" / "yellow_taxi_sample.csv"

# Prefer the raw parquet file; use the CSV sample only when the raw file is missing.
raw_available = RAW_PATH.exists()
print(f"Raw dataset exists: {raw_available}")
if raw_available:
    df = pd.read_parquet(RAW_PATH)
elif SAMPLE_PATH.exists():
    # Announce the fallback only when it is actually the source being read, so the
    # printed log always matches the data loaded into `df`.
    print(f"Using sample fallback: {SAMPLE_PATH}")
    df = pd.read_csv(SAMPLE_PATH, parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"])
else:
    # Fail early with both searched locations instead of an opaque read_csv error.
    raise FileNotFoundError(
        f"No taxi data found. Looked for {RAW_PATH} and {SAMPLE_PATH}."
    )

print(df.shape)
df.head()

In [None]:
# Detach from the loaded frame before adding derived columns.
df = df.copy()

# Floor negative distances at zero.
df['trip_distance'] = df['trip_distance'].clip(lower=0)

# Elapsed time between pickup and dropoff, expressed in minutes.
elapsed = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
df['trip_duration_minutes'] = elapsed.dt.total_seconds() / 60

# Speed is only kept for strictly positive durations; every other row becomes NaN,
# which also avoids reporting a division by zero as a speed.
positive_duration = df['trip_duration_minutes'] > 0
duration_hours = df['trip_duration_minutes'] / 60
df['avg_speed_mph'] = np.where(positive_duration, df['trip_distance'] / duration_hours, np.nan)

# Calendar features derived from the pickup timestamp.
pickup_ts = df['tpep_pickup_datetime'].dt
df['pickup_hour'] = pickup_ts.hour
df['pickup_weekday'] = pickup_ts.day_name()
df['pickup_date'] = pickup_ts.date

# Tip as a fraction of the fare; NaN whenever the fare is not strictly positive.
positive_fare = df['fare_amount'] > 0
df['tip_rate'] = np.where(positive_fare, df['tip_amount'] / df['fare_amount'], np.nan)

# Descriptive statistics (with upper-tail percentiles) for the key numeric columns.
summary_columns = [
    'trip_distance',
    'trip_duration_minutes',
    'fare_amount',
    'tip_amount',
    'total_amount',
    'avg_speed_mph',
    'tip_rate',
]
summary_percentiles = [0.5, 0.75, 0.9, 0.95]
summary = df[summary_columns].describe(percentiles=summary_percentiles)
summary

## Trip Distance & Duration

Trip distance and duration distributions highlight the spread of short hops versus airport runs and help identify data quality issues (e.g., zero or extreme values).

In [None]:
# One histogram panel per metric: (column, title, x-axis label, color).
hist_panels = [
    ('trip_distance', 'Trip Distance Distribution', 'Distance (miles)', "#1f77b4"),
    ('trip_duration_minutes', 'Trip Duration Distribution', 'Duration (minutes)', "#ff7f0e"),
]

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
for panel_ax, (column, title, xlabel, color) in zip(axes, hist_panels):
    # 40-bin histogram with a KDE overlay drawn on the panel's own axes.
    sns.histplot(df[column], bins=40, kde=True, ax=panel_ax, color=color)
    panel_ax.set_title(title)
    panel_ax.set_xlabel(xlabel)
    panel_ax.set_ylabel('Trips')
fig.tight_layout()
plt.show()

## Speed and Distance Relationship

Average speed can indicate traffic conditions and potential data outliers. Short trips with implausibly high speeds may point to recording errors in the distance or timestamp fields.

In [None]:
# Restrict to trips of at most 25 miles and drop rows with a missing distance or speed.
# avg_speed_mph is NaN for trips with a non-positive duration, and jointplot keeps
# missing rows by default (dropna=False), which can break the hex bin sizing.
speed_subset = (
    df.query('trip_distance <= 25')
      .dropna(subset=['trip_distance', 'avg_speed_mph'])
)

sns.jointplot(
    data=speed_subset,
    x='trip_distance',
    y='avg_speed_mph',
    kind='hex',
    height=8,
    marginal_kws={'bins': 25, 'fill': False},
    color='#2ca02c'
)
# jointplot creates its own figure, so the title is placed slightly above it.
plt.suptitle('Trip Distance vs. Average Speed (capped at 25 miles)', y=1.02)
plt.show()

## Fare Composition

Understanding how fares scale with distance and the contribution of extras (tips, tolls) helps with revenue forecasting and policy design.

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
distance_ax, components_ax = axes

# Left panel: density of fare against distance; hexagons with no trips stay blank (mincnt=1).
distance_ax.hexbin(
    df['trip_distance'], df['fare_amount'], gridsize=35, cmap='viridis', mincnt=1
)
distance_ax.set_title('Fare vs. Distance')
distance_ax.set_xlabel('Distance (miles)')
distance_ax.set_ylabel('Fare amount ($)')

# Right panel: column totals for each charge component, largest first.
component_columns = ['fare_amount', 'tip_amount', 'tolls_amount', 'extra', 'congestion_surcharge']
fare_components = df[component_columns].sum()
ranked_components = fare_components.sort_values(ascending=False)
ranked_components.plot(kind='bar', ax=components_ax, color='#9467bd')
components_ax.set_title('Total Fare Components')
components_ax.set_ylabel('Dollars collected')

plt.tight_layout()
plt.show()

## Tip Behavior by Payment Type

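Cash and credit card payments often show different tipping behaviors. Comparing tip rates across payment types highlights rider preferences and potential leakage. Note that, per the TLC data dictionary, `tip_amount` is populated only for credit card tips; cash tips are not recorded, so tip rates for cash trips will appear close to zero.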

In [None]:
# Trip counts and tip-rate summaries per payment type, busiest payment type first.
tip_stats = (
    df.groupby('payment_type')
      .agg(trip_count=('payment_type', 'size'),
           avg_tip_rate=('tip_rate', 'mean'),
           median_tip_rate=('tip_rate', 'median'))
      .sort_values('trip_count', ascending=False)
)

fig, ax = plt.subplots(figsize=(12, 6))

# Draw the bars against string labels so they appear in tip_stats row order.
# Numeric payment codes on a seaborn categorical axis are re-sorted numerically,
# which would misalign the bars with the value labels below (placed by row position).
payment_labels = tip_stats.index.astype(str).tolist()
bar_colors = sns.color_palette('Blues_d', n_colors=len(tip_stats))
ax.bar(payment_labels, tip_stats['avg_tip_rate'].to_numpy(), color=bar_colors)
ax.xaxis.grid(False)  # vertical grid lines add nothing on a categorical axis

ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: f"{y:.0%}"))
ax.set_title('Average Tip Rate by Payment Type')
ax.set_xlabel('Payment Type')
ax.set_ylabel('Average tip rate')

# Label each bar with its rate; skip payment types with no defined tip rate
# (all fares non-positive) rather than printing "nan%".
for position, rate in enumerate(tip_stats['avg_tip_rate']):
    if pd.notna(rate):
        ax.text(position, rate + 0.005, f"{rate:.0%}", ha='center', va='bottom')

plt.tight_layout()
plt.show()
tip_stats

## Temporal Patterns

Plotting trips across hours and weekdays reveals commuting peaks and weekend trends. Daily totals help track ridership changes or weather disruptions.

In [None]:
# Trip counts for every (weekday, hour) combination present in the data.
heatmap_data = (
    df.groupby(['pickup_weekday', 'pickup_hour'])
      .size()
      .reset_index(name='trips')
)
weekday_order = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
hour_order = list(range(24))

# Force the full 7 x 24 grid. Combinations with no trips are shown as 0 instead of
# blank cells or missing columns, which matters most on the small sample fallback.
heatmap_pivot = (
    heatmap_data.pivot(index='pickup_weekday', columns='pickup_hour', values='trips')
                .reindex(index=weekday_order, columns=hour_order)
                .fillna(0)
                .astype(int)
)

fig, ax = plt.subplots(figsize=(16, 6))
sns.heatmap(heatmap_pivot, cmap='YlOrRd', ax=ax)
ax.set_title('Trips by Pickup Hour and Weekday')
ax.set_xlabel('Hour of day')
ax.set_ylabel('Weekday')
fig.tight_layout()
plt.show()

# Number of trips per pickup calendar date.
daily_trips = df.groupby('pickup_date').size()
fig, ax = plt.subplots(figsize=(14, 4))
daily_trips.plot(marker='o', color='#d62728', ax=ax)
ax.set_title('Daily Trip Counts')
ax.set_xlabel('Date')
ax.set_ylabel('Trips')
ax.grid(True)
fig.tight_layout()
plt.show()

## Passenger & Zone Insights

Passenger counts and pickup locations highlight demand segments. Identifying top pickup zones can guide marketing or driver positioning.

In [None]:
# Number of trips for each reported passenger count.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='passenger_count', data=df, color='#8c564b', ax=ax)
ax.set(
    title='Passenger Count Distribution',
    xlabel='Passengers',
    ylabel='Trips',
)
plt.tight_layout()
plt.show()

# Per-pickup-zone metrics, expressed as pandas named aggregations.
zone_metrics = {
    'trips': ('PULocationID', 'size'),
    'median_fare': ('fare_amount', 'median'),
    'avg_distance': ('trip_distance', 'mean'),
}
zones_ranked = (
    df.groupby('PULocationID')
      .agg(**zone_metrics)
      .sort_values('trips', ascending=False)
)

# Ten pickup zones with the most trips.
zone_summary = zones_ranked.head(10)
zone_summary

## Next Steps

* Investigate geospatial patterns by joining TLC zone shapefiles to the `PULocationID` and `DOLocationID` columns.
* Layer in weather data to explain daily ridership swings.
* Create derived features (e.g., airport flag, Manhattan flag) to support demand modeling.
* Validate assumptions against full historical data once available.