In [1]:
"""
Analyze K-Means clustering results for NYC Taxi data.

Outputs summary stats per cluster, top pickup zones,
and basic distributions to console.
"""

import pandas as pd
import numpy as np

# -----------------------------
# Load data
# -----------------------------
DATA_PATH = "datasets/clustered_taxi_data.csv"
LOOKUP_PATH = "datasets/taxi_zone_lookup.csv"

print("Loading data...")
# low_memory=False makes pandas infer each column's dtype from the whole
# file instead of chunk by chunk, which avoids mixed-type columns.
df = pd.read_csv(DATA_PATH, low_memory=False)

# Compute tip_percent if missing: tip as a percentage of the fare, left as
# NaN wherever the fare is not strictly positive.
if "tip_percent" not in df.columns:
    df["tip_percent"] = np.where(df["fare_amount"] > 0,
                                 df["tip_amount"] / df["fare_amount"] * 100.0,
                                 np.nan)

# Try loading zone lookup. Only the read itself can raise FileNotFoundError,
# so the column clean-up lives in the `else` branch where any other failure
# surfaces normally instead of hiding inside the guarded section.
try:
    lookup = pd.read_csv(LOOKUP_PATH)
except FileNotFoundError:
    # Best effort: the analysis still runs, just without Borough/Zone names.
    lookup = None
    print(f"Zone lookup not found at {LOOKUP_PATH}; zone names will be omitted.")
else:
    # Rename the key so it matches the pickup-zone column of the trip data.
    lookup = lookup.rename(columns={"LocationID": "PULocationID"})
    lookup["PULocationID"] = lookup["PULocationID"].astype(int)

# Nullable integer dtype keeps missing pickup zones as <NA> rather than
# forcing the whole column to float.
df["PULocationID"] = df["PULocationID"].astype("Int64")

# .tolist() converts NumPy scalars to plain Python values so the list prints
# as [0, 1, ...] instead of [np.int64(0), np.int64(1), ...].
print("Clusters present:", sorted(df["cluster"].unique().tolist()))
print()

# -----------------------------
# Cluster sizes
# -----------------------------
# Number of trips in each cluster, listed in cluster-label order rather than
# by descending frequency.
sizes = df["cluster"].value_counts().sort_index()
print("=== Cluster Sizes ===", sizes, "", sep="\n")

# -----------------------------
# Cluster profiles (means)
# -----------------------------
# Numeric features summarised per cluster (and reused for correlations).
num_cols = [
    "passenger_count",
    "trip_distance",
    "fare_amount",
    "tip_amount",
    "total_amount",
    "tip_percent",
]

# Mean of every numeric feature within each cluster, rounded to two decimals
# for display.
profiles = df[["cluster", *num_cols]].groupby("cluster").mean().round(2)
print("=== Cluster Profiles (mean values) ===", profiles, "", sep="\n")

# -----------------------------
# Time distributions
# -----------------------------
def _pct_within_cluster(time_col):
    """Return the row-normalised share (%) of trips per `time_col` value.

    Rows are clusters, columns are the distinct values of `time_col`; every
    row is scaled by its own total and rounded to one decimal.
    """
    # aggfunc="size" counts rows per (cluster, time_col) cell; combinations
    # that never occur are filled with 0.
    counts = df.pivot_table(
        index="cluster",
        columns=time_col,
        values="fare_amount",
        aggfunc="size",
        fill_value=0,
    )
    row_totals = counts.sum(axis=1)
    return (counts.div(row_totals, axis=0) * 100).round(1)


hour_dist = _pct_within_cluster("pickup_hour")
print("=== Hour-of-Day Distribution (% within cluster) ===", hour_dist, "", sep="\n")

weekday_dist = _pct_within_cluster("pickup_weekday")
print("=== Weekday Distribution (% within cluster) ===", weekday_dist, "", sep="\n")

# -----------------------------
# Top pickup zones per cluster
# -----------------------------
def top_pickup_zones(df_in, cluster_id, topn=10, zone_lookup=None):
    """Print the most frequent pickup zones of one cluster.

    Args:
        df_in: Trip table with at least the columns ``cluster`` and
            ``PULocationID``.
        cluster_id: Label of the cluster to report on.
        topn: Maximum number of zones to list.
        zone_lookup: Optional table with the columns ``PULocationID``,
            ``Borough`` and ``Zone`` used to attach readable names. When
            omitted, the module-level ``lookup`` table is used if it exists;
            if there is none, the report simply has no name columns.

    Returns:
        None. The table (or a "no pickup zone data" notice) is printed.
    """
    if zone_lookup is None:
        # Fall back to the script-level table without raising NameError when
        # it was never loaded (e.g. the function is reused on its own).
        zone_lookup = globals().get("lookup")

    # Boolean-mask selection already yields a new frame and nothing below
    # mutates it, so no extra defensive copy is needed.
    sub = df_in[df_in["cluster"] == cluster_id]
    if sub["PULocationID"].isna().all():
        print(f"Cluster {cluster_id}: no pickup zone data.\n")
        return

    top = (
        sub.groupby("PULocationID")
        .size()
        .reset_index(name="count")
        .sort_values("count", ascending=False)
        .head(topn)
    )
    if zone_lookup is not None:
        # Left join keeps zones that have no entry in the lookup table.
        top = top.merge(zone_lookup[["PULocationID", "Borough", "Zone"]],
                        on="PULocationID", how="left")

    # Shares are relative to every trip in the cluster, including rows whose
    # pickup zone is missing (those rows are excluded from the counts).
    total = len(sub)
    top["share_pct"] = (top["count"] / total * 100).round(2)
    print(f"=== Top Pickup Zones: Cluster {cluster_id} ===")
    print(top)
    print()

# Report the busiest pickup zones cluster by cluster, in ascending label order.
for cluster_label in sorted(pd.unique(df["cluster"])):
    top_pickup_zones(df, cluster_label)

# -----------------------------
# Simple correlation check
# -----------------------------
# Pairwise correlations of the numeric features over all trips (not split by
# cluster), rounded to two decimals for display.
corr = df.loc[:, num_cols].corr().round(2)
print("=== Overall Feature Correlations ===", corr, "", sep="\n")

print("Done.")



# -------------------------------------------------------------------------------
# MAKE SURE TO SHOW ALL OUTPUTS ("make scrollable")
# -------------------------------------------------------------------------------



Loading data...
Clusters present: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7)]

=== Cluster Sizes ===
cluster
0    143856
1     77850
2     97176
3    156770
4    161439
5    130510
6     52732
7         3
Name: count, dtype: int64

=== Cluster Profiles (mean values) ===
         passenger_count  trip_distance  fare_amount  tip_amount  \
cluster                                                            
0                   1.12           1.82        12.71        2.68   
1                   1.49          14.99        63.75       11.36   
2                   1.18           2.65        15.13        2.49   
3                   1.17           2.07        13.83        2.69   
4                   1.14           1.68        12.15        2.55   
5                   1.15           2.74        16.37        2.84   
6                   3.77           2.35        15.59        2.64   
7                   1.00        2446.67        40.33    