In [8]:
import pandas as pd
# Relative path to the zone-hour demand aggregate (parquet); it is resolved
# against the kernel's working directory.
AGG_PATH = "../../data/processed/nyc_demand_zone_hour_2019_q1.parquet"

# Load the clean snapshot; the cells below in this section all derive from `df`.
df = pd.read_parquet(AGG_PATH)
# Display (rows, columns) as a quick sanity check on the load.
df.shape

(296807, 10)

In [9]:
# Preview the first rows to check the column names and value formats.
df.head()

Unnamed: 0,zone_id,pickup_hour_ts,demand,avg_fare,avg_distance,hour,day_of_week,is_weekend,day,month
0,1,2019-01-01 10:00:00,2,61.25,16.9,10,1,0,1,1
1,1,2019-01-01 12:00:00,1,135.0,19.3,12,1,0,1,1
2,1,2019-01-01 15:00:00,1,106.0,41.28,15,1,0,1,1
3,1,2019-01-02 02:00:00,1,30.0,1.27,2,2,0,2,1
4,1,2019-01-02 03:00:00,1,15.0,12.65,3,2,0,2,1


### Select features for clustering
We select numerical and behavioural features that describe demand patterns at the zone–hour level. Identifiers and timestamps are excluded.

In [10]:
# Columns used as clustering inputs. `zone_id`, `pickup_hour_ts`, `day` and
# `month` are present in `df` but are not included here.
# NOTE(review): `hour` and `day_of_week` are cyclic but are used as plain
# numbers (hour 23 and hour 0 end up far apart) — confirm this is intended.
features = ["demand", "avg_fare", "avg_distance", "hour", "day_of_week", "is_weekend"]

# Take an independent copy so later changes to X cannot write through to df.
X = df.loc[:, features].copy()
X.head()

Unnamed: 0,demand,avg_fare,avg_distance,hour,day_of_week,is_weekend
0,2,61.25,16.9,10,1,0
1,1,135.0,19.3,12,1,0
2,1,106.0,41.28,15,1,0
3,1,30.0,1.27,2,2,0
4,1,15.0,12.65,3,2,0


### Check feature scales
Before normalization, we inspect the scale of each feature to confirm that the features are not on comparable scales and therefore require scaling.

In [11]:
# Summary statistics per feature, transposed so that each feature is one row
# and the differences in scale are easy to compare.
X.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
demand,296807.0,73.795578,140.608946,1.0,1.0,6.0,75.0,1390.0
avg_fare,296807.0,18.929249,15.091222,0.01,10.333333,13.012195,22.0,412.0
avg_distance,296807.0,4.973577,4.633993,0.01,2.169503,3.136,5.8575,83.61
hour,296807.0,11.758732,6.541957,0.0,7.0,12.0,17.0,23.0
day_of_week,296807.0,3.038338,1.973792,0.0,1.0,3.0,5.0,6.0
is_weekend,296807.0,0.285118,0.451471,0.0,0.0,0.0,1.0,1.0


The features have very different scales: `demand` ranges from 1 to 1390 (std ≈ 140.6), whereas `is_weekend` is binary (std ≈ 0.45). Without normalization, `demand` would dominate any distance-based clustering.

### Fit the scaler
We apply standard scaling so that all features contribute equally to distance-based clustering.

$$x_{\text{scaled}} = \frac{x - \operatorname{mean}(x)}{\operatorname{std}(x)}$$

In [12]:
from sklearn.preprocessing import StandardScaler

# Learn each column's mean and standard deviation first, then apply them.
# The fitted `scaler` stays available so the same transform can be reused
# or inverted later.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [15]:
# Spot-check the first scaled row. The scaler returned a NumPy array, so it
# is indexed by position rather than by label.
X_scaled[0]

array([-0.51060548,  2.80433372,  2.57368633, -0.2688393 , -1.03270344,
       -0.63153158])

The scaled feature matrix is converted back to a DataFrame to preserve the feature names and the original row index.

In [16]:
# Re-attach the column labels and the original row index to the scaled array,
# so rows can still be matched back to `df`.
X_scaled_df = pd.DataFrame(data=X_scaled, index=df.index, columns=features)

X_scaled_df.head()

Unnamed: 0,demand,avg_fare,avg_distance,hour,day_of_week,is_weekend
0,-0.510605,2.804334,2.573686,-0.268839,-1.032703,-0.631532
1,-0.517717,7.691289,3.091599,0.03688,-1.032703,-0.631532
2,-0.517717,5.769639,7.834817,0.495459,-1.032703,-0.631532
3,-0.517717,0.73359,-0.799221,-1.491717,-0.526064,-0.631532
4,-0.517717,-0.260367,1.656549,-1.338857,-0.526064,-0.631532


In [17]:
# Sanity check: every column mean should be ~0 after standard scaling
# (values around 1e-17 are floating-point noise).
X_scaled_df.mean()

demand          1.225705e-17
avg_fare       -8.886363e-17
avg_distance   -8.579936e-17
hour           -3.523902e-17
day_of_week    -8.685270e-17
is_weekend      3.466447e-17
dtype: float64

In [18]:
# Sanity check: every column std should be ~1.
# pandas' `.std()` defaults to the sample estimator (ddof=1), whereas
# StandardScaler divides by the population std (ddof=0). The result is therefore
# sqrt(n / (n - 1)) ≈ 1.000002 rather than exactly 1;
# `X_scaled_df.std(ddof=0)` would give 1.0.
X_scaled_df.std()

demand          1.000002
avg_fare        1.000002
avg_distance    1.000002
hour            1.000002
day_of_week     1.000002
is_weekend      1.000002
dtype: float64

### Save the normalized features
The normalized feature matrix is saved and used as input for all clustering experiments.

In [19]:
# Persist the scaled features as parquet in the same directory as the input
# snapshot.
# NOTE(review): only the scaled feature columns are written (no `zone_id` or
# `pickup_hour_ts`), so rows link back to `df` through the index alone. The
# fitted `scaler` is not saved in the visible cells — confirm whether it is
# persisted elsewhere if the transform must be reused or inverted.
X_scaled_df.to_parquet(
    "../../data/processed/clustering_features_scaled.parquet"
)