# Feature Engineering—CITYWIDE DATA SET

In [1]:
import pandas as pd
import numpy as np
df = pd.read_feather(f'data/nyc_buses_with_passenger_counts_20210401_20210630_citywide_cleaned.feather')
df.shape

(4171122, 15)

## geographic transformations

#### as-is

#### use a distance variable

#### create clusters

In [2]:
# https://bmanikan.medium.com/feature-engineering-all-i-learned-about-geo-spatial-features-649871d16796

from sklearn.cluster import KMeans

def cluster(df):
  '''
  input: dataframe containing Latitude(x) and Longitude(y) coordinates
  output: series of cluster labels that each row of coordinates belongs to.
  '''
  model = KMeans(n_clusters=200)
  labels = model.fit_predict((df[['direction','lat', 'lon', 'next_stop_id_cleaned']]))
  return labels

df['cluster_id'] = cluster(df)

In [3]:
# df.plot(x="lat", y="lon", kind="scatter", c="cluster_id")

#### transformation of coordinates

In [4]:
# Option A (using UTM)
# https://gis.stackexchange.com/questions/212723/how-can-i-convert-lon-lat-coordinates-to-x-y/352085
import utm

def utm_transform(s):
    x, y, a, b = utm.from_latlon(s['lat'],s['lon'])
    return pd.Series([x,y],index=['utm_x','utm_y'])

df[['utm_x', 'utm_y']] = df.apply(utm_transform, axis=1, result_type="expand")

In [5]:
# # Option B (projected)
# # https://gis.stackexchange.com/questions/212723/how-can-i-convert-lon-lat-coordinates-to-x-y/352085

# from pyproj import Transformer
# transformer = Transformer.from_crs("epsg:4326", "epsg:2236")

# def pyproj_transform(s):
#     x, y = transformer.transform(s['lon'], s['lat'])
#     return pd.Series([x,y],index=['pyproj_x','pyproj_y'])

# df[['pyproj_x', 'pyproj_y']] = df.apply(pyproj_transform, axis=1, result_type="expand")

In [6]:
# TO DO?
# # https://bmanikan.medium.com/feature-engineering-all-i-learned-about-geo-spatial-features-649871d16796
# #  By rotating them, they would provide more spatial information for the Tree type models. which are extremely beneficial when compared to normal x-y coordinates. They help to visualize coordinates in different perception (viewing angle) and put some insights on the data that the model can learn from it. we can also do this rotation by Principal Component Analysis(PCA) which can give more options in our bucket. Will see it in a bit!

# def rotation(df):
#   '''
#   # most frequently used degrees are 30,45,60
#   input: dataframe containing Latitude(x) and Longitude(y)
#   '''
#   rot_45_x = (0.707 * df['lat']) + (0.707 * df['lon'])
#   rot_45_y = (0.707 * df['lon']) + (0.707 * df['lat'])
#   rot_30_x = (0.866 * df['lat']) + (0.5 * df['lon'])
#   rot_30_y = (0.866 * df['lon']) + (0.5 * df['lat'])
#   return rot_45_x, rot_45_y, rot_30_x, rot_30_y

# #TODO check that it was correct to swap x=lat and y=lon

# df[['rot_45_x', 'rot_45_y', 'rot_30_x', 'rot_30_y'] = 

#### TODO Combine route_id and direction into a single feature (for when we do citywide model)

## time series transformations

#### Month, week, and day of month

In [7]:
import datetime as dt

df['month'] = df.timestamp.dt.month
df['week'] = df.timestamp.dt.isocalendar().week.astype(int)
df['day'] = df.timestamp.dt.day

#### Day of week flag

In [8]:
df['service_date'] = pd.to_datetime(df['service_date'])
df['day_of_week'] = df['service_date'].dt.weekday
df['is_weekend'] = df['day_of_week'].isin([5, 6])

In [9]:
# # one-hot encode day of week
# df = pd.concat((df, pd.get_dummies(df['service_date'].dt.day_name())), axis=1)

#### Convert day of week to cyclical

In [10]:
df['day_of_week_sin'] = np.sin(df['day_of_week'] * (2 * np.pi / 7))
df['day_of_week_cos'] = np.cos(df['day_of_week'] * (2 * np.pi / 7))

#### Convert hour to cyclical

In [11]:
# cyclical time for hour

import math

# normalize
df["hour_norm"] = 2 * math.pi * df["hour"] / df["hour"].max()

# calc cyclical features
df["hour_cos"] = np.cos(df["hour_norm"])
df["hour_sin"] = np.sin(df["hour_norm"])

In [12]:
## Final Cleaning

#### drop columns

In [13]:
df = df.drop(['next_stop_id', 'timestamp','service_date', 'minute'], axis=1)

In [14]:
#### reorder columns

order = ['route_short',
         'vehicle_id',
         'month',
         'week',
         'day','hour',
         'day_of_week',
         'is_weekend',
         'day_of_week_sin',
         'day_of_week_cos',
         'hour_norm',
         'hour_cos',
         'hour_sin',
         'cluster_id',
         'lat',
         'lon',
         'direction',
         'next_stop_id_cleaned',
         'next_stop_d',
         'next_stop_d_along_route',
         'passenger_count',]

df = df.loc[:, order]

In [15]:
df.sample(n=25)

Unnamed: 0,route_short,vehicle_id,month,week,day,hour,day_of_week,is_weekend,day_of_week_sin,day_of_week_cos,...,hour_cos,hour_sin,cluster_id,lat,lon,direction,next_stop_id_cleaned,next_stop_d,next_stop_d_along_route,passenger_count
2369612,Q84,8415,5,21,30,18,6,True,-0.781831,0.62349,...,0.203456,-0.9790841,115,40.6864,-73.73,0,500477,0.0,8049.93,1.0
3442747,B17,7785,5,21,29,11,5,True,-0.974928,-0.222521,...,-0.990686,0.1361666,62,40.6313,-73.8935,1,301799,121.66,5695.88,3.0
2880917,S62,8585,5,20,17,16,0,False,0.0,1.0,...,-0.33488,-0.9422609,61,40.631,-74.0882,0,200551,205.26,11472.3,20.0
518813,B62,7216,5,21,29,20,5,True,-0.974928,-0.222521,...,0.682553,-0.730836,114,40.7304,-73.9543,1,305170,64.24,2966.1,2.0
2563293,Q30,8418,6,24,15,11,1,False,0.781831,0.62349,...,-0.990686,0.1361666,177,40.7256,-73.7917,0,504424,0.0,4092.68,2.0
4142154,B3,4857,5,21,29,19,5,True,-0.974928,-0.222521,...,0.460065,-0.8878852,70,40.6118,-73.9194,1,306209,127.21,1442.94,2.0
2759320,Q59,7225,6,22,6,20,6,True,-0.781831,0.62349,...,0.682553,-0.730836,165,40.7141,-73.9527,0,308272,471.81,3105.91,6.0
1148262,B36,7553,6,25,23,8,2,False,0.974928,-0.222521,...,-0.57668,0.8169699,157,40.5882,-73.9522,1,801092,206.06,2442.0,30.0
1071538,Bx3,8550,6,22,5,6,5,True,-0.974928,-0.222521,...,-0.068242,0.9976688,192,40.854,-73.9121,1,103532,244.85,4488.69,20.0
1305653,B52,7258,6,24,18,14,4,False,-0.433884,-0.900969,...,-0.775711,-0.6310879,181,40.689,-73.9812,1,302431,159.76,7193.45,0.0


In [16]:
df.columns

Index(['route_short', 'vehicle_id', 'month', 'week', 'day', 'hour',
       'day_of_week', 'is_weekend', 'day_of_week_sin', 'day_of_week_cos',
       'hour_norm', 'hour_cos', 'hour_sin', 'cluster_id', 'lat', 'lon',
       'direction', 'next_stop_id_cleaned', 'next_stop_d',
       'next_stop_d_along_route', 'passenger_count'],
      dtype='object')

In [17]:
df.dtypes

route_short                 object
vehicle_id                  object
month                        int64
week                         int64
day                          int64
hour                         int64
day_of_week                  int64
is_weekend                    bool
day_of_week_sin            float64
day_of_week_cos            float64
hour_norm                  float64
hour_cos                   float64
hour_sin                   float64
cluster_id                   int32
lat                        float64
lon                        float64
direction                    int64
next_stop_id_cleaned        object
next_stop_d                float64
next_stop_d_along_route    float64
passenger_count            float64
dtype: object

In [18]:
df['next_stop_id_cleaned'] = df['next_stop_id_cleaned'].astype(int)

### TODO future

# dump an engineered table

In [19]:
df.to_feather(f'data/nyc_buses_with_passenger_counts_20210401_20210630_citywide_features.feather')