# Feature Engineering—CITYWIDE DATA SET

In [1]:
import pandas as pd
import numpy as np
# Load the cleaned citywide table from Feather, then show its (rows, columns) shape.
df = pd.read_feather(
    "data/nyc_buses_with_passenger_counts_20210401_20210630_citywide_cleaned.feather"
)
df.shape

(4171122, 15)

## geographic transformations

#### as-is

#### use a distance variable

#### create clusters

In [None]:
# https://bmanikan.medium.com/feature-engineering-all-i-learned-about-geo-spatial-features-649871d16796

from sklearn.cluster import KMeans

def cluster(
    df,
    n_clusters=200,
    random_state=None,
    columns=('direction', 'lat', 'lon', 'next_stop_id_cleaned'),
):
    """Assign every row of ``df`` to a K-Means cluster.

    Args:
        df: DataFrame that contains every column named in ``columns``.
        n_clusters: Number of clusters to fit (default 200).
        random_state: Seed forwarded to ``KMeans``. Leave as ``None`` for the
            estimator's default (unseeded) initialisation; pass an int to get
            the same labels on every run.
        columns: Feature columns the model is fitted on. The default uses
            direction and next-stop id in addition to the coordinates.

    Returns:
        Array of integer cluster labels, one per row of ``df`` (the value
        returned by ``KMeans.fit_predict``).
    """
    # NOTE(review): the default features are fed to K-Means unscaled, so the
    # column with the largest numeric range will dominate the distance --
    # confirm this is intended before relying on cluster_id as a geographic
    # feature.
    model = KMeans(n_clusters=n_clusters, random_state=random_state)
    return model.fit_predict(df[list(columns)])


df['cluster_id'] = cluster(df)

In [None]:
# df.plot(x="lat", y="lon", kind="scatter", c="cluster_id")

#### transformation of coordinates

In [None]:
# Option A (using UTM)
# https://gis.stackexchange.com/questions/212723/how-can-i-convert-lon-lat-coordinates-to-x-y/352085
import utm

def utm_transform(s):
    """Convert one row's lat/lon to UTM easting/northing.

    Args:
        s: Mapping-like row (e.g. a DataFrame row) with 'lat' and 'lon' entries.

    Returns:
        pd.Series indexed by 'utm_x' (easting) and 'utm_y' (northing). The UTM
        zone number and letter returned by ``utm.from_latlon`` are discarded.
    """
    easting, northing, _zone_number, _zone_letter = utm.from_latlon(s['lat'], s['lon'])
    return pd.Series([easting, northing], index=['utm_x', 'utm_y'])


# Convert all rows in one call instead of df.apply(utm_transform, axis=1):
# the row-wise apply builds a pd.Series per row, which is very slow on a
# multi-million-row table. utm.from_latlon accepts NumPy arrays and returns
# easting/northing arrays of the same shape.
# NOTE(review): with array input the UTM zone is taken from the first point and
# applied to every point, so this matches the row-wise result only while all
# coordinates fall in a single zone -- presumably true for a citywide data set;
# confirm.
df['utm_x'], df['utm_y'] = utm.from_latlon(
    df['lat'].to_numpy(), df['lon'].to_numpy()
)[:2]

In [None]:
# # Option B (projected)
# # https://gis.stackexchange.com/questions/212723/how-can-i-convert-lon-lat-coordinates-to-x-y/352085

# from pyproj import Transformer
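# transformer = Transformer.from_crs("epsg:4326", "epsg:2263", always_xy=True)  # EPSG:2263 = NAD83 / New York Long Island (ftUS); always_xy=True so transform() takes (lon, lat)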

# def pyproj_transform(s):
#     x, y = transformer.transform(s['lon'], s['lat'])
#     return pd.Series([x,y],index=['pyproj_x','pyproj_y'])

# df[['pyproj_x', 'pyproj_y']] = df.apply(pyproj_transform, axis=1, result_type="expand")

In [None]:
# TO DO?
# # https://bmanikan.medium.com/feature-engineering-all-i-learned-about-geo-spatial-features-649871d16796
# #  Rotating the coordinates gives tree-based models more spatial information than plain x-y coordinates do. A rotation lets the model view the coordinates from a different angle and can expose structure in the data that it can learn from. The rotation can also be done with Principal Component Analysis (PCA), which gives us more options (the linked article covers this further on).

# def rotation(df):
#   '''
#   # most frequently used degrees are 30,45,60
#   input: dataframe containing Latitude(x) and Longitude(y)
#   '''
#   rot_45_x = (0.707 * df['lat']) + (0.707 * df['lon'])
#   rot_45_y = (0.707 * df['lon']) - (0.707 * df['lat'])
#   rot_30_x = (0.866 * df['lat']) + (0.5 * df['lon'])
#   rot_30_y = (0.866 * df['lon']) - (0.5 * df['lat'])
#   return rot_45_x, rot_45_y, rot_30_x, rot_30_y

# #TODO check that it was correct to swap x=lat and y=lon

# df['rot_45_x'], df['rot_45_y'], df['rot_30_x'], df['rot_30_y'] = rotation(df)

#### TODO Combine route_id and direction into a single feature (for when we do citywide model)

## time series transformations

#### Month, week, and day of month

In [None]:
import datetime as dt

# Calendar parts derived from the observation timestamp.
timestamps = df['timestamp']
df['month'] = timestamps.dt.month
# ISO week number, cast to a plain int column.
df['week'] = timestamps.dt.isocalendar().week.astype(int)
df['day'] = timestamps.dt.day

#### Day of week flag

In [None]:
# Parse service_date once, store it back, and derive the weekday features from it.
service_dates = pd.to_datetime(df['service_date'])
df['service_date'] = service_dates
df['day_of_week'] = service_dates.dt.weekday
# pandas numbers weekdays Monday=0 ... Sunday=6, so 5 and 6 are Saturday and Sunday.
df['is_weekend'] = df['day_of_week'].isin([5, 6])

In [None]:
# # one-hot encode day of week
# df = pd.concat((df, pd.get_dummies(df['service_date'].dt.day_name())), axis=1)

#### Convert day of week to cyclical

In [None]:
# Place each weekday on a circle with a period of 7 days, then store the
# sine/cosine coordinates so that day 6 and day 0 end up adjacent.
weekday_angle = df['day_of_week'] * (2 * np.pi / 7)
df['day_of_week_sin'] = np.sin(weekday_angle)
df['day_of_week_cos'] = np.cos(weekday_angle)

#### Convert hour to cyclical

In [None]:
# cyclical time for hour

import math

# normalize: map the hour of day onto an angle in [0, 2*pi).
# Divide by the 24-hour period, not by the observed maximum. With
# df["hour"].max() (23 when the data covers a full day) hour 0 maps to angle 0
# and hour 23 to 2*pi, so both get identical sin/cos values and cannot be told
# apart; the scale would also change with whichever hours the data contains.
# Assumes "hour" holds the hour of day in the range 0-23.
df["hour_norm"] = 2 * math.pi * df["hour"] / 24

# calc cyclical features
df["hour_cos"] = np.cos(df["hour_norm"])
df["hour_sin"] = np.sin(df["hour_norm"])

In [None]:
## Final Cleaning

#### drop columns

In [None]:
# Remove the raw columns that are no longer needed once the derived features exist.
df = df.drop(columns=['next_stop_id', 'timestamp', 'service_date', 'minute'])

In [None]:
#### reorder columns

# Column layout of the engineered table. Selecting with this list both
# reorders the columns and drops every column that is not named in it.
# NOTE(review): 'utm_x' and 'utm_y' are created earlier in this notebook but
# are not listed here, so they are discarded at this step -- confirm that is
# intended.
order = [
    'route_short',
    'vehicle_id',
    'month',
    'week',
    'day',
    'hour',
    'day_of_week',
    'is_weekend',
    'day_of_week_sin',
    'day_of_week_cos',
    'hour_norm',
    'hour_cos',
    'hour_sin',
    'cluster_id',
    'lat',
    'lon',
    'direction',
    'next_stop_id_cleaned',
    'next_stop_d',
    'next_stop_d_along_route',
    'passenger_count',
]

df = df[order]

In [None]:
# Display 25 randomly sampled rows as a spot check of the table.
df.sample(n=25)

In [None]:
# Show the DataFrame's column labels.
df.columns

In [None]:
# Show the dtype of every column.
df.dtypes

In [None]:
# Cast the cleaned stop id column to int.
# NOTE(review): astype(int) raises if the column holds NaN or values that
# cannot be parsed as integers -- presumably the upstream cleaning rules both
# out; confirm.
df['next_stop_id_cleaned'] = df['next_stop_id_cleaned'].astype(int)

### TODO future

# dump an engineered table

In [None]:
# Persist the engineered table to Feather.
df.to_feather(
    "data/nyc_buses_with_passenger_counts_20210401_20210630_citywide_features.feather"
)