In [1]:
from datetime import datetime
import requests
import pandas as pd
import lightgbm as lgb

from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error


# set current working directory to project root
# NOTE(review): os.chdir('..') is relative to the *current* working directory, so
# running this cell a second time in the same kernel moves one more level up and
# the `src` imports below may then fail -- restart the kernel before re-running.
import os
os.chdir('..')
# project-local helpers; these imports are placed after the chdir above
# (presumably `src` is resolved relative to the project root -- confirm)
from src.paths import TRANSFORMED_DATA_DIR, RAW_DATA_DIR
from src.data_split import train_test_split
from src.data import download_and_load_nyc_taxi_zone_data

In [2]:
# load dataset
# Reads the tabular dataset stored as parquet in TRANSFORMED_DATA_DIR. Per the
# rendered output below, each row holds a pickup_hour, a pickup_location_id,
# the lag columns rides_previous_672_hour ... rides_previous_1_hour and the
# target column target_rides_next_hour.
df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tabular_data.parquet')
# bare last expression: the notebook renders the dataframe as the cell output
df

Unnamed: 0,pickup_hour,pickup_location_id,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,...,rides_previous_9_hour,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,target_rides_next_hour
0,2022-01-29,4,11,15.0,26.0,8.0,9.0,7.0,3.0,1.0,...,10.0,4.0,11.0,7.0,4.0,3.0,4.0,9.0,19.0,17.0
1,2022-01-30,4,1,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,3.0,5.0,5.0,4.0,10.0,7.0,5.0,9.0,10.0,9.0
2,2022-01-31,4,0,1.0,0.0,0.0,1.0,1.0,1.0,3.0,...,13.0,6.0,8.0,7.0,8.0,5.0,5.0,10.0,0.0,3.0
3,2022-02-01,4,1,1.0,0.0,0.0,0.0,3.0,2.0,3.0,...,3.0,6.0,3.0,16.0,7.0,1.0,0.0,1.0,3.0,3.0
4,2022-02-02,4,0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,...,2.0,5.0,3.0,8.0,3.0,0.0,4.0,4.0,3.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80167,2022-11-26,199,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
80168,2022-11-27,199,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
80169,2022-11-28,199,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
80170,2022-11-29,199,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# train test split
# Uses the project's own train_test_split (imported from src.data_split, not
# sklearn's) with a cutoff of 2022-06-01 00:00 and the name of the target column.
# NOTE(review): presumably rows before the cutoff form the training set and the
# remaining rows the test set -- confirm against src.data_split.
X_train, y_train, X_test, y_test = train_test_split(df, datetime(2022, 6, 1, 0, 0, 0), 'target_rides_next_hour')

# report the shape of every split as a quick sanity check
print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(32226, 674)
y_train.shape=(32226,)
X_test.shape=(47946, 674)
y_test.shape=(47946,)


In [4]:
# feature: mean of the ride counts observed exactly 1, 2, 3 and 4 weeks back
def average_rides_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame:
    '''Returns a copy of X with an extra ``average_rides_last_4_weeks`` column.

    The new column is the row-wise mean of the four weekly lag columns
    - rides_previous_168_hour (7 days back)
    - rides_previous_336_hour (14 days back)
    - rides_previous_504_hour (21 days back)
    - rides_previous_672_hour (28 days back)

    The input dataframe itself is not modified.
    '''

    hours_per_week = 7 * 24
    weekly_lag_columns = [
        f'rides_previous_{n_weeks * hours_per_week}_hour'
        for n_weeks in (1, 2, 3, 4)
    ]
    weekly_mean = X[weekly_lag_columns].mean(axis=1)

    # assign() works on a copy, so the caller's dataframe stays untouched
    return X.assign(average_rides_last_4_weeks=weekly_mean)

# wrap the feature function so it can be used as a step of an sklearn pipeline
add_feature_average_rides_last_4_weeks = FunctionTransformer(
    func=average_rides_last_4_weeks,
    validate=False,
)

In [5]:
def ex_tract_temporal_features(X: pd.DataFrame) -> pd.DataFrame:
    '''Replaces the ``pickup_hour`` datetime column with calendar features.

    Returns a copy of X in which ``pickup_hour`` is dropped and the columns
    ``hour``, ``day``, ``month``, ``weekday`` (Monday=0 ... Sunday=6) and
    ``weekend`` (1 for Saturday/Sunday, otherwise 0) are appended.
    The input dataframe itself is not modified.
    '''

    timestamps = X['pickup_hour'].dt
    day_of_week = timestamps.weekday

    with_calendar_features = X.assign(
        hour=timestamps.hour,
        day=timestamps.day,
        month=timestamps.month,
        weekday=day_of_week,
        # weekday 5 and 6 are Saturday and Sunday
        weekend=day_of_week.isin([5, 6]).astype(int),
    )

    # the raw timestamp is not passed on once the features are derived
    return with_calendar_features.drop(columns='pickup_hour')

# wrap the temporal feature function so it can be used as a step of an sklearn pipeline
add_temporal_features = FunctionTransformer(
    func=ex_tract_temporal_features,
    validate=False,
)

In [6]:
# download and load latitude and longitude data
# Per the rendered output below, the returned table has one row per taxi zone
# with the columns OBJECTID, Shape_Leng, the_geom (a WKT MULTIPOLYGON string),
# Shape_Area, zone, LocationID and borough.
nyc_zone_data = download_and_load_nyc_taxi_zone_data()
# bare last expression: the notebook renders the dataframe as the cell output
nyc_zone_data

Unnamed: 0,OBJECTID,Shape_Leng,the_geom,Shape_Area,zone,LocationID,borough
0,1,0.116357,MULTIPOLYGON (((-74.18445299999996 40.69499599...,0.000782,Newark Airport,1,EWR
1,2,0.433470,MULTIPOLYGON (((-73.82337597260663 40.63898704...,0.004866,Jamaica Bay,2,Queens
2,3,0.084341,MULTIPOLYGON (((-73.84792614099985 40.87134223...,0.000314,Allerton/Pelham Gardens,3,Bronx
3,4,0.043567,MULTIPOLYGON (((-73.97177410965318 40.72582128...,0.000112,Alphabet City,4,Manhattan
4,5,0.092146,MULTIPOLYGON (((-74.17421738099989 40.56256808...,0.000498,Arden Heights,5,Staten Island
...,...,...,...,...,...,...,...
258,256,0.067915,MULTIPOLYGON (((-73.95834207500002 40.71330630...,0.000169,Williamsburg (South Side),256,Brooklyn
259,259,0.126750,MULTIPOLYGON (((-73.85107116191898 40.91037152...,0.000395,Woodlawn/Wakefield,259,Bronx
260,260,0.133514,MULTIPOLYGON (((-73.90175373399988 40.76077547...,0.000422,Woodside,260,Queens
261,261,0.027120,MULTIPOLYGON (((-74.01332610899988 40.70503078...,0.000034,World Trade Center,261,Manhattan


In [7]:
# function to extract latitude & longitude from new york city location id data

# matches one signed decimal number (optionally in scientific notation) inside a WKT string
_WKT_NUMBER_PATTERN = r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?'


def _mean_lon_lat(numbers) -> tuple:
    '''Returns (mean longitude, mean latitude) of a flat [x0, y0, x1, y1, ...] list of number strings.'''
    # missing geometry (NaN instead of a list) or a geometry without vertices
    if not isinstance(numbers, list):
        return (float('nan'), float('nan'))
    values = [float(number) for number in numbers]
    vertices = list(zip(values[::2], values[1::2]))
    if not vertices:
        return (float('nan'), float('nan'))
    mean_lon = sum(lon for lon, _ in vertices) / len(vertices)
    mean_lat = sum(lat for _, lat in vertices) / len(vertices)
    return (mean_lon, mean_lat)


def extract_lat_lon(df: pd.DataFrame) -> pd.DataFrame:
    '''Extracts an approximate zone center from the WKT ``the_geom`` column.

    The center is the plain average of all polygon vertices (not an
    area-weighted centroid). WKT stores every vertex as "x y", i.e.
    longitude first and latitude second, so the first coordinate goes to
    ``longitude`` and the second one to ``latitude``.

    Parameters
    ----------
    df : dataframe with the columns the_geom, OBJECTID, Shape_Leng,
        Shape_Area, zone and borough (all dropped from the result);
        every other column is kept as is.

    Returns
    -------
    A copy of df without the columns listed above and with two new float
    columns ``latitude`` and ``longitude``. Rows whose geometry is missing
    or contains no coordinates get NaN in both columns. The input
    dataframe itself is not modified.

    Raises
    ------
    KeyError if one of the columns listed above is missing.
    '''

    df_ = df.copy()

    # Pull every number out of the WKT text instead of stripping parentheses:
    # this also handles polygons with interior rings ("), (" separators).
    coordinates = df_['the_geom'].str.findall(_WKT_NUMBER_PATTERN)

    # average of all vertices as the center of each zone
    centers = [_mean_lon_lat(numbers) for numbers in coordinates]

    df_['latitude'] = [lat for _, lat in centers]
    df_['longitude'] = [lon for lon, _ in centers]
    df_.drop(['the_geom', 'OBJECTID', 'Shape_Leng', 'Shape_Area', 'zone', 'borough'], axis=1, inplace=True)

    return df_

# Replace the raw zone table with the compact lookup returned by extract_lat_lon.
# Guarded so that re-running this cell is harmless: extract_lat_lon drops the
# 'the_geom' column, so a second call on the already converted table would
# raise a KeyError.
if 'the_geom' in nyc_zone_data.columns:
    nyc_zone_data = extract_lat_lon(nyc_zone_data)

In [8]:
# custom sklearn transformer that joins the zone coordinates onto the features
class add_latitude_and_longitude_features(BaseEstimator, TransformerMixin):
    """Adds pickup latitude and longitude features to the dataframe.

    Parameters
    ----------
    df_lat_lon : lookup table with a ``LocationID`` key column and the
        columns ``latitude`` and ``longitude``. Any further column of the
        lookup table is joined onto X as well.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    performs a left join of X (key ``pickup_location_id``) with the lookup
    table. Rows of X without a matching ``LocationID`` get NaN coordinates.
    """

    def __init__(self, df_lat_lon: pd.DataFrame):
        # stored unmodified, as sklearn expects from constructor parameters
        self.df_lat_lon = df_lat_lon

    def fit(self, X, y=None):
        """Does nothing; present to satisfy the sklearn transformer interface."""
        return self

    def transform(self, X, y=None):
        """Returns a copy of X with ``pickup_latitude`` and ``pickup_longitude`` appended.

        The result has exactly one row per row of X, in the same order and
        with the same index as X.
        """

        # Keep a single row per LocationID (the first one): duplicated keys in
        # the lookup table would otherwise multiply the matching rows of X and
        # break the row-wise alignment between the features and the target.
        lookup = self.df_lat_lon.drop_duplicates(subset='LocationID', keep='first')

        # merge latitude and longitude data with X (merge returns a new frame)
        X_ = X.merge(
            lookup,
            how='left',
            left_on='pickup_location_id',
            right_on='LocationID',
            validate='many_to_one',
        )

        # a column-on-column merge replaces the index with a fresh RangeIndex;
        # the left join keeps the row order of X, so its index can be restored
        X_.index = X.index

        # drop the join key of the lookup table
        X_ = X_.drop(columns='LocationID')

        # rename columns
        X_ = X_.rename(columns={'latitude': 'pickup_latitude', 'longitude': 'pickup_longitude'})

        return X_

# Create pipeline

In [13]:
# Chain the feature-engineering steps and the model; the steps run in this order:
pipeline = make_pipeline(
    # adds the average_rides_last_4_weeks column
    add_feature_average_rides_last_4_weeks,
    # replaces pickup_hour with hour / day / month / weekday / weekend columns
    add_temporal_features,
    # joins pickup_latitude / pickup_longitude on pickup_location_id
    add_latitude_and_longitude_features(nyc_zone_data),
    # gradient boosting regressor with LightGBM's default hyper-parameters
    lgb.LGBMRegressor()
)
# fit the whole pipeline on the training split; as the last expression of the
# cell its return value is rendered as the cell output
pipeline.fit(X_train, y_train)

In [14]:
# run the fitted pipeline (feature steps + model) on the held-out test features
predictions = pipeline.predict(X_test)


# mean absolute error between the true and the predicted target values
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=2.5597
