In [1]:
import pandas as pd

# Set the current working directory to the parent of the directory the kernel
# is currently in (expected to be the project root), presumably so that the
# project-local `src` package imported on the next line can be resolved.
# NOTE(review): this is not idempotent -- every re-run of this cell moves the
# working directory up one more level. Restart the kernel before re-running.
import os
os.chdir(os.path.dirname(os.getcwd()))
from src.paths import TRANSFORMED_DATA_DIR

# Load the tabular dataset; per the output below it holds a pickup hour, a
# pickup location id, 672 `rides_previous_<k>_hour` lag columns and the
# `target_rides_next_hour` target.
df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tabular_data.parquet')
df

Unnamed: 0,pickup_hour,pickup_location_id,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,...,rides_previous_9_hour,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,target_rides_next_hour
0,2022-01-29,4,11,15.0,26.0,8.0,9.0,7.0,3.0,1.0,...,10.0,4.0,11.0,7.0,4.0,3.0,4.0,9.0,19.0,17.0
1,2022-01-30,4,1,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,3.0,5.0,5.0,4.0,10.0,7.0,5.0,9.0,10.0,9.0
2,2022-01-31,4,0,1.0,0.0,0.0,1.0,1.0,1.0,3.0,...,13.0,6.0,8.0,7.0,8.0,5.0,5.0,10.0,0.0,3.0
3,2022-02-01,4,1,1.0,0.0,0.0,0.0,3.0,2.0,3.0,...,3.0,6.0,3.0,16.0,7.0,1.0,0.0,1.0,3.0,3.0
4,2022-02-02,4,0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,...,2.0,5.0,3.0,8.0,3.0,0.0,4.0,4.0,3.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80167,2022-11-26,199,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
80168,2022-11-27,199,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
80169,2022-11-28,199,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
80170,2022-11-29,199,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
from datetime import datetime
from src.data_split import train_test_split

# Cutoff timestamp and target column handed to the project's train_test_split.
# NOTE(review): the split rule lives in src.data_split -- presumably rows
# before the cutoff become the training set; confirm against that module.
cutoff_date = datetime(2022, 6, 1, 0, 0, 0)
target_column = 'target_rides_next_hour'

X_train, y_train, X_test, y_test = train_test_split(
    df,
    cutoff_date,
    target_column,
)

# Report the size of every piece of the split.
print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')


X_train.shape=(32226, 674)
y_train.shape=(32226,)
X_test.shape=(47946, 674)
y_test.shape=(47946,)


# Baseline Model 1
- Actual demand observed in the previous hour

In [3]:
import numpy as np

# LOCF baseline model
class BaselineModelLOCF:
    '''Last-observation-carried-forward (LOCF) baseline.

    Prediction = actual demand observed in the previous hour, i.e. the
    value stored in the `rides_previous_1_hour` feature column.
    '''

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        '''Do nothing: this baseline has no parameters to learn.

        The method only exists so the class offers a fit/predict pair.
        Both arguments are ignored.
        '''

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        '''Return the previous-hour demand of each row as its prediction.

        Args:
            X_test: feature frame with a `rides_previous_1_hour` column.

        Returns:
            The `rides_previous_1_hour` column of `X_test` as a pandas
            Series (same index as `X_test`).

        Raises:
            KeyError: if `X_test` has no `rides_previous_1_hour` column.
        '''
        # Plain string literal: the original f-string had no placeholder.
        return X_test['rides_previous_1_hour']

In [4]:
# Instantiate the LOCF baseline and predict on the test features. `fit` is
# not called because it is a no-op for this model.
model = BaselineModelLOCF()
predictions = model.predict(X_test)
# Bare last expression so the notebook displays the predicted Series.
predictions

0         0.0
1         5.0
2        13.0
3        12.0
4        14.0
         ... 
47941     0.0
47942     0.0
47943     0.0
47944     0.0
47945     0.0
Name: rides_previous_1_hour, Length: 47946, dtype: float64

In [5]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the LOCF predictions against the test targets
# (`y_test` comes from the `target_rides_next_hour` column).
test_mae = mean_absolute_error(y_test, predictions)
# Prints as `test_mae=<value>` with four decimals.
print(f'{test_mae=:.4f}')

test_mae=6.1138


# Baseline Model 2
- Actual demand observed at t-7 days

In [6]:
import pandas as pd
import numpy as np

# previous week baseline model
class BaselineModelPreviousWeek:
    '''Previous-week baseline.

    Prediction = actual demand observed at t-7 days, i.e. the value stored
    in the `rides_previous_168_hour` feature column (7 * 24 = 168 hours).
    '''

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        '''Do nothing: this baseline has no parameters to learn.

        The method only exists so the class offers a fit/predict pair.
        Both arguments are ignored.
        '''

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        '''Return the demand observed 7 days earlier as the prediction.

        Args:
            X_test: feature frame with a `rides_previous_168_hour` column.

        Returns:
            The `rides_previous_168_hour` column of `X_test` as a pandas
            Series (same index as `X_test`).

        Raises:
            KeyError: if `X_test` has no `rides_previous_168_hour` column.
        '''
        # Lag columns are indexed in hours, so one week back is 7 * 24.
        hours_per_week = 7 * 24
        return X_test[f'rides_previous_{hours_per_week}_hour']

In [7]:
# Instantiate the previous-week baseline and predict on the test features.
# NOTE: this rebinds the `model` and `predictions` names used by the earlier
# baseline, so the evaluation cell that follows scores this model.
model = BaselineModelPreviousWeek()
predictions = model.predict(X_test)
# Bare last expression so the notebook displays the predicted Series.
predictions

0         3.0
1         3.0
2        11.0
3        37.0
4        15.0
         ... 
47941     0.0
47942     0.0
47943     0.0
47944     0.0
47945     0.0
Name: rides_previous_168_hour, Length: 47946, dtype: float64

In [8]:
# Mean absolute error of the previous-week predictions against the test
# targets. `mean_absolute_error` was imported in an earlier cell.
test_mae = mean_absolute_error(y_test, predictions)
# Prints as `test_mae=<value>` with four decimals.
print(f'{test_mae=:.4f}')

test_mae=3.4420


# Baseline Model 3
- Average of the actual demand observed at t-7 days, t-14 days, t-21 days, and t-28 days

In [9]:
import pandas as pd
import numpy as np

# average of previous 4 weeks baseline model
class BaselineModelPrevious4Weeks:
    '''Average-of-previous-4-weeks baseline.

    Prediction = average of the actual demand observed at t-7 days,
    t-14 days, t-21 days and t-28 days, i.e. the mean of the
    `rides_previous_168_hour`, `rides_previous_336_hour`,
    `rides_previous_504_hour` and `rides_previous_672_hour` columns.
    '''

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        '''Do nothing: this baseline has no parameters to learn.

        The method only exists so the class offers a fit/predict pair.
        Both arguments are ignored.
        '''

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        '''Return the average demand of the same hour over the last 4 weeks.

        Args:
            X_test: feature frame with the `rides_previous_<k>_hour`
                columns for k = 168, 336, 504 and 672.

        Returns:
            A pandas Series (same index as `X_test`) holding, per row, the
            sum of the four lag columns divided by 4.

        Raises:
            KeyError: if any of the four lag columns is missing.
        '''
        # Lag columns are indexed in hours; n weeks back is n * 7 * 24.
        # The parentheses already allow line breaks, so no backslashes.
        return (
            X_test[f'rides_previous_{7*24}_hour']
            + X_test[f'rides_previous_{14*24}_hour']
            + X_test[f'rides_previous_{21*24}_hour']
            + X_test[f'rides_previous_{28*24}_hour']
        ) / 4

In [10]:
# Instantiate the 4-week-average baseline and predict on the test features.
# NOTE: this rebinds the `model` and `predictions` names used by the earlier
# baselines, so the evaluation cell that follows scores this model.
model = BaselineModelPrevious4Weeks()
predictions = model.predict(X_test)
# Bare last expression so the notebook displays the predicted Series.
predictions

0         2.25
1         6.50
2        13.25
3        29.50
4        24.75
         ...  
47941     0.00
47942     0.00
47943     0.00
47944     0.00
47945     0.00
Length: 47946, dtype: float64

In [11]:
# Mean absolute error of the 4-week-average predictions against the test
# targets. `mean_absolute_error` was imported in an earlier cell.
test_mae = mean_absolute_error(y_test, predictions)
# Prints as `test_mae=<value>` with four decimals.
print(f'{test_mae=:.4f}')

test_mae=3.0108
