In [1]:
## Build a baseline model without ML to establish a reference performance level
## Error metric: MAE (Mean Absolute Error)

In [2]:
# Ways to improve the model:

# 1. Increase the training set size
# 2. Add more features to the training data
# 3. Try another algorithm (K-nearest neighbors, Random Forest, Logistic Regression, XGBoost, etc.) - best models for tabular datasets: XGBoost, LightGBM, CatBoost
# 4. Hyperparameter tuning (max_depth, learning_rate, n_estimators, gamma, etc.)

In [3]:
import pandas as pd
from taxi_demand_predictor.paths import TRANSFORMED_DATA_DIR

# Load the tabular dataset from the project's transformed-data directory.
df = pd.read_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet")
# Bare last expression: the notebook renders the frame as the cell output.
df

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,...,2.0,0.0,1.0,0.0,0.0,0.0,0.0,2022-01-29,1,0.0
1,0.0,0.0,0.0,0.0,0.0,4.0,1.0,2.0,1.0,2.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2022-01-30,1,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,1.0,2.0,0.0,0.0,0.0,0.0,2022-01-31,1,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,2.0,1.0,0.0,1.0,1.0,0.0,0.0,2022-02-01,1,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-02-02,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73135,3.0,4.0,1.0,1.0,1.0,0.0,1.0,2.0,2.0,3.0,...,29.0,15.0,4.0,12.0,7.0,11.0,4.0,2022-10-27,265,12.0
73136,9.0,4.0,3.0,3.0,2.0,4.0,2.0,6.0,1.0,3.0,...,29.0,17.0,12.0,4.0,5.0,8.0,9.0,2022-10-28,265,3.0
73137,7.0,6.0,3.0,4.0,3.0,5.0,7.0,6.0,5.0,10.0,...,29.0,13.0,9.0,10.0,5.0,10.0,7.0,2022-10-29,265,6.0
73138,6.0,5.0,8.0,6.0,6.0,0.0,1.0,2.0,8.0,6.0,...,8.0,10.0,7.0,3.0,3.0,6.0,2.0,2022-10-30,265,10.0


In [4]:
from taxi_demand_predictor.data_split import train_test_split
from datetime import datetime

# Split df into train/test features and targets with the project's own
# train_test_split helper (imported from taxi_demand_predictor.data_split,
# not scikit-learn's); the target column is 'target_rides_next_hour'.
# NOTE(review): presumably rows before cutoff_date go to train and the rest
# to test -- confirm against data_split.
X_train, y_train, X_test, y_test = train_test_split(df, cutoff_date=datetime(2022,6, 1, 0, 0, 0), target_column_name='target_rides_next_hour')

# Report the shape of each split; the `=` f-string specifier echoes the
# expression text in front of the value.
print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(32595, 674)
y_train.shape=(32595,)
X_test.shape=(40545, 674)
y_test.shape=(40545,)


In [5]:
import numpy as np

class BaselineModelPreviousHour:
    """
    Prediction = actual demand observed 1 hour ago

    A no-ML baseline: the forecast for the next hour is simply the value of
    the 'rides_previous_1_hour' feature column.
    """
    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        """
        No-op: this baseline has nothing to learn.

        The method exists only so the class offers the same fit/predict
        surface as the other baseline models in this notebook.
        """
        pass

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        """
        Return the 'rides_previous_1_hour' column of X_test as the prediction.

        Args:
            X_test: feature frame containing a 'rides_previous_1_hour' column.

        Returns:
            The selected column as a pandas Series (same index as X_test).

        Raises:
            KeyError: if X_test has no 'rides_previous_1_hour' column.
        """
        return X_test['rides_previous_1_hour']

In [6]:
# fit() is a no-op for this baseline, so predict() is called directly.
model = BaselineModelPreviousHour()
predictions = model.predict(X_test)
# Bare last expression: display the predictions as the cell output.
predictions

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
40540    4.0
40541    9.0
40542    7.0
40543    2.0
40544    4.0
Name: rides_previous_1_hour, Length: 40545, dtype: float32

In [7]:
## How good are these predictions?
from sklearn.metrics import mean_absolute_error

# Score the previous-hour baseline on the test split with mean absolute error.
test_mae = mean_absolute_error(y_test, predictions)
# `=:.4f` prints the expression text followed by the value with 4 decimals.
print(f'{test_mae=:.4f}')

test_mae=5.9674


In [8]:
# Predict the next hour using seasonal data
import pandas as pd
import numpy as np

class BaselineModelPreviousWeek:
    """
    Predict the next hour using the same hour of the previous week
    """
    # Lag, in hours, of the feature column used as the prediction (one week).
    LAG_HOURS = 7 * 24

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        """
        Fit the model

        No-op: this baseline has nothing to learn.
        """
        pass

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        """
        Return the 'rides_previous_168_hour' column of X_test as the prediction.

        Args:
            X_test: feature frame containing a 'rides_previous_168_hour' column.

        Returns:
            The selected column as a pandas Series (same index as X_test).

        Raises:
            KeyError: if the lagged column is missing from X_test.
        """
        return X_test[f'rides_previous_{self.LAG_HOURS}_hour']

In [9]:
# Re-bind `model` and `predictions` to the previous-week baseline; this
# overwrites the previous-hour results held under the same names.
model = BaselineModelPreviousWeek()
predictions = model.predict(X_test)

In [10]:
from sklearn.metrics import mean_absolute_error

# Score the previous-week baseline on the test split with mean absolute error.
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=3.3355


In [11]:
# What about taking an average over the last month?

import pandas as pd
import numpy as np

class BaselineModelLast4Weeks:
    """
    Predict the next hour as the average of the same hour over the previous 4 weeks
    """
    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        """
        Nothing to learn; kept so the class exposes a fit/predict pair
        """
        return None

    def predict(self, X_test: pd.DataFrame) -> pd.Series:
        """
        Average the columns lagged by 1, 2, 3 and 4 weeks (168, 336, 504, 672 hours)
        """
        hours_per_week = 7 * 24
        # Start from the 1-week lag and add the older lags one at a time,
        # keeping the same left-to-right order of additions.
        running_total = X_test[f'rides_previous_{hours_per_week}_hour']
        for weeks_back in (2, 3, 4):
            lag_hours = weeks_back * hours_per_week
            running_total = running_total + X_test[f'rides_previous_{lag_hours}_hour']
        return 0.25 * running_total

In [12]:
# Evaluate the 4-week-average baseline; `model`, `predictions` and `test_mae`
# overwrite the values left by the earlier baseline cells.
model = BaselineModelLast4Weeks()
predictions = model.predict(X_test)

# mean_absolute_error comes from the sklearn.metrics import in an earlier cell.
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=2.8517


In [13]:
# Final MAE ≈ 2.85 (best baseline: average of the same hour over the previous 4 weeks)