In [1]:
## Setup
from notebook_utils import resolve_paths_from_parent_directory
# auto reload notebook deps
%reload_ext autoreload
%autoreload 2
resolve_paths_from_parent_directory()

In [7]:
import pandas as pd

def custom_feature_set_without_trip_id(dataset):
    train, test = dataset
    train = pd.get_dummies(train, columns=['route'])

    # compute average passenger_count by next_stop_id (using training only so as not to bake in information about the test set)
    train_stop_stats = train[
        ['next_stop_id', 'passenger_count']
    ].groupby('next_stop_id').agg({'passenger_count':['mean', 'std']})
    train['avg_stop_passengers'] = train['next_stop_id'].apply(lambda x: train_stop_stats[('passenger_count', 'mean')].loc[x])
    test['avg_stop_passengers'] = test['next_stop_id'].apply(lambda x: train_stop_stats[('passenger_count', 'mean')].loc[x])

    test = pd.get_dummies(test, columns=['route'])

    non_features =  ['service_date', 'vehicle_id', 'trip_id', 'timestamp', 'prior_stop_id', 'next_stop_id']

    return train.drop(columns=non_features), test.drop(columns=non_features)

In [8]:
from data_loader import load_global_feature_set
from run_experiment import run_experiment
from feature_sets import feature_set_without_trip_id
from xgboost import XGBRegressor
from sklearn.linear_model import LassoCV

data_dir = "../../data"
route_str = "B46"

## Prepare globlal feature set
df_route, stop_dict = load_global_feature_set(data_dir, route_str)

for model in [LassoCV(), XGBRegressor()]:
    print(f"Running experiment for {model}...")
    experiment_eval = run_experiment(
        df_route,
        custom_feature_set_without_trip_id,
        model,
        stop_dict,
        test_period="24H"
    )

    print("Train evaluation")
    print(experiment_eval.basic_eval('train'))

    print("Test evaluation")
    print(experiment_eval.basic_eval('test'))


Running experiment for LassoCV()...
Selecting features...
fitting on train data until 2021-09-27 00:00:00-04:00: 220,788 rows
testing from 2021-09-27 00:00:00-04:00 to 2021-09-28 00:00:00-04:00: 2,894 rows
Fitting model...
Inference..
Evaluation
Performance: Model Prediction
MAE: 5.8
ME : 84.3
R^2: 0.31


Performance: Mean Prediction
MAE: 7.2
ME : 79.5
R^2: 0.00
None
Running experiment for XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree

Feature names must be in the same order as they were in fit.



Fitting model...
Inference..
Evaluation
Performance: Model Prediction
MAE: 5.0
ME : 74.3
R^2: 0.47


Performance: Mean Prediction
MAE: 7.2
ME : 79.5
R^2: 0.00
None
