External datapoints to leverage:
- Vacations
- Weather
- Lockdowns / curfews
- Fuel prices
- Car traffic

In [9]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import arrow
import dateutil.parser
import math

import warnings
warnings.filterwarnings("ignore")

data = pd.read_parquet(Path('data') / 'train.parquet')
test = pd.read_parquet(Path('data') / 'test.parquet')

# converting date into seperate columns for each aspect

data.groupby(['site_name', 'counter_name'])['bike_count'].sum().sort_values(ascending=False).head(30).to_frame()


Unnamed: 0_level_0,Unnamed: 1_level_0,bike_count
site_name,counter_name,Unnamed: 2_level_1
Totem 73 boulevard de Sébastopol,Totem 73 boulevard de Sébastopol S-N,1809231.0
Totem 64 Rue de Rivoli,Totem 64 Rue de Rivoli O-E,1406900.0
Totem 73 boulevard de Sébastopol,Totem 73 boulevard de Sébastopol N-S,1357868.0
67 boulevard Voltaire SE-NO,67 boulevard Voltaire SE-NO,1036575.0
Totem 64 Rue de Rivoli,Totem 64 Rue de Rivoli E-O,914089.0
27 quai de la Tournelle,27 quai de la Tournelle SE-NO,888717.0
Quai d'Orsay,Quai d'Orsay E-O,849724.0
Totem Cours la Reine,Totem Cours la Reine O-E,806149.0
Face au 48 quai de la marne,Face au 48 quai de la marne SO-NE,806071.0
Face au 48 quai de la marne,Face au 48 quai de la marne NE-SO,759194.0


In [24]:
__file__ = Path('submissions') /  'external_data' /  'estimator.py'


def _merge_external_data(X, file_name, columns):
    file_path = Path(__file__).parent / file_name
    df_ext = pd.read_csv(file_path, parse_dates=['date'])
    
    X = X.copy()
    # When using merge_asof left frame need to be sorted
    X['orig_index'] = np.arange(X.shape[0])
    X = pd.merge_asof(X.sort_values('date'), df_ext[columns].sort_values('date'), on='date')
    # Sort back to the original order
    X = X.sort_values('orig_index')
    del X['orig_index']
    return X

def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the DateOfDeparture columns
    X.loc[:, 'year'] = X['date'].dt.year
    X.loc[:, 'month'] = X['date'].dt.month
    X.loc[:, 'day'] = X['date'].dt.day
    X.loc[:, 'weekday'] = X['date'].dt.weekday
    X.loc[:, 'hour'] = X['date'].dt.hour
    # Finally we can drop the original columns from the dataframe
    return X.drop(columns=["date"]) 

def _cycl_encode(X):
    X = X.copy()
    month_norm = 2 * math.pi * X['date'].dt.month / X['date'].dt.month.max()
    day_norm = 2 * math.pi * X['date'].dt.day / X['date'].dt.day.max()
    weekday_norm = 2 * math.pi * X['date'].dt.weekday / X['date'].dt.weekday.max()
    hour_norm = 2 * math.pi * X['date'].dt.hour / X['date'].dt.hour.max()
    X.loc[:, 'month_sin'] = np.sin(month_norm)
    X.loc[:, 'month_cos'] = np.cos(month_norm)
    X.loc[:, 'day_sin'] = np.sin(day_norm)
    X.loc[:, 'day_cos'] = np.cos(day_norm)
    X.loc[:, 'weekday_sin'] = np.sin(weekday_norm)
    X.loc[:, 'weekday_cos'] = np.cos(weekday_norm)
    X.loc[:, 'hour_sin'] = np.sin(hour_norm)
    X.loc[:, 'hour_cos'] = np.cos(hour_norm)
    return X.drop(columns=["date"]) 


In [25]:
import problem

X_train, y_train = problem.get_train_data()
X_test, y_test = problem.get_test_data()

In [26]:
#merging train data
X_train = _merge_external_data(X_train, 'external_data.csv', ['date', 't', 'ff', 'u'])
X_train = _merge_external_data(X_train, 'brent-crude-prices.csv', ['date', 'price'])
X_train = _merge_external_data(X_train, 'school-holidays.csv', ['date', 'vacances_zone_c'])
X_train = _merge_external_data(X_train, 'bank-holidays.csv', ['date', 'val'])

In [27]:
X_train

Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,counter_technical_id,latitude,longitude,t,ff,u,price,vacances_zone_c,val
107,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2020-09-01 02:00:00,2013-01-18,Y2H15027244,48.846028,2.375429,285.75,1.6,81,45.22,False,False
157,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2020-09-01 03:00:00,2013-01-18,Y2H15027244,48.846028,2.375429,283.95,1.1,88,45.22,False,False
193,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2020-09-01 04:00:00,2013-01-18,Y2H15027244,48.846028,2.375429,283.95,1.1,88,45.22,False,False
769,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2020-09-01 15:00:00,2013-01-18,Y2H15027244,48.846028,2.375429,293.65,4.0,41,45.72,False,False
959,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2020-09-01 18:00:00,2013-01-18,Y2H15027244,48.846028,2.375429,292.15,3.0,47,45.72,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
453515,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,2021-08-08 18:00:00,2020-11-29,Y2H20114504,48.839770,2.301980,293.05,4.1,53,71.02,True,False
453934,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,2021-08-09 02:00:00,2020-11-29,Y2H20114504,48.839770,2.301980,288.65,2.3,79,71.02,True,False
454316,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,2021-08-09 08:00:00,2020-11-29,Y2H20114504,48.839770,2.301980,287.15,3.3,88,71.02,True,False
454411,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,2021-08-09 10:00:00,2020-11-29,Y2H20114504,48.839770,2.301980,289.75,3.5,85,71.02,True,False


In [5]:
#merging for test data
X_test = _merge_external_data(X_test, 'external_data.csv', ['date', 't', 'ff', 'u'])
X_test = _merge_external_data(X_test, 'brent-crude-prices.csv', ['date', 'price'])
X_test = _merge_external_data(X_test, 'school-holidays.csv', ['date', 'vacances_zone_c'])
X_test = _merge_external_data(X_test, 'bank-holidays.csv', ['date', 'val'])

In [None]:
car_import = pd.read_csv('./submissions/external_data/car-traffic.csv', sep=';')

In [None]:
# adding car traffic info to dataframe (in progress still)
car_traffic = car_import[['Libelle', 'date', 'Debit horaire', 'Etat trafic' ]]
car_traffic['Libelle'] = car_traffic['Libelle'].replace(['Bd_Sebastopol', 'Bd_Voltaire', 'Quai_Francois_Mauriac', 'Sevres',
       "Quai_d'Orsay", 'Julia_Bartet', 'Turbigo', 'Quai_de_Grenelle',
       'Quai_Tournelle', 'Concorde', 'Cours_La_Reine']
        , ['Totem 73 boulevard de Sébastopol S-N', '67 boulevard Voltaire SE-NO', '39 quai François Mauriac SE-NO'
        , '90 Rue De Sèvres NE-SO', "Quai d'Orsay E-O", '6 rue Julia Bartet NE-SO'
        , '38 rue Turbigo NE-SO', '36 quai de Grenelle NE-SO', '27 quai de la Tournelle NO-SE'
        , 'Pont de la Concorde S-N','Totem Cours la Reine O-E'])
car_traffic['date'] = pd.to_datetime(car_traffic['date'])

In [36]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, FunctionTransformer

date_encoder = FunctionTransformer(_encode_dates)
date_cols = _encode_dates(X_train[['date']]).columns.tolist()



binary_encoding_columns = ['vacances_zone_c', 'val']
one_hot_encoding_columns = ['counter_name', 'site_name']
numeric_cols = ['t', 'ff', 'u', 'price']



preprocessor = ColumnTransformer([
    ('binary-encoder', OrdinalEncoder(), binary_encoding_columns),
    ('one-hot-encoder', OneHotEncoder(handle_unknown='ignore'),
     one_hot_encoding_columns),
    ('numeric', 'passthrough', numeric_cols)
])


In [37]:
date_cols

['year', 'month', 'day', 'weekday', 'hour']

In [38]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, TimeSeriesSplit

cv = TimeSeriesSplit(n_splits=6)

model = make_pipeline(date_encoder, 
    preprocessor, HistGradientBoostingRegressor(random_state=0)
)

scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_root_mean_squared_error')
print(f'RMSE: {-scores.mean():.3} ± {(-scores).std():.3}')



RMSE: nan ± nan


In [39]:
date_encoder = FunctionTransformer(_cycl_encode)
date_cols = _cycl_encode(X_train[['date']]).columns.tolist()

categorical_encoder = OneHotEncoder(handle_unknown="ignore")
categorical_cols = ["counter_name", "site_name", 'vacances_zone_c', 'val']

preprocessor = ColumnTransformer([
    ('date', "passthrough", date_cols),
    ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_cols),
    ('numeric', 'passthrough', numeric_cols)
])

regressor = HistGradientBoostingRegressor(random_state=0)

pipe =  make_pipeline(date_encoder, preprocessor, regressor)
pipe.fit(X_train, y_train)

Pipeline(steps=[('functiontransformer',
                 FunctionTransformer(func=<function _cycl_encode at 0x0000028DA97BAAF0>)),
                ('columntransformer',
                 ColumnTransformer(transformers=[('date', 'passthrough',
                                                  ['month_sin', 'month_cos',
                                                   'day_sin', 'day_cos',
                                                   'weekday_sin', 'weekday_cos',
                                                   'hour_sin', 'hour_cos']),
                                                 ('cat',
                                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                 unknown_value=-1),
                                                  ['counter_name', 'site_name',
                                                   'vacances_zone_c', 'val']),
                                                 

In [40]:
scores = cross_val_score(pipe, X_train, y_train, cv=cv, scoring='neg_root_mean_squared_error', error_score=np.nan)
print(f'RMSE: {-scores.mean():.3} ± {(-scores).std():.3}')


RMSE: 1.13 ± 0.228
