In [23]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [24]:
# Lecture des fichiers
df_train = pd.read_parquet("../data/train.parquet")
df_test = pd.read_parquet("../data/test.parquet")

# Define the test and train variables
_target_column_name = "log_bike_count"
y_train = df_train[_target_column_name]
X_train = df_train.drop(columns=[_target_column_name])
X_test = df_test.drop(columns=[_target_column_name])
y_test = df_test[_target_column_name]

In [25]:
def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the DateOfDeparture columns
    X.loc[:, "year"] = X["date"].dt.year
    X.loc[:, "month"] = X["date"].dt.month
    X.loc[:, "day"] = X["date"].dt.day
    X.loc[:, "weekday"] = X["date"].dt.weekday
    X.loc[:, "hour"] = X["date"].dt.hour

    # Finally we can drop the original columns from the dataframe
    return X.drop(columns=["date"])

In [26]:
# Drop the unuseful columns
X_train = X_train.drop(columns=[
    "counter_name", "site_name", "counter_technical_id", "counter_installation_date"
    , "bike_count"])
X_test = X_test.drop(columns=[
    "counter_name", "site_name", "counter_technical_id", "counter_installation_date", "bike_count"])

In [27]:
# Define the encoders we want to use
date_encoder = FunctionTransformer(_encode_dates)

In [28]:
# Create a column for holidays 

vacances_scolaires = [
    ('2020-10-17', '2020-11-02'),  
    ('2020-12-19', '2021-01-04'),  
    ('2021-02-20', '2021-03-08'),  
    ('2021-04-10', '2021-04-26'), 
    ('2021-07-10', '2021-09-01'),  
    ('2021-10-23', '2021-11-08'),  
    ('2021-12-18', '2022-01-03'),  
]

for i, (debut, fin) in enumerate(vacances_scolaires):
    vacances_scolaires[i] = (pd.to_datetime(debut), pd.to_datetime(fin))

X_train['vacances'] = 0
X_test['vacances'] = 0

for debut, fin in vacances_scolaires:
    X_train.loc[(X_train['date'] >= debut) & (X_train['date'] <= fin), 'vacances'] = 1
    X_test.loc[(X_test['date'] >= debut) & (X_test['date'] <= fin), 'vacances'] = 1

In [29]:
# Ajoutons le paramètre COVID 
confinement_dates = pd.DataFrame({
    'debut': ['2020-03-17', '2020-10-30', '2021-04-03'],
    'fin': ['2020-05-11', '2020-12-15', '2021-05-03']
})

couvre_feu_dates = pd.DataFrame({
    'debut2': ['2020-10-17', '2020-12-15'],
    'fin2': ['2020-12-15', '2021-06-01']
})

confinement_dates['debut'] = pd.to_datetime(confinement_dates['debut'])
confinement_dates['fin'] = pd.to_datetime(confinement_dates['fin'])

couvre_feu_dates['debut2'] = pd.to_datetime(couvre_feu_dates['debut2'])
couvre_feu_dates['fin2'] = pd.to_datetime(couvre_feu_dates['fin2'])

def add_covid_features(data, confinement_dates, couvre_feu_dates):
    # Create a new column 'periode' initially set to 0
    data['periode'] = 0

    # Traverse the confinement periods
    for _, row in confinement_dates.iterrows():
        data.loc[
            (data['date'] >= row['debut']) & (data['date'] <= row['fin']),
            'periode'
        ] = 2

    # Traverse the curfew periods
    for _, row in couvre_feu_dates.iterrows():
        if row['fin2'] is not None:
            data.loc[
                (data['date'] >= row['debut2']) & (data['date'] <= row['fin2']) &
                (data['periode'] != 2), 
                'periode'
            ] = 1
        else:
            data.loc[
                (data['date'] >= row['debut2']) &
                (data['periode'] != 2),  
                'periode'
            ] = 1

    # Check if a date is both in confinement and curfew and assign 2
    data['periode'] = data.groupby('date')['periode'].transform('max')

add_covid_features(X_train, confinement_dates, couvre_feu_dates)
add_covid_features(X_test, confinement_dates, couvre_feu_dates)

In [30]:
# Encode the dates
X_train = date_encoder.fit_transform(X_train)
X_test = date_encoder.fit_transform(X_test)

In [63]:
# Columns to be used in the model
selected_columns = ['counter_id', 'site_id', 'year', 'month', 'day', 'weekday', 'hour', 'vacances', 'periode']

X_train_selected = X_train[selected_columns]
X_test_selected = X_test[selected_columns]

X_train_selected['site_id'] = X_train_selected['site_id'].astype('category')
X_test_selected['site_id'] = X_test_selected['site_id'].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_selected['site_id'] = X_train_selected['site_id'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_selected['site_id'] = X_test_selected['site_id'].astype('category')


In [50]:
# Create our Pipeline
regressor = XGBRegressor(learning_rate=0.2, n_estimators=1000, enable_categorical=True)

regressor.fit(X_train_selected, y_train)

In [51]:
# Print the train and test RMSE scores
print(
    f"Train set, RMSE={np.sqrt(mean_squared_error(y_train, regressor.predict(X_train_selected), squared=False)):.2f}"
)
print(
    f"Test set, RMSE={np.sqrt(mean_squared_error(y_test, regressor.predict(X_test_selected), squared=False)):.2f}"
)

Train set, RMSE=0.56
Test set, RMSE=0.70


In [65]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

cv = TimeSeriesSplit(n_splits=2)

# When using a scorer in scikit-learn it always needs to be better when smaller, hence the minus sign.
scores = cross_val_score(
    regressor, X_train_selected, y_train, cv=cv, scoring="neg_root_mean_squared_error"
)
print("RMSE: ", scores)
print(f"RMSE (all folds): {-scores.mean():.3} ± {(-scores).std():.3}")

RMSE:  [-2.18105267 -2.02453302]
RMSE (all folds): 2.1 ± 0.0783


In [66]:
from sklearn.model_selection import cross_val_score


# When using a scorer in scikit-learn it always needs to be better when smaller, hence the minus sign.
scores = cross_val_score(
    regressor, X_train_selected, y_train, cv=5, scoring="neg_root_mean_squared_error"
)
print("RMSE: ", scores)
print(f"RMSE (all folds): {-scores.mean():.3} ± {(-scores).std():.3}")

RMSE:  [-1.31406866 -1.06016814 -1.0313511  -1.24357156 -2.11745673]
RMSE (all folds): 1.35 ± 0.397
