In [2]:
import warnings  # suppress future warnings

import pandas as pd
import holidays
import numpy as np
import psycopg2

import matplotlib.pyplot as plt
import seaborn as sns

from typing import Optional

from pandas.plotting import register_matplotlib_converters

pd.options.display.float_format = '{:.3f}'.format
register_matplotlib_converters()

# set settings for seaborn
sns.set_style(style="whitegrid", rc={"grid.color": ".9"})
sns.set_palette(palette="deep")
sns_c = sns.color_palette(palette="deep")

# customise matplotlib and sns plot dimensions
plt.rcParams["figure.figsize"] = [12, 6]
plt.rcParams["figure.dpi"] = 100
title_font = {"fontname": "DejaVu Sans Mono"}

conn = psycopg2.connect("dbname = bikemi user=luca")

warnings.simplefilter(action="ignore", category=FutureWarning)


def retrieve_daily_data(connection, wide: bool = False) -> pd.DataFrame:
    query = """
        SELECT
            cdr.data_partenza,
            cdr.cluster,
            cdr.noleggi_giornalieri
        FROM bikemi_rentals.clusters_daily_rentals cdr
    """

    if not wide:
        return pd.read_sql(query, connection).rename(index=str.title)
    return (
        pd.read_sql(query, connection)
            .pivot(index="data_partenza", columns="cluster", values="noleggi_giornalieri")
            .rename(columns=str.title)
            .asfreq("D")
    )

daily_rentals: pd.DataFrame = (
    retrieve_daily_data(conn, wide=True)
)

In [5]:
def extract_features(y: pd.Series) -> pd.DataFrame:
    def get_holidays(series: pd.Series) -> pd.Series:
        return (
            series
                .pipe(pd.DataFrame)
                .reset_index()["data_partenza"]
                .apply(lambda d: holidays.CountryHoliday("IT").get(d))
                .astype("category")
        )

    data = pd.DataFrame(y)

    data["year"] = y.index.year
    data["month"] = y.index.month
    data["week"] = y.index.week
    data["weekday"] = y.index.weekday
    data["day"] = y.index.day

    data["holiday"] = y.pipe(get_holidays)

    return data


y: pd.DataFrame = (
    retrieve_daily_data(conn, wide=True)
    .filter(["Brera - 25"])
    .pipe(extract_features)
)

cluster,Brera - 25,year,month,week,weekday,day,holiday
data_partenza,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-06-01,245,2015,6,23,0,1,
2015-06-02,214,2015,6,23,1,2,
2015-06-03,375,2015,6,23,2,3,
2015-06-04,491,2015,6,23,3,4,
2015-06-05,427,2015,6,23,4,5,


## Multivariate Time Series: Random Forest

The random forest regression (or classification) is a popular ensemble method that was introduced in 2001 by Leo
Breiman. Random Forests


while Gradient Boosting Regression Tree was developed by Friedman[28][29]–[30] based on the work of Leo Breiman.Both algorithms can fit trees by selecting a random subset of the predictors from the original data and outputs a classification or a regression prediction.However, GBRT sequentially fits the trees.A robust model is built by minimizing the errors of the previous trees each time a tree is added (analysis and prediction)


No data transformation but extract features and convert categoricals


In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import TimeSeriesSplit

ts_cv = TimeSeriesSplit(
    n_splits=5,
    gap=6,
    max_train_size=10000,
    test_size=1000,
)

### Configurations
multi agent system Salamanca:
Extra Tree Regressor: learning rate: [0.1, 0.01, 0.001], subsample: [1.0, 0.9, 0.8], max depth: [3, 5, 7], min samples leaf: [1, 3, 5]
Random Forest Regressor: criterion: [mae, mse], number estimators: [10, 100, 1000], max features: [auto, sqrt, log2]
Gradient Boosting Regressor: learning rate: [0.1, 0.01, 0.001], subsample: [1.0, 0.9, 0.8], max depth: [3, 5, 7], min samples leaf: [1, 3, 5]







In [None]:
gbrt_pipeline = make_pipeline(
    ColumnTransformer(
        transformers=[
            ("categorical", ordinal_encoder, categorical_columns),
        ],
        remainder="passthrough",
    ),
    HistGradientBoostingRegressor(
        categorical_features=range(4),
    ),
)