In [36]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler

In [30]:
df_train = pd.read_parquet("../data/train.parquet")
df_test = pd.read_parquet("../data/test.parquet")

In [31]:
# Define the test and train variables
_target_column_name = "log_bike_count"

y_train = df_train[_target_column_name]
y_test = df_test[_target_column_name]
X_train = df_train.drop(columns=[_target_column_name])
X_test = df_test.drop(columns=[_target_column_name])

In [9]:
# Define the function that encodes dates
def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the DateOfDeparture columns
    X.loc[:, "year"] = X["date"].dt.year
    X.loc[:, "month"] = X["date"].dt.month
    X.loc[:, "day"] = X["date"].dt.day
    X.loc[:, "weekday"] = X["date"].dt.weekday
    X.loc[:, "hour"] = X["date"].dt.hour

    # Finally we can drop the original columns from the dataframe
    return X.drop(columns=["date"])


## 1st version

In [10]:
X_train = X_train.drop(columns=["counter_name", "site_name", "counter_technical_id"])

In [11]:
X_test = X_test.drop(columns=["counter_name", "site_name", "counter_technical_id"])

In [12]:
# Define the encoders we want to use
date_encoder = FunctionTransformer(_encode_dates)
date_cols = _encode_dates(X_train[["date"]]).columns.tolist()

categorical_cols = ["counter_id", "site_id"]

In [13]:
#Create our Pipeline
regressor = CatBoostRegressor(iterations= 1000, learning_rate = 0.2, cat_features=categorical_cols)

pipeline = make_pipeline(date_encoder, regressor)
pipeline.fit(X_train, y_train)

0:	learn: 1.3499899	total: 255ms	remaining: 4m 14s
1:	learn: 1.0871559	total: 351ms	remaining: 2m 55s
2:	learn: 0.8810754	total: 451ms	remaining: 2m 29s
3:	learn: 0.7112625	total: 545ms	remaining: 2m 15s
4:	learn: 0.5732872	total: 655ms	remaining: 2m 10s
5:	learn: 0.4627966	total: 750ms	remaining: 2m 4s
6:	learn: 0.3742979	total: 845ms	remaining: 1m 59s
7:	learn: 0.3024079	total: 951ms	remaining: 1m 57s
8:	learn: 0.2440911	total: 1.05s	remaining: 1m 55s
9:	learn: 0.1977657	total: 1.15s	remaining: 1m 54s
10:	learn: 0.1612823	total: 1.26s	remaining: 1m 53s
11:	learn: 0.1317130	total: 1.34s	remaining: 1m 50s
12:	learn: 0.1084784	total: 1.44s	remaining: 1m 49s
13:	learn: 0.0899023	total: 1.53s	remaining: 1m 47s
14:	learn: 0.0749995	total: 1.63s	remaining: 1m 46s
15:	learn: 0.0633374	total: 1.72s	remaining: 1m 45s
16:	learn: 0.0545503	total: 1.81s	remaining: 1m 44s
17:	learn: 0.0477978	total: 1.91s	remaining: 1m 44s
18:	learn: 0.0428283	total: 2s	remaining: 1m 43s
19:	learn: 0.0391917	total

In [14]:
# Find the predictions
y_pred = pipeline.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)

In [15]:
# Print the train and test RMSE scores
print(
    f"Train set, RMSE={mean_squared_error(y_train, pipeline.predict(X_train), squared=False):.2f}"
)
print(
    f"Test set, RMSE={mean_squared_error(y_test, pipeline.predict(X_test), squared=False):.2f}"
)

Train set, RMSE=0.00
Test set, RMSE=0.01


## 2nd version (correspond à notre submission v5)

In [25]:
X_train = X_train.drop(columns=["counter_name", "site_name", "counter_technical_id", "counter_installation_date", "bike_count"])
X_test = X_test.drop(columns=["counter_name", "site_name", "counter_technical_id", "counter_installation_date", "bike_count"])

In [26]:
# Define the encoders we want to use
date_encoder = FunctionTransformer(_encode_dates)
date_cols = _encode_dates(X_train[["date"]]).columns.tolist()

categorical_cols = ["counter_id", "site_id"]

In [27]:
#Create our Pipeline
regressor = CatBoostRegressor(iterations= 1000, learning_rate = 0.2, cat_features=categorical_cols)

pipeline = make_pipeline(date_encoder, regressor)
pipeline.fit(X_train, y_train)

0:	learn: 1.4863639	total: 100ms	remaining: 1m 40s
1:	learn: 1.3284438	total: 193ms	remaining: 1m 36s
2:	learn: 1.2069104	total: 267ms	remaining: 1m 28s
3:	learn: 1.1145413	total: 350ms	remaining: 1m 27s
4:	learn: 1.0456807	total: 442ms	remaining: 1m 28s
5:	learn: 0.9919205	total: 549ms	remaining: 1m 31s
6:	learn: 0.9508473	total: 640ms	remaining: 1m 30s
7:	learn: 0.9177440	total: 731ms	remaining: 1m 30s
8:	learn: 0.8846627	total: 811ms	remaining: 1m 29s
9:	learn: 0.8590104	total: 888ms	remaining: 1m 27s
10:	learn: 0.8412286	total: 968ms	remaining: 1m 27s
11:	learn: 0.8260011	total: 1.06s	remaining: 1m 26s
12:	learn: 0.8139261	total: 1.14s	remaining: 1m 26s
13:	learn: 0.8023860	total: 1.23s	remaining: 1m 26s
14:	learn: 0.7892705	total: 1.33s	remaining: 1m 27s
15:	learn: 0.7808999	total: 1.43s	remaining: 1m 28s
16:	learn: 0.7749054	total: 1.52s	remaining: 1m 27s
17:	learn: 0.7658489	total: 1.61s	remaining: 1m 27s
18:	learn: 0.7610371	total: 1.7s	remaining: 1m 27s
19:	learn: 0.7561655	to

In [28]:
# Find the predictions
y_pred = pipeline.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)

In [29]:
# Print the train and test RMSE scores
print(
    f"Train set, RMSE={mean_squared_error(y_train, pipeline.predict(X_train), squared=False):.2f}"
)
print(
    f"Test set, RMSE={mean_squared_error(y_test, pipeline.predict(X_test), squared=False):.2f}"
)

Train set, RMSE=0.40
Test set, RMSE=0.46


## 3rd version (submission v7 (moins bonne que la v5))

In [33]:
X_train = X_train.drop(columns=["counter_name", "site_name", "counter_technical_id", "counter_installation_date", "bike_count"])
X_test = X_test.drop(columns=["counter_name", "site_name", "counter_technical_id", "counter_installation_date", "bike_count"])

In [34]:
date_encoder.fit_transform(X_train).head()

Unnamed: 0,counter_id,site_id,latitude,longitude,year,month,day,weekday,hour
48321,100007049-102007049,100007049,48.846028,2.375429,2020,9,1,1,2
48324,100007049-102007049,100007049,48.846028,2.375429,2020,9,1,1,3
48327,100007049-102007049,100007049,48.846028,2.375429,2020,9,1,1,4
48330,100007049-102007049,100007049,48.846028,2.375429,2020,9,1,1,15
48333,100007049-102007049,100007049,48.846028,2.375429,2020,9,1,1,18


In [42]:
# Define the encoders we want to use
date_encoder = FunctionTransformer(_encode_dates)
date_cols = _encode_dates(X_train[["date"]]).columns.tolist()

scaling_columns = ["latitude","longitude"]

# Apply StandardScaler to non-categorical columns and OneHotEncoder to categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('standard-scaler', StandardScaler(), scaling_columns),  
    ])


categorical_cols = ["counter_id", "site_id", "weekday"]

In [43]:
#Create our Pipeline
regressor = CatBoostRegressor(iterations= 1000, learning_rate = 0.2, cat_features=categorical_cols)

pipeline = make_pipeline(date_encoder, regressor)
pipeline.fit(X_train, y_train)

0:	learn: 1.4863639	total: 107ms	remaining: 1m 47s
1:	learn: 1.3279853	total: 214ms	remaining: 1m 46s
2:	learn: 1.2083307	total: 299ms	remaining: 1m 39s
3:	learn: 1.1174318	total: 409ms	remaining: 1m 41s
4:	learn: 1.0453651	total: 499ms	remaining: 1m 39s
5:	learn: 0.9932200	total: 589ms	remaining: 1m 37s
6:	learn: 0.9501384	total: 685ms	remaining: 1m 37s
7:	learn: 0.9103651	total: 787ms	remaining: 1m 37s
8:	learn: 0.8787650	total: 903ms	remaining: 1m 39s
9:	learn: 0.8573480	total: 1.02s	remaining: 1m 41s
10:	learn: 0.8367604	total: 1.11s	remaining: 1m 39s
11:	learn: 0.8231157	total: 1.2s	remaining: 1m 38s
12:	learn: 0.8116464	total: 1.29s	remaining: 1m 38s
13:	learn: 0.8003782	total: 1.38s	remaining: 1m 37s
14:	learn: 0.7925534	total: 1.47s	remaining: 1m 36s
15:	learn: 0.7820794	total: 1.57s	remaining: 1m 36s
16:	learn: 0.7731963	total: 1.67s	remaining: 1m 36s
17:	learn: 0.7516438	total: 1.79s	remaining: 1m 37s
18:	learn: 0.7444267	total: 1.89s	remaining: 1m 37s
19:	learn: 0.7378860	to

In [44]:
# Find the predictions
y_pred = pipeline.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)

In [45]:
# Print the train and test RMSE scores
print(
    f"Train set, RMSE={mean_squared_error(y_train, pipeline.predict(X_train), squared=False):.2f}"
)
print(
    f"Test set, RMSE={mean_squared_error(y_test, pipeline.predict(X_test), squared=False):.2f}"
)

Train set, RMSE=0.41
Test set, RMSE=0.43
