In [13]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler

In [14]:
df_train = pd.read_parquet("../data/train.parquet")
df_test = pd.read_parquet("../data/test.parquet")

In [15]:
# Define the test and train variables
_target_column_name = "log_bike_count"

y_train = df_train[_target_column_name]
y_test = df_test[_target_column_name]
X_train = df_train.drop(columns=[_target_column_name])
X_test = df_test.drop(columns=[_target_column_name])

In [16]:
# Define the function that encodes dates
def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the DateOfDeparture columns
    X.loc[:, "year"] = X["date"].dt.year
    X.loc[:, "month"] = X["date"].dt.month
    X.loc[:, "day"] = X["date"].dt.day
    X.loc[:, "weekday"] = X["date"].dt.weekday
    X.loc[:, "hour"] = X["date"].dt.hour

    # Finally we can drop the original columns from the dataframe
    return X.drop(columns=["date"])


## 1st version

J'ai changé l'approche de catboost en spécifiant les cat_features (ici: counter_id et site_id)

In [10]:
X_train = X_train.drop(columns=["counter_name", "site_name", "counter_technical_id"])

In [11]:
X_test = X_test.drop(columns=["counter_name", "site_name", "counter_technical_id"])

In [11]:
# Define the encoders we want to use
date_encoder = FunctionTransformer(_encode_dates)
date_cols = _encode_dates(X_train[["date"]]).columns.tolist()

categorical_cols = ["counter_id", "site_id"]

In [13]:
#Create our Pipeline
regressor = CatBoostRegressor(iterations= 1000, learning_rate = 0.2, cat_features=categorical_cols)

pipeline = make_pipeline(date_encoder, regressor)
pipeline.fit(X_train, y_train)

0:	learn: 1.3499899	total: 255ms	remaining: 4m 14s
1:	learn: 1.0871559	total: 351ms	remaining: 2m 55s
2:	learn: 0.8810754	total: 451ms	remaining: 2m 29s
3:	learn: 0.7112625	total: 545ms	remaining: 2m 15s
4:	learn: 0.5732872	total: 655ms	remaining: 2m 10s
5:	learn: 0.4627966	total: 750ms	remaining: 2m 4s
6:	learn: 0.3742979	total: 845ms	remaining: 1m 59s
7:	learn: 0.3024079	total: 951ms	remaining: 1m 57s
8:	learn: 0.2440911	total: 1.05s	remaining: 1m 55s
9:	learn: 0.1977657	total: 1.15s	remaining: 1m 54s
10:	learn: 0.1612823	total: 1.26s	remaining: 1m 53s
11:	learn: 0.1317130	total: 1.34s	remaining: 1m 50s
12:	learn: 0.1084784	total: 1.44s	remaining: 1m 49s
13:	learn: 0.0899023	total: 1.53s	remaining: 1m 47s
14:	learn: 0.0749995	total: 1.63s	remaining: 1m 46s
15:	learn: 0.0633374	total: 1.72s	remaining: 1m 45s
16:	learn: 0.0545503	total: 1.81s	remaining: 1m 44s
17:	learn: 0.0477978	total: 1.91s	remaining: 1m 44s
18:	learn: 0.0428283	total: 2s	remaining: 1m 43s
19:	learn: 0.0391917	total

In [14]:
# Find the predictions
y_pred = pipeline.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)

In [15]:
# Print the train and test RMSE scores
print(
    f"Train set, RMSE={mean_squared_error(y_train, pipeline.predict(X_train), squared=False):.2f}"
)
print(
    f"Test set, RMSE={mean_squared_error(y_test, pipeline.predict(X_test), squared=False):.2f}"
)

Train set, RMSE=0.00
Test set, RMSE=0.01


## 2nd version (correspond à notre submission v5 avec meilleur score: 0.646)

J'ai supprimé toutes les colonnes qui me semblaient inutiles (counter installation date, counter technical id)

Donc, dans l'ordre, les plus importantes sont heures, mois, puis site_id. Le moins important est year.

## 3rd version (submission v7 (score:0.6488))

J'ai rajouté weekday en categorical. J'ai voulu faire un StandardScaler mais ça a pas fonctionné donc pour l'instant il est pas dans le pipe.

In [61]:
X_train = X_train.drop(columns=["counter_name", "site_name", "counter_technical_id", "counter_installation_date", "bike_count"])
X_test = X_test.drop(columns=["counter_name", "site_name", "counter_technical_id", "counter_installation_date", "bike_count"])

In [62]:
date_encoder.fit_transform(X_train).head()

Unnamed: 0,counter_id,site_id,latitude,longitude,year,month,day,weekday,hour
48321,100007049-102007049,100007049,48.846028,2.375429,2020,9,1,1,2
48324,100007049-102007049,100007049,48.846028,2.375429,2020,9,1,1,3
48327,100007049-102007049,100007049,48.846028,2.375429,2020,9,1,1,4
48330,100007049-102007049,100007049,48.846028,2.375429,2020,9,1,1,15
48333,100007049-102007049,100007049,48.846028,2.375429,2020,9,1,1,18


In [8]:
# Define the encoders we want to use
date_encoder = FunctionTransformer(_encode_dates)
date_cols = _encode_dates(X_train[["date"]]).columns.tolist()

scaling_columns = ["latitude","longitude"]

# Apply StandardScaler to non-categorical columns and OneHotEncoder to categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('standard-scaler', StandardScaler(), scaling_columns),  
    ])


categorical_cols = ["counter_id", "site_id", "weekday"]

In [64]:
#Create our Pipeline
regressor = CatBoostRegressor(iterations= 1000, learning_rate = 0.2, cat_features=categorical_cols)

pipeline = make_pipeline(date_encoder, regressor)
pipeline.fit(X_train, y_train)

0:	learn: 1.4863639	total: 131ms	remaining: 2m 10s
1:	learn: 1.3279853	total: 263ms	remaining: 2m 11s
2:	learn: 1.2083307	total: 376ms	remaining: 2m 5s
3:	learn: 1.1174318	total: 507ms	remaining: 2m 6s
4:	learn: 1.0453651	total: 605ms	remaining: 2m
5:	learn: 0.9932200	total: 701ms	remaining: 1m 56s
6:	learn: 0.9501384	total: 787ms	remaining: 1m 51s
7:	learn: 0.9103651	total: 913ms	remaining: 1m 53s
8:	learn: 0.8787650	total: 1.02s	remaining: 1m 52s
9:	learn: 0.8573480	total: 1.11s	remaining: 1m 50s
10:	learn: 0.8367604	total: 1.21s	remaining: 1m 48s
11:	learn: 0.8231157	total: 1.31s	remaining: 1m 47s
12:	learn: 0.8116464	total: 1.4s	remaining: 1m 46s
13:	learn: 0.8003782	total: 1.49s	remaining: 1m 44s
14:	learn: 0.7925534	total: 1.58s	remaining: 1m 43s
15:	learn: 0.7820794	total: 1.67s	remaining: 1m 42s
16:	learn: 0.7731963	total: 1.76s	remaining: 1m 41s
17:	learn: 0.7516438	total: 1.9s	remaining: 1m 43s
18:	learn: 0.7444267	total: 2.02s	remaining: 1m 44s
19:	learn: 0.7378860	total: 2.

KeyboardInterrupt: 

In [44]:
# Find the predictions
y_pred = pipeline.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)

In [45]:
# Print the train and test RMSE scores
print(
    f"Train set, RMSE={mean_squared_error(y_train, pipeline.predict(X_train), squared=False):.2f}"
)
print(
    f"Test set, RMSE={mean_squared_error(y_test, pipeline.predict(X_test), squared=False):.2f}"
)

Train set, RMSE=0.41
Test set, RMSE=0.43


## 4th version 

J'ai essayé de faire un grid search sur la v5 (meilleur score). Pas encore obtenu les résultats.

In [5]:
from sklearn.model_selection import GridSearchCV

In [6]:
X_train = X_train.drop(columns=["counter_name", "site_name", "counter_technical_id", "counter_installation_date", "bike_count"])
X_test = X_test.drop(columns=["counter_name", "site_name", "counter_technical_id", "counter_installation_date", "bike_count"])

In [9]:
date_encoder = FunctionTransformer(_encode_dates)
date_encoder.fit_transform(X_train).head()

Unnamed: 0,counter_id,site_id,latitude,longitude,year,month,day,weekday,hour
48321,100007049-102007049,100007049,48.846028,2.375429,2020,9,1,1,2
48324,100007049-102007049,100007049,48.846028,2.375429,2020,9,1,1,3
48327,100007049-102007049,100007049,48.846028,2.375429,2020,9,1,1,4
48330,100007049-102007049,100007049,48.846028,2.375429,2020,9,1,1,15
48333,100007049-102007049,100007049,48.846028,2.375429,2020,9,1,1,18


In [10]:
# Define the encoders we want to use
date_encoder = FunctionTransformer(_encode_dates)
date_cols = _encode_dates(X_train[["date"]]).columns.tolist()

scaling_columns = ["latitude","longitude"]

# Apply StandardScaler to non-categorical columns and OneHotEncoder to categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('standard-scaler', StandardScaler(), scaling_columns),  
    ])


categorical_cols = ["counter_id", "site_id"]

In [12]:
#Create our Pipeline
regressor = CatBoostRegressor(cat_features=categorical_cols)

X_train = date_encoder.fit_transform(X_train)
X_test = date_encoder.fit_transform(X_test)

model = regressor

param_grid = {
    'iterations': [500, 1000, 1500],
    'learning_rate': [0.001, 0.01, 0.1],
    'max_depth': [5, 10, 15]
 }
model_grid_search = GridSearchCV(model, param_grid=param_grid, n_jobs=4, cv=1, scoring='neg_root_mean_squared_error')
model_grid_search.fit(X_train, y_train)

KeyError: 'date'

In [3]:
best_model = model_grid_search.best_estimator_
print(f"The best set of parameters is: {model_grid_search.best_params_}")

NameError: name 'model_grid_search' is not defined

In [None]:
# Find the predictions
y_pred = best_model.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)

In [None]:
# Print the train and test RMSE scores
print(
    f"Train set, RMSE={mean_squared_error(y_train, best_model.predict(X_train), squared=False):.2f}"
)
print(
    f"Test set, RMSE={mean_squared_error(y_test, best_model.predict(X_test), squared=False):.2f}"
)

Train set, RMSE=0.41
Test set, RMSE=0.43


## 5th version 

J'ai remis weekday en categorical, mais cette fois j'ai convertis en plus de ça la colonne weekday en valeurs "catégorielles". J'ai du l'enlever du pipe pcq ça aurait pas marché et flemme de creuser

In [8]:
X_train = X_train.drop(columns=["counter_name", "site_name", "counter_technical_id", "counter_installation_date", "bike_count"])
X_test = X_test.drop(columns=["counter_name", "site_name", "counter_technical_id", "counter_installation_date", "bike_count"])

In [13]:
# Define the encoders we want to use
date_encoder = FunctionTransformer(_encode_dates)
date_cols = _encode_dates(X_train[["date"]]).columns.tolist()

categorical_cols = ["counter_id", "site_id", "weekday"]

In [14]:
#Create our Pipeline
regressor = CatBoostRegressor(cat_features=categorical_cols)

X_train = date_encoder.fit_transform(X_train)
X_test = date_encoder.fit_transform(X_test)

X_train[categorical_cols] = X_train[categorical_cols].astype('category')
X_test[categorical_cols] = X_test[categorical_cols].astype('category')

model = regressor

model.fit(X_train, y_train)

Learning rate set to 0.107689
0:	learn: 1.5710333	total: 490ms	remaining: 8m 9s
1:	learn: 1.4716580	total: 883ms	remaining: 7m 20s
2:	learn: 1.3855158	total: 1.1s	remaining: 6m 5s
3:	learn: 1.3102268	total: 1.52s	remaining: 6m 17s
4:	learn: 1.2478465	total: 1.9s	remaining: 6m 17s
5:	learn: 1.1925100	total: 2.26s	remaining: 6m 14s
6:	learn: 1.1431641	total: 2.62s	remaining: 6m 11s
7:	learn: 1.0996859	total: 2.88s	remaining: 5m 57s
8:	learn: 1.0618449	total: 3.29s	remaining: 6m 2s
9:	learn: 1.0297394	total: 3.78s	remaining: 6m 13s
10:	learn: 1.0008008	total: 4.12s	remaining: 6m 10s
11:	learn: 0.9767151	total: 4.42s	remaining: 6m 3s
12:	learn: 0.9528968	total: 4.63s	remaining: 5m 51s
13:	learn: 0.9301272	total: 4.96s	remaining: 5m 49s
14:	learn: 0.9096483	total: 5.21s	remaining: 5m 41s
15:	learn: 0.8931992	total: 5.52s	remaining: 5m 39s
16:	learn: 0.8785742	total: 5.72s	remaining: 5m 30s
17:	learn: 0.8657167	total: 6s	remaining: 5m 27s
18:	learn: 0.8549320	total: 6.3s	remaining: 5m 25s
19

<catboost.core.CatBoostRegressor at 0x1f840f84d50>

In [16]:
# Find the predictions
y_pred = model.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)

In [17]:
# Print the train and test RMSE scores
print(
    f"Train set, RMSE={mean_squared_error(y_train, model.predict(X_train), squared=False):.2f}"
)
print(
    f"Test set, RMSE={mean_squared_error(y_test, model.predict(X_test), squared=False):.2f}"
)

Train set, RMSE=0.43
Test set, RMSE=0.45


## 6th version (score: 0.6488) 

Exactement la même chose que la 6ème version sauf que dans le catboost regressor, j'ai remis les hyperparamètres qui nous avaient donné notre meilleur score.

In [20]:
X_train = X_train.drop(columns=["counter_name", "site_name", "counter_technical_id", "counter_installation_date", "bike_count"])
X_test = X_test.drop(columns=["counter_name", "site_name", "counter_technical_id", "counter_installation_date", "bike_count"])

In [21]:
# Define the encoders we want to use
date_encoder = FunctionTransformer(_encode_dates)
date_cols = _encode_dates(X_train[["date"]]).columns.tolist()

categorical_cols = ["counter_id", "site_id", "weekday"]

In [22]:
#Create our Pipeline
regressor = CatBoostRegressor(iterations= 1000, learning_rate = 0.2, cat_features=categorical_cols)

X_train = date_encoder.fit_transform(X_train)
X_test = date_encoder.fit_transform(X_test)

X_train[categorical_cols] = X_train[categorical_cols].astype('category')
X_test[categorical_cols] = X_test[categorical_cols].astype('category')

model = regressor

model.fit(X_train, y_train)

0:	learn: 1.4863639	total: 501ms	remaining: 8m 20s
1:	learn: 1.3279853	total: 1000ms	remaining: 8m 18s
2:	learn: 1.2083307	total: 1.65s	remaining: 9m 9s
3:	learn: 1.1174318	total: 2.32s	remaining: 9m 37s
4:	learn: 1.0453651	total: 2.62s	remaining: 8m 41s
5:	learn: 0.9932200	total: 2.97s	remaining: 8m 12s
6:	learn: 0.9501384	total: 3.56s	remaining: 8m 25s
7:	learn: 0.9103651	total: 4.37s	remaining: 9m 1s
8:	learn: 0.8787650	total: 4.85s	remaining: 8m 54s
9:	learn: 0.8573480	total: 5.22s	remaining: 8m 37s
10:	learn: 0.8367604	total: 5.68s	remaining: 8m 30s
11:	learn: 0.8231157	total: 6.07s	remaining: 8m 19s
12:	learn: 0.8116464	total: 6.52s	remaining: 8m 14s
13:	learn: 0.8003782	total: 6.88s	remaining: 8m 4s
14:	learn: 0.7925534	total: 7.28s	remaining: 7m 57s
15:	learn: 0.7820794	total: 7.65s	remaining: 7m 50s
16:	learn: 0.7731963	total: 8.01s	remaining: 7m 43s
17:	learn: 0.7516438	total: 8.36s	remaining: 7m 36s
18:	learn: 0.7444267	total: 8.71s	remaining: 7m 29s
19:	learn: 0.7378860	tot

<catboost.core.CatBoostRegressor at 0x1f840bbb950>

In [23]:
# Find the predictions
y_pred = model.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)

In [24]:
# Print the train and test RMSE scores
print(
    f"Train set, RMSE={mean_squared_error(y_train, model.predict(X_train), squared=False):.2f}"
)
print(
    f"Test set, RMSE={mean_squared_error(y_test, model.predict(X_test), squared=False):.2f}"
)

Train set, RMSE=0.41
Test set, RMSE=0.43


## 7th version 

Même chose que 6e sauf que je rajoute année, mois et jour en variables catégorielles.

In [27]:
X_train = X_train.drop(columns=["counter_name", "site_name", "counter_technical_id", "counter_installation_date", "bike_count"])
X_test = X_test.drop(columns=["counter_name", "site_name", "counter_technical_id", "counter_installation_date", "bike_count"])

In [28]:
# Define the encoders we want to use
date_encoder = FunctionTransformer(_encode_dates)
date_cols = _encode_dates(X_train[["date"]]).columns.tolist()

categorical_cols = ["counter_id", "site_id", "weekday", "year", "month", "day"]

In [29]:
#Create our Pipeline
regressor = CatBoostRegressor(iterations= 1000, learning_rate = 0.2, cat_features=categorical_cols)

X_train = date_encoder.fit_transform(X_train)
X_test = date_encoder.fit_transform(X_test)

X_train[categorical_cols] = X_train[categorical_cols].astype('category')
X_test[categorical_cols] = X_test[categorical_cols].astype('category')

model = regressor

model.fit(X_train, y_train)

0:	learn: 1.4809223	total: 1.09s	remaining: 18m 6s
1:	learn: 1.3188419	total: 2.12s	remaining: 17m 39s
2:	learn: 1.1913749	total: 2.71s	remaining: 14m 59s
3:	learn: 1.0999509	total: 3.5s	remaining: 14m 31s
4:	learn: 1.0292512	total: 4.22s	remaining: 14m
5:	learn: 0.9704917	total: 4.91s	remaining: 13m 34s
6:	learn: 0.9242818	total: 5.53s	remaining: 13m 4s
7:	learn: 0.8725374	total: 6.07s	remaining: 12m 32s
8:	learn: 0.8347912	total: 6.67s	remaining: 12m 14s
9:	learn: 0.8077385	total: 6.96s	remaining: 11m 29s
10:	learn: 0.7828901	total: 7.81s	remaining: 11m 42s
11:	learn: 0.7633071	total: 8.47s	remaining: 11m 37s
12:	learn: 0.7483775	total: 9.25s	remaining: 11m 42s
13:	learn: 0.7318240	total: 10.2s	remaining: 11m 58s
14:	learn: 0.7226777	total: 11s	remaining: 12m 3s
15:	learn: 0.7151299	total: 11.7s	remaining: 12m 1s
16:	learn: 0.7039864	total: 12.3s	remaining: 11m 53s
17:	learn: 0.6936075	total: 12.8s	remaining: 11m 36s
18:	learn: 0.6834031	total: 13.4s	remaining: 11m 30s
19:	learn: 0.6

<catboost.core.CatBoostRegressor at 0x1f840efd050>

In [30]:
# Find the predictions
y_pred = model.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)

In [31]:
# Print the train and test RMSE scores
print(
    f"Train set, RMSE={mean_squared_error(y_train, model.predict(X_train), squared=False):.2f}"
)
print(
    f"Test set, RMSE={mean_squared_error(y_test, model.predict(X_test), squared=False):.2f}"
)

Train set, RMSE=0.39
Test set, RMSE=0.52


Reste à:
- essayer un StandardScaler sur les données numériques de la v5
- utiliser GridSearch
- analyser avec video youtube

## 8th version 

en partant de 2nd version (scaler latitude et longitude)

In [64]:
df_train = pd.read_parquet("../data/train.parquet")
df_test = pd.read_parquet("../data/test.parquet")

In [65]:
# Define the test and train variables
_target_column_name = "log_bike_count"

y_train = df_train[_target_column_name]
y_test = df_test[_target_column_name]
X_train = df_train.drop(columns=[_target_column_name])
X_test = df_test.drop(columns=[_target_column_name])

In [66]:
X_train = X_train.drop(columns=["counter_name", "site_name", "counter_technical_id", "counter_installation_date", "bike_count"])
X_test = X_test.drop(columns=["counter_name", "site_name", "counter_technical_id", "counter_installation_date", "bike_count"])

In [67]:
# Define the encoders we want to use
date_encoder = FunctionTransformer(_encode_dates)
date_cols = _encode_dates(X_train[["date"]]).columns.tolist()

scaling_columns = ["latitude","longitude"]

# Apply StandardScaler to non-categorical columns and OneHotEncoder to categorical columns
scaler = StandardScaler()
X_train_scaled = X_train.copy()  # Create a copy to avoid modifying the original DataFrame
X_train_scaled[scaling_columns] = scaler.fit_transform(X_train_scaled[scaling_columns])
X_train = X_train_scaled

X_test_scaled = X_test.copy()  # Create a copy to avoid modifying the original DataFrame
X_test_scaled[scaling_columns] = scaler.fit_transform(X_test_scaled[scaling_columns])
X_test = X_test_scaled


categorical_cols = ["counter_id", "site_id"]

X_train[categorical_cols] = X_train[categorical_cols].astype('category')
X_test[categorical_cols] = X_test[categorical_cols].astype('category')

In [68]:
#Create our Pipeline
regressor = CatBoostRegressor(iterations= 1000, learning_rate = 0.2, cat_features=categorical_cols)

pipeline = make_pipeline(date_encoder, regressor)
pipeline.fit(X_train, y_train)

0:	learn: 1.4863639	total: 108ms	remaining: 1m 47s
1:	learn: 1.3284438	total: 206ms	remaining: 1m 42s
2:	learn: 1.2069104	total: 288ms	remaining: 1m 35s
3:	learn: 1.1145413	total: 373ms	remaining: 1m 32s
4:	learn: 1.0456807	total: 468ms	remaining: 1m 33s
5:	learn: 0.9919205	total: 581ms	remaining: 1m 36s
6:	learn: 0.9508473	total: 698ms	remaining: 1m 39s
7:	learn: 0.9177440	total: 785ms	remaining: 1m 37s
8:	learn: 0.8846627	total: 882ms	remaining: 1m 37s
9:	learn: 0.8590104	total: 960ms	remaining: 1m 35s
10:	learn: 0.8412286	total: 1.04s	remaining: 1m 33s
11:	learn: 0.8260011	total: 1.16s	remaining: 1m 35s
12:	learn: 0.8139261	total: 1.25s	remaining: 1m 34s
13:	learn: 0.8023860	total: 1.34s	remaining: 1m 34s
14:	learn: 0.7892705	total: 1.43s	remaining: 1m 34s
15:	learn: 0.7808999	total: 1.53s	remaining: 1m 34s
16:	learn: 0.7749054	total: 1.62s	remaining: 1m 33s
17:	learn: 0.7658489	total: 1.72s	remaining: 1m 33s
18:	learn: 0.7610371	total: 1.81s	remaining: 1m 33s
19:	learn: 0.7561655	t

In [69]:
# Find the predictions
y_pred = pipeline.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)

In [70]:
# Print the train and test RMSE scores
print(
    f"Train set, RMSE={mean_squared_error(y_train, pipeline.predict(X_train), squared=False):.2f}"
)
print(
    f"Test set, RMSE={mean_squared_error(y_test, pipeline.predict(X_test), squared=False):.2f}"
)

Train set, RMSE=0.40
Test set, RMSE=0.47


In [71]:
# Afficher l'importance des fonctionnalités
feature_importances = regressor.get_feature_importance()
print("Feature Importances:", feature_importances)

Feature Importances: [ 6.03328875  9.41205077  4.90303661  5.00819343  1.19736619 15.34326816
  3.81299618  5.065529   49.22427092]


In [72]:
date_encoder.fit_transform(X_train).head()

Unnamed: 0,counter_id,site_id,latitude,longitude,year,month,day,weekday,hour
48321,100007049-102007049,100007049,-0.448798,0.786581,2020,9,1,1,2
48324,100007049-102007049,100007049,-0.448798,0.786581,2020,9,1,1,3
48327,100007049-102007049,100007049,-0.448798,0.786581,2020,9,1,1,4
48330,100007049-102007049,100007049,-0.448798,0.786581,2020,9,1,1,15
48333,100007049-102007049,100007049,-0.448798,0.786581,2020,9,1,1,18


## 9th version 

en partant de 2nd version (scaler tout ce qui n'est pas catégoriel)

In [82]:
df_train = pd.read_parquet("../data/train.parquet")
df_test = pd.read_parquet("../data/test.parquet")

In [83]:
# Define the test and train variables
_target_column_name = "log_bike_count"

y_train = df_train[_target_column_name]
y_test = df_test[_target_column_name]
X_train = df_train.drop(columns=[_target_column_name])
X_test = df_test.drop(columns=[_target_column_name])

In [84]:
X_train = X_train.drop(columns=["counter_name", "site_name", "counter_technical_id", "counter_installation_date", "bike_count"])
X_test = X_test.drop(columns=["counter_name", "site_name", "counter_technical_id", "counter_installation_date", "bike_count"])

In [85]:
# Define the encoders we want to use
date_encoder = FunctionTransformer(_encode_dates)
date_cols = _encode_dates(X_train[["date"]]).columns.tolist()

X_train = date_encoder.fit_transform(X_train)
X_test = date_encoder.fit_transform(X_test)

scaling_columns = ["latitude","longitude"]

# Apply StandardScaler to non-categorical columns and OneHotEncoder to categorical columns
scaler = StandardScaler()
X_train_scaled = X_train.copy()  # Create a copy to avoid modifying the original DataFrame
X_train_scaled[scaling_columns] = scaler.fit_transform(X_train_scaled[scaling_columns])
X_train = X_train_scaled

X_test_scaled = X_test.copy()  # Create a copy to avoid modifying the original DataFrame
X_test_scaled[scaling_columns] = scaler.fit_transform(X_test_scaled[scaling_columns])
X_test = X_test_scaled


categorical_cols = ["year", "month", "day", "weekday", "hour", "counter_id", "site_id"]

X_train[categorical_cols] = X_train[categorical_cols].astype('category')
X_test[categorical_cols] = X_test[categorical_cols].astype('category')

In [86]:
#Create our Pipeline
regressor = CatBoostRegressor(iterations= 1000, learning_rate = 0.2, cat_features=categorical_cols)

pipeline = make_pipeline(regressor)
pipeline.fit(X_train, y_train)

0:	learn: 1.4473337	total: 131ms	remaining: 2m 11s
1:	learn: 1.2788444	total: 208ms	remaining: 1m 43s
2:	learn: 1.1483996	total: 314ms	remaining: 1m 44s
3:	learn: 1.0513394	total: 436ms	remaining: 1m 48s
4:	learn: 0.9802232	total: 534ms	remaining: 1m 46s
5:	learn: 0.9085719	total: 684ms	remaining: 1m 53s
6:	learn: 0.8544632	total: 835ms	remaining: 1m 58s
7:	learn: 0.8116655	total: 991ms	remaining: 2m 2s
8:	learn: 0.7703017	total: 1.18s	remaining: 2m 9s
9:	learn: 0.7428834	total: 1.35s	remaining: 2m 13s
10:	learn: 0.7239911	total: 1.49s	remaining: 2m 13s
11:	learn: 0.7045594	total: 1.64s	remaining: 2m 15s
12:	learn: 0.6861900	total: 1.76s	remaining: 2m 13s
13:	learn: 0.6725700	total: 1.86s	remaining: 2m 11s
14:	learn: 0.6619549	total: 1.99s	remaining: 2m 10s
15:	learn: 0.6482998	total: 2.13s	remaining: 2m 10s
16:	learn: 0.6413385	total: 2.26s	remaining: 2m 10s
17:	learn: 0.6351864	total: 2.37s	remaining: 2m 9s
18:	learn: 0.6247449	total: 2.48s	remaining: 2m 8s
19:	learn: 0.6198627	total

In [87]:
# Find the predictions
y_pred = pipeline.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)

In [88]:
# Print the train and test RMSE scores
print(
    f"Train set, RMSE={mean_squared_error(y_train, pipeline.predict(X_train), squared=False):.2f}"
)
print(
    f"Test set, RMSE={mean_squared_error(y_test, pipeline.predict(X_test), squared=False):.2f}"
)

Train set, RMSE=0.40
Test set, RMSE=0.57


In [89]:
feature_importances = regressor.get_feature_importance()
print(feature_importances)

[25.96270007  5.99059475  3.54893413  2.06532971  0.51814647 16.59055898
  4.46396552  7.16208061 33.69768975]


In [90]:
X_test

Unnamed: 0,counter_id,site_id,latitude,longitude,year,month,day,weekday,hour
56474,100007049-102007049,100007049,-0.441261,0.799079,2021,8,10,1,5
56477,100007049-102007049,100007049,-0.441261,0.799079,2021,8,10,1,6
56480,100007049-102007049,100007049,-0.441261,0.799079,2021,8,10,1,7
56483,100007049-102007049,100007049,-0.441261,0.799079,2021,8,10,1,9
56486,100007049-102007049,100007049,-0.441261,0.799079,2021,8,10,1,10
...,...,...,...,...,...,...,...,...,...
929175,300014702-353245971,300014702,-0.779446,-1.125788,2021,9,9,3,6
929178,300014702-353245971,300014702,-0.779446,-1.125788,2021,9,9,3,10
929181,300014702-353245971,300014702,-0.779446,-1.125788,2021,9,9,3,15
929184,300014702-353245971,300014702,-0.779446,-1.125788,2021,9,9,3,22


Donc, visiblement, utiliser un standard scaler n'est pas vraiment une bonne idée (overfit)

# 10th version: deleting columns that do not have a big importance. (compared to v2)

n'a pas fonctionné. le v2 est toujours le meilleur.

In [113]:
df_train = pd.read_parquet("../data/train.parquet")
df_test = pd.read_parquet("../data/test.parquet")

In [114]:
# Define the test and train variables
_target_column_name = "log_bike_count"

y_train = df_train[_target_column_name]
y_test = df_test[_target_column_name]
X_train = df_train.drop(columns=[_target_column_name])
X_test = df_test.drop(columns=[_target_column_name])

In [115]:
# Define the function that encodes dates
def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the DateOfDeparture columns
    X.loc[:, "month"] = X["date"].dt.month
    X.loc[:, "day"] = X["date"].dt.day
    X.loc[:, "weekday"] = X["date"].dt.weekday
    X.loc[:, "hour"] = X["date"].dt.hour

    # Finally we can drop the original columns from the dataframe
    return X.drop(columns=["date"])


In [116]:
X_train = X_train.drop(columns=["counter_name", "site_name", "counter_technical_id", "counter_installation_date", "bike_count", "longitude", "latitude"])
X_test = X_test.drop(columns=["counter_name", "site_name", "counter_technical_id", "counter_installation_date", "bike_count", "longitude", "latitude"])

In [117]:
# Define the encoders we want to use
date_encoder = FunctionTransformer(_encode_dates)
date_cols = _encode_dates(X_train[["date"]]).columns.tolist()

categorical_cols = ["counter_id", "site_id"]

In [118]:
#Create our Pipeline
regressor = CatBoostRegressor(iterations= 1000, learning_rate = 0.2, cat_features=categorical_cols)

pipeline = make_pipeline(date_encoder, regressor)
pipeline.fit(X_train, y_train)

0:	learn: 1.4669612	total: 104ms	remaining: 1m 44s
1:	learn: 1.3113923	total: 190ms	remaining: 1m 34s
2:	learn: 1.1938014	total: 287ms	remaining: 1m 35s
3:	learn: 1.1003554	total: 374ms	remaining: 1m 33s
4:	learn: 1.0333988	total: 459ms	remaining: 1m 31s
5:	learn: 0.9764553	total: 576ms	remaining: 1m 35s
6:	learn: 0.9298467	total: 699ms	remaining: 1m 39s
7:	learn: 0.8937081	total: 833ms	remaining: 1m 43s
8:	learn: 0.8663164	total: 952ms	remaining: 1m 44s
9:	learn: 0.8452408	total: 1.06s	remaining: 1m 45s
10:	learn: 0.8274525	total: 1.16s	remaining: 1m 44s
11:	learn: 0.8145168	total: 1.29s	remaining: 1m 46s
12:	learn: 0.8021429	total: 1.43s	remaining: 1m 48s
13:	learn: 0.7911191	total: 1.6s	remaining: 1m 52s
14:	learn: 0.7838152	total: 1.69s	remaining: 1m 50s
15:	learn: 0.7770803	total: 1.81s	remaining: 1m 51s
16:	learn: 0.7718097	total: 1.9s	remaining: 1m 49s
17:	learn: 0.7661702	total: 2s	remaining: 1m 48s
18:	learn: 0.7599186	total: 2.1s	remaining: 1m 48s
19:	learn: 0.7538555	total: 

In [119]:
# Find the predictions
y_pred = pipeline.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)

In [120]:
# Print the train and test RMSE scores
print(
    f"Train set, RMSE={mean_squared_error(y_train, pipeline.predict(X_train), squared=False):.4f}"
)
print(
    f"Test set, RMSE={mean_squared_error(y_test, pipeline.predict(X_test), squared=False):.4f}"
)

Train set, RMSE=0.4122
Test set, RMSE=0.4278


In [121]:
# Afficher l'importance des fonctionnalités
feature_importances = regressor.get_feature_importance()
print("Feature Importances:", feature_importances)

Feature Importances: [ 7.69975693 15.68012533 18.29196214  4.04072881  5.56687102 48.72055577]


In [122]:
date_encoder.fit_transform(X_test)

Unnamed: 0,counter_id,site_id,month,day,weekday,hour
56474,100007049-102007049,100007049,8,10,1,5
56477,100007049-102007049,100007049,8,10,1,6
56480,100007049-102007049,100007049,8,10,1,7
56483,100007049-102007049,100007049,8,10,1,9
56486,100007049-102007049,100007049,8,10,1,10
...,...,...,...,...,...,...
929175,300014702-353245971,300014702,9,9,3,6
929178,300014702-353245971,300014702,9,9,3,10
929181,300014702-353245971,300014702,9,9,3,15
929184,300014702-353245971,300014702,9,9,3,22


# 11th version : XG Boost

In [171]:
df_train = pd.read_parquet("../data/train.parquet")
df_test = pd.read_parquet("../data/test.parquet")

In [172]:
# Define the test and train variables
_target_column_name = "log_bike_count"

y_train = df_train[_target_column_name]
y_test = df_test[_target_column_name]
X_train = df_train.drop(columns=[_target_column_name])
X_test = df_test.drop(columns=[_target_column_name])

In [173]:
X_train = X_train.drop(columns=["counter_name", "site_name", "counter_technical_id", "counter_installation_date", "bike_count"])
X_test = X_test.drop(columns=["counter_name", "site_name", "counter_technical_id", "counter_installation_date", "bike_count"])

In [174]:
import xgboost as xgb
from xgboost import XGBRegressor

# Define the function that encodes dates
def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the DateOfDeparture columns
    X.loc[:, "year"] = X["date"].dt.year
    X.loc[:, "month"] = X["date"].dt.month
    X.loc[:, "day"] = X["date"].dt.day
    X.loc[:, "weekday"] = X["date"].dt.weekday
    X.loc[:, "hour"] = X["date"].dt.hour

    # Finally, we can drop the original columns from the dataframe
    return X.drop(columns=["date"])

# Encode the dates
date_encoder = FunctionTransformer(_encode_dates)
X_train = date_encoder.fit_transform(X_train)
X_test = date_encoder.fit_transform(X_test)

# Columns to be used in the model
selected_columns = ['counter_id', 'site_id', 'latitude', 'longitude', 'year', 'month', 'day', 'weekday', 'hour']

X_train_selected = X_train[selected_columns]
X_test_selected = X_test[selected_columns]


# Determine categorical columns
categorical_cols = ["counter_id", "site_id"]

X_train[categorical_cols] = X_train[categorical_cols].astype('category')
X_test[categorical_cols] = X_test[categorical_cols].astype('category')

# Create our Pipeline
regressor = XGBRegressor(n_estimators=1000, learning_rate=0.2, enable_categorical=True)

regressor.fit(X_train_selected, y_train)


In [175]:
# Find the predictions
y_pred = regressor.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)

In [176]:
# Print the train and test RMSE scores
print(
    f"Train set, RMSE={mean_squared_error(y_train, regressor.predict(X_train), squared=False):.4f}"
)
print(
    f"Test set, RMSE={mean_squared_error(y_test, regressor.predict(X_test), squared=False):.4f}"
)

Train set, RMSE=0.3208
Test set, RMSE=0.5288


Donc là le XG Boost permet un super train rmse mais pas mal d'overfit --> A creuser