In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

def _encode_dates(X):
    X = X.copy()
    X['date'] = pd.to_datetime(X['date'])
    X['year'] = X['date'].dt.year
    X['month'] = X['date'].dt.month
    X['day'] = X['date'].dt.day
    X['weekday'] = X['date'].dt.weekday
    X['hour'] = X['date'].dt.hour
    return X.drop(columns=['date'])

def get_estimator():
    date_encoder = FunctionTransformer(_encode_dates)
    categorical_encoder = OneHotEncoder(handle_unknown="ignore")
    categorical_cols = ["counter_name", "site_name"]

    preprocessor = ColumnTransformer(
        [
            ("date", date_encoder, ["date"]),
            ("cat", categorical_encoder, categorical_cols),
        ]
    )
    regressor = RandomForestRegressor(n_estimators=100, max_depth=7)
    # classifier = RandomForestClassifier()

    # pipe = make_pipeline(preprocessor, regressor)
    pipe =  make_pipeline(preprocessor, regressor)
    
    return pipe

# Load data
df_train = pd.read_parquet("data/train.parquet")

df_test = pd.read_parquet("data/test.parquet")

# Extract features and target
X_train = df_train.drop(columns=['log_bike_count'])
y_train = df_train['log_bike_count']

# Train the model
pipeline = get_estimator()
pipeline.fit(X_train, y_train)

# Predict on test data
X_test = df_test  
y_pred = pipeline.predict(X_test)

In [20]:
df_train.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,bike_count,date,counter_installation_date,counter_technical_id,latitude,longitude,log_bike_count
48321,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 02:00:00,2013-01-18,Y2H15027244,48.846028,2.375429,0.0
48324,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,1.0,2020-09-01 03:00:00,2013-01-18,Y2H15027244,48.846028,2.375429,0.693147
48327,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 04:00:00,2013-01-18,Y2H15027244,48.846028,2.375429,0.0
48330,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,4.0,2020-09-01 15:00:00,2013-01-18,Y2H15027244,48.846028,2.375429,1.609438
48333,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,9.0,2020-09-01 18:00:00,2013-01-18,Y2H15027244,48.846028,2.375429,2.302585


In [21]:
encoded = _encode_dates(df_train)

In [22]:
encoded.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,bike_count,counter_installation_date,counter_technical_id,latitude,longitude,log_bike_count,year,month,day,weekday,hour
48321,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2013-01-18,Y2H15027244,48.846028,2.375429,0.0,2020,9,1,1,2
48324,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,1.0,2013-01-18,Y2H15027244,48.846028,2.375429,0.693147,2020,9,1,1,3
48327,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2013-01-18,Y2H15027244,48.846028,2.375429,0.0,2020,9,1,1,4
48330,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,4.0,2013-01-18,Y2H15027244,48.846028,2.375429,1.609438,2020,9,1,1,15
48333,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,9.0,2013-01-18,Y2H15027244,48.846028,2.375429,2.302585,2020,9,1,1,18


In [23]:
def RMSE_score(y_train, y_pred):
    RMSE = np.sum(np.sqrt((y_train - y_pred)**2)/len(y_pred))
    return RMSE

y_test = df_test['log_bike_count']
RMSE = RMSE_score(y_test, y_pred)
print(RMSE)

0.6571261030458273
