In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score,TimeSeriesSplit
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.datasets import make_regression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
import numpy as np
from datetime import datetime


def covid_scaled_count(X):
    X = X.copy()

    # Define the COVID impact factors
    covid_impact_conf = 1.23
    covid_impact_cv = 1.16

    # Define date ranges for confinements (conf) and couvre-feu (c_v)
    conf = [(datetime(2020, 10, 30), datetime(2020, 12, 15)), (datetime(2021, 4, 3), datetime(2021, 5, 2))]
    c_v = [(datetime(2020, 10, 15), datetime(2020, 10, 29)), (datetime(2020, 12, 16), datetime(2021, 4, 2)), (datetime(2021, 5, 3), datetime(2021, 6, 20))]

    # Initialize a column for scaled bike counts
    X['scaled_bike_count'] = X['bike_count']

    # Apply the conf impact factor
    for start, end in conf:
        conf_mask = (X['date'] >= start) & (X['date'] <= end)
        X.loc[conf_mask, 'scaled_bike_count'] *= covid_impact_conf

    # Apply the c_v impact factor
    for start, end in c_v:
        cv_mask = (X['date'] >= start) & (X['date'] <= end)
        X.loc[cv_mask, 'scaled_bike_count'] *= covid_impact_cv

    return X

def covid_scaled_log_count(X):
    X = X.copy()
    # Apply ln(x+1) transformation to the scaled_bike_count
    X['scaled_log_bike_count'] = np.log1p(X['scaled_bike_count'])
    return X


# Load data
df_train = pd.read_parquet('Data/train.parquet')


df_train['site_name'] = df_train['site_name'].astype(str)
df_train['site_name'] = df_train['site_name'].replace(r'^Pont des Invalides (S-N|N-S)$', 'Pont des Invalides', regex=True)
df_train['site_name'] = df_train['site_name'].astype('category')


### IMPLEMENT SCALED FACTOR FOR COVID on df_train
df_train = covid_scaled_count(df_train)
df_train = covid_scaled_log_count(df_train)


weather=pd.read_csv("external_data.csv",sep='or|,+', engine='python')
weather=weather[['date','t','ff','vv','u','rr1']]


new_name= {'t' : 'Temperature' , 'ff' : 'average_wind_speed','vv' : 'horizontal_visibility','rr1' : 'rain_l_1'}

weather.rename(columns=new_name,inplace=True)
weather['rain_l_1'].bfill(inplace=True)
weather['date']=pd.to_datetime(weather['date'])
# Define your date range
start_date = datetime.strptime('2020-09-01 01:00:00', "%Y-%m-%d %H:%M:%S")
end_date = datetime.strptime('2021-09-09 23:00:00', "%Y-%m-%d %H:%M:%S")
# Filter the DataFrame
weather_train = weather[(start_date <= weather['date']) & (weather['date'] <= end_date)]
weather_train = weather_train.copy()
weather_train.drop(2018, inplace=True)
weather_train = weather_train.set_index('date')
# Resample to hourly and forward fill the missing values
df_hourly = weather_train.resample('H').ffill()
# Reset index if you want 'date' back as a column
weather_hourly = df_hourly.reset_index()
df_train.reset_index(inplace=True)
df_train = pd.merge(df_train,weather_hourly,on='date' ) 
df_train.set_index('index',inplace=True)
df_train= df_train.sort_index()

df_train.head()
# Extract features and target
X_train = df_train.drop(columns=['log_bike_count', 'bike_count', 'scaled_bike_count', 'scaled_log_bike_count'])
y_train = df_train['scaled_log_bike_count']

X_train

Unnamed: 0_level_0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,Temperature,average_wind_speed,horizontal_visibility,u,rain_l_1
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
48324,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2020-09-01 03:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,283.95,1.1,25000,88,4000.0
48327,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2020-09-01 04:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,283.95,1.1,25000,88,4000.0
48330,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2020-09-01 15:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,293.65,4.0,30000,41,6.0
48333,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2020-09-01 18:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,292.15,3.0,30000,47,6.0
48336,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2020-09-01 20:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,292.15,3.0,30000,47,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
929169,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,2021-09-09 01:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,2.301980,294.05,4.4,15000,92,4200.0
929172,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,2021-09-09 04:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,2.301980,293.15,2.1,15000,94,1.0
929175,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,2021-09-09 06:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,2.301980,292.85,2.3,5000,95,1.0
929178,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,2021-09-09 10:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,2.301980,293.45,3.5,8000,91,1.0


In [2]:
def _encode_dates(X):
    X = X.copy()
    X['date'] = pd.to_datetime(X['date'])
    X['year'] = X['date'].dt.year
    X['month'] = X['date'].dt.month
    X['day'] = X['date'].dt.day
    X['weekday'] = X['date'].dt.weekday
    X['hour'] = X['date'].dt.hour
    return X

def column_to_drop(X):
    #return X.drop(['date'], axis=1)
    return X.drop(['date', 'longitude', 'latitude', 'counter_installation_date',
                   'counter_technical_id','counter_id', 'site_name', 'site_id'], axis=1)


def combined_transformer(X):
    X = _encode_dates(X)

    X = column_to_drop(X)
    print(X.info())
    return X

def get_estimator(model):
    data_encoder = FunctionTransformer(combined_transformer)

    categorical_encoder = OneHotEncoder(handle_unknown="ignore")
    categorical_cols = ['coordinates','counter_name','month','year','day']

    preprocessor = ColumnTransformer(
        [
            ("cat", categorical_encoder, categorical_cols),
        ],remainder='passthrough'
    )
    
    
    regressor = model

    
    pipe = make_pipeline(data_encoder, preprocessor, regressor)
    
    return pipe


In [3]:
from sklearn.pipeline import make_pipeline

# Sample dataset - replace with your actual data


# Define your models here
models = {
    'LinearRegression': LinearRegression(),
    #'RandomForest': RandomForestRegressor(),
    'GradientBoosting': GradientBoostingRegressor(),
    'XGBoost': XGBRegressor()
}

# Initialize TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)

# Dictionary to store the scores
model_scores = {}

# Loop through the models
for model_name, model in models.items():
    # Perform cross-validation
    scores = cross_val_score(get_estimator(model), X_train, y_train, scoring='neg_mean_squared_error', cv=tscv)

    # Store the average score
    model_scores[model_name] = np.mean(np.sqrt(-scores))

# Print the scores for each model
for model_name, score in model_scores.items():
    print(f"{model_name}: RMSE = {score:.3f}")


<class 'pandas.core.frame.DataFrame'>
Index: 82772 entries, 48324 to 268034
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   counter_name           82772 non-null  category
 1   coordinates            82772 non-null  category
 2   Temperature            82772 non-null  float64 
 3   average_wind_speed     82772 non-null  float64 
 4   horizontal_visibility  82772 non-null  int64   
 5   u                      82772 non-null  int64   
 6   rain_l_1               82772 non-null  float64 
 7   year                   82772 non-null  int32   
 8   month                  82772 non-null  int32   
 9   day                    82772 non-null  int32   
 10  weekday                82772 non-null  int32   
 11  hour                   82772 non-null  int32   
dtypes: category(2), float64(3), int32(5), int64(2)
memory usage: 5.5 MB
None
<class 'pandas.core.frame.DataFrame'>
Index: 82767 entries, 268035 to 3

<class 'pandas.core.frame.DataFrame'>
Index: 82767 entries, 797666 to 929181
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   counter_name           82767 non-null  category
 1   coordinates            82767 non-null  category
 2   Temperature            82767 non-null  float64 
 3   average_wind_speed     82767 non-null  float64 
 4   horizontal_visibility  82767 non-null  int64   
 5   u                      82767 non-null  int64   
 6   rain_l_1               82767 non-null  float64 
 7   year                   82767 non-null  int32   
 8   month                  82767 non-null  int32   
 9   day                    82767 non-null  int32   
 10  weekday                82767 non-null  int32   
 11  hour                   82767 non-null  int32   
dtypes: category(2), float64(3), int32(5), int64(2)
memory usage: 5.5 MB
None
<class 'pandas.core.frame.DataFrame'>
Index: 82772 entries, 48324 to 2

<class 'pandas.core.frame.DataFrame'>
Index: 413840 entries, 48324 to 797665
Data columns (total 12 columns):
 #   Column                 Non-Null Count   Dtype   
---  ------                 --------------   -----   
 0   counter_name           413840 non-null  category
 1   coordinates            413840 non-null  category
 2   Temperature            413840 non-null  float64 
 3   average_wind_speed     413840 non-null  float64 
 4   horizontal_visibility  413840 non-null  int64   
 5   u                      413840 non-null  int64   
 6   rain_l_1               413840 non-null  float64 
 7   year                   413840 non-null  int32   
 8   month                  413840 non-null  int32   
 9   day                    413840 non-null  int32   
 10  weekday                413840 non-null  int32   
 11  hour                   413840 non-null  int32   
dtypes: category(2), float64(3), int32(5), int64(2)
memory usage: 27.6 MB
None
<class 'pandas.core.frame.DataFrame'>
Index: 82767 entr

<class 'pandas.core.frame.DataFrame'>
Index: 82767 entries, 706092 to 797665
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   counter_name           82767 non-null  category
 1   coordinates            82767 non-null  category
 2   Temperature            82767 non-null  float64 
 3   average_wind_speed     82767 non-null  float64 
 4   horizontal_visibility  82767 non-null  int64   
 5   u                      82767 non-null  int64   
 6   rain_l_1               82767 non-null  float64 
 7   year                   82767 non-null  int32   
 8   month                  82767 non-null  int32   
 9   day                    82767 non-null  int32   
 10  weekday                82767 non-null  int32   
 11  hour                   82767 non-null  int32   
dtypes: category(2), float64(3), int32(5), int64(2)
memory usage: 5.5 MB
None
<class 'pandas.core.frame.DataFrame'>
Index: 413840 entries, 48324 to 