In [1]:
import numpy as np
import pandas as pd

# Creating the Pre-Processor Pipeline: Initial DataFrame

This Pipeline will have 4 steps:

- ColumnSelector: We select the 'counter_id' and 'date' features
- DateFormatter : Separate the datetime column into separate features.<br> We then drop the original 'date' column
- HolidaysFR: Adding a binary feature for french holiday dates and weekends
- EncodeCounter: OneHotEncoding the 'counter_id' feature and then drop the original column.
- MergeWeather: Merging weather data from Orly and Paris

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin  
#from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
import holidays


class ColumnSelector(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[['counter_id','date']]


class DateFormatter(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_copy = X.copy()
        X_copy['date'] = pd.to_datetime(X_copy['date'])
        X_copy['year'] = X_copy['date'].dt.year
        X_copy['month'] = X_copy['date'].dt.month
        #X_copy['week'] = X_copy['date'].dt.isocalendar().week
        X_copy['weekday'] = (X_copy['date'].dt.dayofweek + 1)
        #X_copy['day'] = X_copy['date'].dt.day
        X_copy['hour'] = X_copy['date'].dt.hour
        #X_copy['minute'] = X_copy['date'].dt.minute  # Not relevant
        #X_copy.drop(columns='date', inplace=True)  # Will be dropped later, useful to keep to merge wither other dfs.
        return X_copy


class HolidaysFR(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        is_holiday = lambda date: 1 if date in holidays.US() else 0
        is_weekend = lambda day: 1 if day in (6,7) else 0
        X_copy = X.copy()
        X_copy['is_Holiday'] = X_copy['date'].apply(is_holiday)
        X_copy['is_Weekend'] = X_copy['weekday'].apply(is_weekend)
        #X_copy.drop(columns='date', inplace=True)
        return X_copy


class EncodeCounter(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_copy = X.copy()
        X_copy = pd.get_dummies(X_copy, columns=['counter_id'], dtype=int, drop_first=True)
        return X_copy


class MergeWeather(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        data = pd.read_csv(os.path.join("..", "Datasets", "weather_data_cleaned"))
        data['date'] = pd.to_datetime(data['date']).astype('datetime64[us]')
        merged_data = pd.merge_asof(X, data, on='date')
        merged_data.drop(columns='date', inplace=True)
        return merged_data


preprocess = Pipeline([
    ("ColumnSelector", ColumnSelector()),
    ("DateFormatter", DateFormatter()),
    ("HolidaysFR", HolidaysFR()),
    ("EncodeCounter", EncodeCounter()),
    ("MergeWeather", MergeWeather())
])        
        


We now import and process the dataset.<br>Note: We sort the data by date before separating the target variable. This is required to merge the weather data using pd.merge.asof() to the nearest datetime value later on

In [3]:
import os

df = pd.read_parquet(os.path.join("..", "Datasets", "train.parquet"))
df = df.sort_values('date') # Sort by date 

In [4]:
X = preprocess.fit_transform(df)
y = df['log_bike_count']

In [33]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
#from sklearn.model_selection import cross_val_score

# Setting seed
np.random.seed(42)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Linear Regression model
model = LinearRegression()

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Print the model coefficients and evaluation metrics
#print("Coefficients:", model.coef_)
#print("Intercept:", model.intercept_)
print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 1.3765462951740082


Now, let us train on the entire training set, and predict on the test set

In [6]:
# Fit on the full train set:
X = preprocess.fit_transform(df)
y = df['log_bike_count']

# Initialize the Linear Regression model
model = LinearRegression()

# Fit the model on the training data
model.fit(X,y)

# Import test set
df_test = pd.read_parquet(os.path.join("..", "Datasets", "final_test.parquet"))
df_test = df_test.sort_values('date') # Sort by date 
new_order = df_test.index.tolist() #Keep index order


df_test = preprocess.transform(df_test)
predictions = model.predict(df_test)
predictions_df = pd.DataFrame({'Id': new_order, 'log_bike_count': predictions})
predictions_df = predictions_df.sort_values('Id')

# Specify the file path
csv_file_path = 'submission.csv'

# Write the DataFrame to a CSV file
predictions_df.to_csv(csv_file_path, index=False)
