In [2]:
import numpy as np
import pandas as pd

# Creating the Pre-Processor Pipeline: Initial DataFrame

This Pipeline will have 4 steps:

- ColumnSelector: We select the 'counter_id' and 'date' features
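- DateFormatter: Split the datetime column into separate features (year, month, week, weekday, day, hour).<br> The original 'date' column is kept at this stage because the next step still needs it.
- HolidaysFR: Add binary features for French holiday dates and weekends, then drop the original 'date' column.
- EncodeCounter: One-hot encode the 'counter_id' feature and then drop the original column (this step is currently commented out in the pipeline below).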

In [93]:
from sklearn.base import BaseEstimator, TransformerMixin  
#from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
import holidays

class ColumnSelector(BaseEstimator, TransformerMixin):
    """Keep only the 'counter_id' and 'date' columns of the incoming frame."""

    def fit(self, X, y=None):
        """Nothing is learned from X; the transformer is returned as-is."""
        return self

    def transform(self, X):
        """Return the two retained columns, in the order counter_id, date."""
        return X.loc[:, ['counter_id', 'date']]

class DateFormatter(BaseEstimator, TransformerMixin):
    """Expand the 'date' column into separate calendar features.

    Adds 'year', 'month', 'week', 'weekday', 'day' and 'hour'. The 'date'
    column is parsed to datetime and kept, so that later steps can still use
    it (e.g. for holiday lookup or for merging with other frames).
    """

    def fit(self, X, y=None):
        """Nothing is learned from X; the transformer is returned as-is."""
        return self

    def transform(self, X):
        """Return a copy of X with 'date' parsed and the calendar columns added."""
        frame = X.copy()
        frame['date'] = pd.to_datetime(frame['date'])
        when = frame['date'].dt
        # Minutes are deliberately not extracted (judged not relevant).
        return frame.assign(
            year=when.year,
            month=when.month,
            week=when.isocalendar().week,
            weekday=when.dayofweek + 1,  # shift pandas' 0-based Monday to 1..7
            day=when.day,
            hour=when.hour,
        )


class HolidaysFR(BaseEstimator, TransformerMixin):
    """Flag French public holidays and weekends, then drop the raw 'date' column.

    Expects X to contain a datetime-like 'date' column and a 'weekday' column
    numbered 1 (Monday) to 7 (Sunday), as produced by DateFormatter.
    """

    def fit(self, X, y=None):
        """Nothing is learned from X; the transformer is returned as-is."""
        return self

    def transform(self, X):
        """Return a copy of X with 'is_Holiday' and 'is_Weekend' added and 'date' removed."""
        # Build the French calendar once per call. The previous version created
        # a new holidays.US() object for every row: that was the wrong country
        # for this transformer and very slow on large frames.
        fr_holidays = holidays.France()

        X_copy = X.copy()
        X_copy['is_Holiday'] = X_copy['date'].apply(
            lambda date: 1 if date in fr_holidays else 0
        )
        # Weekdays are 1-based here, so Saturday and Sunday are 6 and 7.
        X_copy['is_Weekend'] = X_copy['weekday'].isin((6, 7)).astype('int64')
        return X_copy.drop(columns='date')

class EncodeCounter(BaseEstimator, TransformerMixin):
    """One-hot encode 'counter_id' and remove the original column.

    The first dummy level is dropped. The encoding is computed from the frame
    passed to transform, so the resulting columns depend on the categories
    present in that frame.
    """

    def fit(self, X, y=None):
        """Nothing is learned from X; the transformer is returned as-is."""
        return self

    def transform(self, X):
        """Return a new frame where 'counter_id' is replaced by integer dummy columns."""
        return pd.get_dummies(
            X.copy(),
            columns=['counter_id'],
            dtype=int,
            drop_first=True,
        )
    
class MergeParisTemp(BaseEstimator, TransformerMixin):
    """Add a 'dateindex' string key meant for joining the Paris weather data.

    The merge itself is not enabled yet; this step currently only builds the
    key from the 'month', 'year' and 'day' columns.
    """

    def fit(self, X, y=None):
        """Nothing is learned from X; the transformer is returned as-is."""
        return self

    def transform(self, X):
        """Return a copy of X with the 'dateindex' column added."""
        X_copy = X.copy()
        # Key layout: month + year + day, concatenated without separators or
        # zero padding (month 9, year 2020, day 1 -> '920201').
        X_copy['dateindex'] = X_copy['month'].astype(str) + X_copy['year'].astype(str) + X_copy['day'].astype(str)
        # TODO: left-merge the Paris weather table on 'dateindex' and drop the
        # key afterwards. The previous version read meteo_paris_clean.csv here
        # on every call without using it (and relied on `os`, which is only
        # imported in a later cell), so the unused read was removed until the
        # merge is actually enabled.
        return X_copy

# Pre-processing pipeline: each step receives the previous step's DataFrame.
# NOTE(review): EncodeCounter is currently disabled, so 'counter_id' leaves the
# pipeline un-encoded - confirm that downstream models can handle it.
preprocess = Pipeline(
    steps=[
        ("ColumnSelector", ColumnSelector()),
        ("DateFormatter", DateFormatter()),
        ("HolidaysFR", HolidaysFR()),
        #("EncodeCounter", EncodeCounter()),
        ("MergeParisTemp", MergeParisTemp()),
    ]
)
        


In [94]:

import os

# Raw training observations, read from the sibling Datasets folder.
df = pd.read_parquet(os.path.join("..", "Datasets", "train.parquet"))

# Target column first, then the engineered feature frame; neither statement
# depends on the other.
y = df['log_bike_count']
X = preprocess.fit_transform(df)

In [95]:
# Preview the first rows of the feature frame returned by the pipeline.
X.head()

Unnamed: 0,counter_id,year,month,week,weekday,day,hour,is_Holiday,is_Weekend,dateindex
48321,100007049-102007049,2020,9,36,2,1,2,0,0,920201
48324,100007049-102007049,2020,9,36,2,1,3,0,0,920201
48327,100007049-102007049,2020,9,36,2,1,4,0,0,920201
48330,100007049-102007049,2020,9,36,2,1,15,0,0,920201
48333,100007049-102007049,2020,9,36,2,1,18,0,0,920201


In [75]:
# Preview the last values of the 'log_bike_count' target.
y.tail()

929175    6.100319
929178    4.983607
929181    5.389072
929184    3.091042
929187    2.772589
Name: log_bike_count, dtype: float64

In [33]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# Seed NumPy's global RNG; the split below is additionally pinned through
# its own random_state.
np.random.seed(42)

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Linear regression baseline, fitted on the training portion only.
model = LinearRegression().fit(X_train, y_train)

# Score the held-out rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Uncomment to inspect the fitted parameters:
#print("Coefficients:", model.coef_)
#print("Intercept:", model.intercept_)
print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 1.4309047221032432
[       nan        nan 0.34284475 0.28480679 0.24636465]


  print(np.sqrt(score))


Now, let us train on the entire training set, and predict on the test set

In [100]:
# Fit the pre-processing pipeline and build features on the full train set:
X = preprocess.fit_transform(df)
y = df['log_bike_count']


# Initialize the Linear Regression model
model = LinearRegression()

# Fit the model on the full training data
model.fit(X, y)


# Import test set
df_test = pd.read_parquet(os.path.join("..", "Datasets", "final_test.parquet"))

# Apply the pipeline that was fitted on the training data. The previous
# version called fit_transform here, which re-fits every step on the test set;
# that is harmless only as long as all steps are stateless and becomes leakage /
# train-test mismatch as soon as any step learns something in fit.
df_test = preprocess.transform(df_test)
predictions = model.predict(df_test)

# Store predictions in pandas DataFrame ('Id' is the 0-based row position):
predictions = pd.DataFrame({'Id': range(len(predictions)), 'log_bike_count': predictions})


# Specify the file path
csv_file_path = 'submission.csv'

# Write the DataFrame to a CSV file
predictions.to_csv(csv_file_path, index=False)


Adding more data to our prediction: Weather

The aim of the section below is to add the following features to our current model:
- Paris precipitation data based on date:<br>This data was encoded into 3 levels of precipitation.<br>More details are available in the data_notes and data_preprocessing notebook.