In [19]:
import numpy as np
import pandas as pd

# Creating the Pre-Processor Pipeline

This Pipeline will have 3 steps:

- ColumnSelector: selects the 'counter_id' and 'date' features.
- DateFormatter: splits the 'date' column into separate features (year, month, week, day, hour, minute), then drops the original 'date' column.
- EncodeCounter: one-hot encodes the 'counter_id' feature, then drops the original column.

In [88]:
from sklearn.base import BaseEstimator, TransformerMixin  
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

class ColumnSelector(BaseEstimator, TransformerMixin):
    """Keep only a chosen subset of columns from a DataFrame.

    Parameters
    ----------
    columns : sequence of str, optional
        Column labels to keep. When ``None`` (the default) the transformer
        selects ``'counter_id'`` and ``'date'``.
    """

    def __init__(self, columns=None):
        # Stored unmodified so that get_params / clone keep working.
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the selected columns of ``X``.

        Raises ``KeyError`` (from pandas) if a requested column is missing.
        """
        if self.columns is None:
            selected = ['counter_id', 'date']
        else:
            selected = list(self.columns)
        return X[selected]

class DateFormatter(BaseEstimator, TransformerMixin):
    """Replace the 'date' column with its calendar and clock components."""

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a new frame where 'date' is expanded into six features.

        The input is left untouched. The output holds every column of ``X``
        except 'date', plus 'year', 'month', 'week' (ISO week number), 'day',
        'hour' and 'minute'.
        """
        timestamps = pd.to_datetime(X['date'])
        date_parts = {
            'year': timestamps.dt.year,
            'month': timestamps.dt.month,
            'week': timestamps.dt.isocalendar().week,
            'day': timestamps.dt.day,
            'hour': timestamps.dt.hour,
            'minute': timestamps.dt.minute,
        }
        return X.drop(columns='date').assign(**date_parts)


class EncodeCounter(BaseEstimator, TransformerMixin):
    """One-hot encode 'counter_id' against the categories seen during fit.

    Encoding against the fitted categories keeps the output columns stable:
    every frame passed to ``transform`` yields the same dummy columns in the
    same order, whichever counters it happens to contain.

    Attributes
    ----------
    categories_ : pandas.Index
        Categories of 'counter_id' recorded by ``fit``.
    """

    def fit(self, X, y=None):
        """Record the categories of 'counter_id' found in ``X``."""
        # pd.Categorical keeps the declared categories of a categorical
        # column and derives them from the observed values otherwise.
        self.categories_ = pd.Categorical(X['counter_id']).categories
        return self

    def transform(self, X):
        """Return a copy of ``X`` with 'counter_id' replaced by dummy columns.

        Values not seen during ``fit`` become missing in the recoded column,
        so they produce a row of zeros across the dummy columns.
        """
        X_copy = X.copy()
        categories = getattr(self, 'categories_', None)
        # Without a prior fit, fall back to encoding whatever values are present.
        if categories is not None:
            X_copy['counter_id'] = pd.Categorical(
                X_copy['counter_id'], categories=categories
            )
        return pd.get_dummies(X_copy, columns=['counter_id'], dtype=int)




# Pre-processing steps, applied in order: select columns, expand the date,
# then one-hot encode the counter.
preprocess_steps = [
    ("ColumnSelector", ColumnSelector()),
    ("DateFormatter", DateFormatter()),
    ("EncodeCounter", EncodeCounter()),
]
preprocess = Pipeline(steps=preprocess_steps)
        


In [89]:

import os

# Load the training data from the sibling Datasets directory.
train_path = os.path.join("..", "Datasets", "train.parquet")
df = pd.read_parquet(train_path)


In [95]:
# Build the feature matrix and the target from the training data
X = preprocess.fit_transform(df)
y = df['log_bike_count']

# Import necessary libraries
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a linear regression on the training split
model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out split and score the predictions
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("Root Mean Squared Error:", rmse)


Root Mean Squared Error: 1.4367991504850186


Now, let us train on the entire training set and predict on the test set.

In [100]:
# Fit on the full train set:
X = preprocess.fit_transform(df)
y = df['log_bike_count']

# Initialize the Linear Regression model and fit it on all training rows
model = LinearRegression()
model.fit(X, y)

# Import test set
df_test = pd.read_parquet(os.path.join("..", "Datasets", "final_test.parquet"))

# Reuse the pre-processor fitted on the training data: calling fit_transform
# here would re-fit it on the test set.
# Then align the columns with the training matrix so the model receives the
# same features in the same order: columns missing from the test frame are
# added as zeros, and columns absent from training are dropped.
df_test = preprocess.transform(df_test).reindex(columns=X.columns, fill_value=0)
predictions = model.predict(df_test)

# Store predictions in pandas DataFrame:
predictions = pd.DataFrame({'Id': range(len(predictions)), 'log_bike_count': predictions})

# Specify the file path
csv_file_path = 'submission.csv'

# Write the DataFrame to a CSV file
predictions.to_csv(csv_file_path, index=False)
