In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.feature_selection import VarianceThreshold
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# Seeded random state shared by the random train-test split and the LassoCV estimators
prng = np.random.RandomState(20250317)

# Show three decimals in IPython, pandas and NumPy output
%precision 3
pd.set_option('display.precision', 3)
np.set_printoptions(suppress=True, precision=3)

# Predict the demand for bike share using linear models

Our goal is to predict demand for bike share based on [this](https://www.kaggle.com/c/bike-sharing-demand) Kaggle task.
Kaggle provides two data sets: a labelled train data and an unlabelled test data.
We have to use the train data to predict labels for the test data. The data consists of hourly rental data spanning two years. The training set is comprised of the first 19 days of each month, while the test set is the 20th to the end of the month.
Kaggle won't give us the test labels, just the score we achieved on the test set.


### Know your data

In [None]:
# Read the bike-share sample from the course repository and preview the first rows
BIKE_SAMPLE_URL = "https://raw.githubusercontent.com/divenyijanos/ceu-ml/2025/data/bike_sharing_demand/bike_sample.csv"
bike_data = pd.read_csv(BIKE_SAMPLE_URL)
bike_data.head()

In [None]:
# Summary statistics for a first look at the columns
bike_data.describe()

In [None]:
# (number of rows, number of columns)
bike_data.shape

In [None]:
# Count the missing values in each column
bike_data.isnull().sum()

In [None]:
# Total rentals per calendar day in 2011, shown as a bar chart
bike_data['datetime'] = pd.to_datetime(bike_data['datetime'])
is_2011 = bike_data['datetime'].dt.year == 2011
bike_2011 = bike_data[is_2011]
rental_day = bike_2011['datetime'].dt.date
daily_counts = bike_2011.groupby(rental_day)['count'].sum()
dates, counts = daily_counts.index, daily_counts.values

plt.bar(dates, counts, color='darkblue')
plt.title("Daily Rentals for 2011")
plt.xlabel("Date")
plt.ylabel("Rental Count")
plt.xticks(rotation=45)
plt.show()

### Train-test split

In [7]:
# train-test split keeping numeric features

# `casual` and `registered` are the two components of `count` (see the Kaggle data
# description), so they must not end up among the features: any model that used them
# would see the label itself. They are dropped here, just like in the
# feature-engineered matrix built later in the notebook.
features = bike_data.drop(columns=['count', 'registered', 'casual']).select_dtypes(include=np.number)
label = bike_data['count']
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=prng)

In [None]:
# choose a better train-test split: days 1-15 of every month train, the later days test

day_of_month = pd.to_datetime(bike_data['datetime']).dt.day
train_indices = day_of_month <= 15
X_train, X_test = features[train_indices], features[~train_indices]
y_train, y_test = label[train_indices], label[~train_indices]

print(f"Resulting size of the test is: {y_test.shape[0] / bike_data.shape[0]:.2%}")


### Evaluation function

**TODO**: Write a loss function that calculates the Root Mean Squared Log Error (RMSLE).

In [9]:
# define a loss function
def calculateRMSLE(prediction, y_obs):
    """Return the Root Mean Squared Log Error between predictions and observations.

    RMSLE = sqrt(mean((log(1 + prediction) - log(1 + y_obs))^2))

    Parameters
    ----------
    prediction : scalar or array-like
        Predicted values. A scalar (e.g. a constant benchmark) is broadcast
        against ``y_obs``. Negative predictions are clipped to 0, because a
        linear model can predict below zero and the logarithm is undefined
        for values <= -1.
    y_obs : array-like
        Observed, non-negative values.

    Returns
    -------
    float
        The RMSLE; 0 means a perfect fit.
    """
    clipped_prediction = np.clip(np.asarray(prediction, dtype=float), 0, None)
    observed = np.asarray(y_obs, dtype=float)
    log_difference = np.log1p(clipped_prediction) - np.log1p(observed)
    return float(np.sqrt(np.mean(log_difference ** 2)))

### Benchmark

**TODO**: Estimate a _very_ simple benchmark model (average), and evaluate its performance on both the train and the test set.

In [10]:
# estimate benchmark model: predict the average of the training labels for every observation
benchmark = y_train.mean()

In [11]:
# Helper to keep track of the results
class ResultCollector:
    """Collects the train/test RMSLE of each model and renders them as a table."""

    def __init__(self):
        # model name -> {'Train RMSLE': ..., 'Test RMSLE': ...}
        self.results = {}

    def add_model(self, name, train_error, test_error):
        """Store (or overwrite) the errors of `name` and return the styled table."""
        entry = {'Train RMSLE': train_error, 'Test RMSLE': test_error}
        self.results[name] = entry
        return self.get_table()

    def get_table(self, style=True):
        """Return the results with one row per model; styled unless `style` is falsy."""
        table = pd.DataFrame(self.results).T
        if not style:
            return table
        styler = table.style.format("{:.3f}")
        return styler.background_gradient(cmap='RdYlGn_r', axis=None)

In [None]:
# Start the results table with the benchmark's in-sample and out-of-sample error
results = ResultCollector()
train_error = calculateRMSLE(benchmark, y_train)
test_error = calculateRMSLE(benchmark, y_test)
results.add_model("Benchmark", train_error, test_error)

### Model #1: Group averages

#### Statistics recap: linear regression and averages

If you fit a linear regression model using a **dummy variable** without any other features, the model will predict the average outcome for the groups represented by the dummy variable. A binary variable with numeric values 0 and 1 behaves as a dummy variable without any further transformation. The estimated intercept of the linear regression captures the average of the reference category (for which the binary variable takes the value of 0), while the coefficient expresses the difference between the averages of the two categories.

In [None]:
# illustration on a single binary variable
holiday_train = X_train[['holiday']]
lin_reg_simple_dummy = LinearRegression()
lin_reg_simple_dummy.fit(holiday_train, y_train)
prediction = lin_reg_simple_dummy.predict(holiday_train)

# put the dummy, the label and the fitted value side by side ...
joint_data = pd.DataFrame({
    'dummy': X_train['holiday'].values,
    'y': y_train,
    'prediction': prediction
})
# ... and compare their group means
joint_data.groupby('dummy')[['y', 'prediction']].mean()

In [None]:
# Compare it to estimated coefficients: intercept, and intercept plus the holiday coefficient
intercept = lin_reg_simple_dummy.intercept_
holiday_effect = lin_reg_simple_dummy.coef_[0]
[intercept, intercept + holiday_effect]

For a categorical variable with **multiple categories**, we have to represent each category level by its own dummy variable (this is called one-hot encoding, and can be achieved by `pd.get_dummies()` or `OneHotEncoder`).
To avoid multicollinearity issues, you need to omit one of the dummy variables. Including all of them in the model would lead to redundant information because the value of one dummy variable can be predicted from the values of the other dummy variables. By omitting one dummy variable, you set it as the reference category, and the coefficients of the remaining dummy variables represent the difference in the outcome variable between each category and the reference category. With the constant term, you estimate as many coefficients as there are category levels.

In [None]:
# illustration on a multi-level categorical variable

# the first level is dropped and acts as the reference category
one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first')
lin_reg_multicategory_dummy = Pipeline([
    ('dummify', one_hot_encoder),
    ('ols', LinearRegression())
])

season_train = X_train[['season']]
lin_reg_multicategory_dummy.fit(season_train, y_train)
prediction = lin_reg_multicategory_dummy.predict(season_train)

joint_data = pd.DataFrame({
    'season': X_train['season'].values,
    'y': y_train,
    'prediction': prediction
})

# group means of the label and of the fitted values
joint_data.groupby('season')[['y', 'prediction']].mean()

In [None]:
# check the data which is used for fitting: the one-hot encoded season column
# (the encoder was created with drop='first')
lin_reg_multicategory_dummy['dummify'].fit_transform(X_train[['season']])

In [None]:
# Compare it to estimated coefficients: the intercept, then the intercept plus each coefficient
lm_model = lin_reg_multicategory_dummy['ols']
[lm_model.intercept_, *(lm_model.intercept_ + lm_model.coef_)]

If you have **multiple categorical variables**, life gets complicated. To ensure that you estimate as many coefficients as there are combinations of categories to capture the average of each group, we also need to include the _interaction_ of the dummies. You can achieve this with `PolynomialFeatures`. 

In [None]:
# Illustration on multiple categorical variables

# products of the dummies only (no squares, no constant column)
create_interactions = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)

lin_reg_complex_dummies = Pipeline([
    ('dummify', one_hot_encoder),
    ('create_interactions', create_interactions),
    ('ols', LinearRegression())
])

group_columns = X_train[['season', 'workingday']]
lin_reg_complex_dummies.fit(group_columns, y_train)
prediction = lin_reg_complex_dummies.predict(group_columns)

# the grouping columns next to the label and the fitted value
joint_data = group_columns.assign(y=y_train, prediction=prediction)
joint_data.groupby(['workingday', 'season'])[['y', 'prediction']].mean()

It indeed predicts group averages.

In [None]:
# We have more coefficients than the number of combinations of categories -- some of the coefficients are zero (the overall effect is the same as the intercept)
ols_step = lin_reg_complex_dummies['ols']
[ols_step.intercept_, *(ols_step.intercept_ + ols_step.coef_)]



However, if some of your categorical variables are multilevel, the interaction of these dummies within the same category (e.g. `season_2 * season_3`) will be constant zero, and you will get a linearly-dependent (rank-deficient) feature matrix. The optimization algorithm of the `LinearRegression` will give you a solution anyway estimating 0-s for the corresponding coefficients. It might be better to exclude variables with zero variance right away using the `VarianceThreshold` method (which defaults to the threshold of zero variance).

In [None]:
# Same model as before, with the zero-variance interaction columns removed before OLS
lin_reg_complex_dummies = Pipeline([
    ('dummify', one_hot_encoder),
    ('create_interactions', create_interactions),
    ('drop_zero_variance', VarianceThreshold()),
    ('ols', LinearRegression())
])

group_columns = X_train[['season', 'workingday']]
lin_reg_complex_dummies.fit(group_columns, y_train)
prediction = lin_reg_complex_dummies.predict(group_columns)

# the grouping columns next to the label and the fitted value
joint_data = group_columns.assign(y=y_train, prediction=prediction)
joint_data.groupby(['workingday', 'season'])[['y', 'prediction']].mean()

#### Technical detour: feature transformation within Pipeline

If you want to apply specific transformations on some columns, you can use `ColumnTransformer`. A `ColumnTransformer` takes a list of transformations and some optional parameters (like what to do with the `remainder` columns that were not specified in the transformation steps; defaults to `"drop"`, change to `"passthrough"` if you want to keep them). Each transformation consists of a three-element tuple: a name (you choose it), the operation, and the columns the operation needs to be applied to (given by a list of names, indices, etc.).

While `Pipeline` executes the steps sequentially, `ColumnTransformer` applies the listed operations at once on different sets of columns.

See [this Medium post](https://towardsdatascience.com/simplifying-machine-learning-model-development-with-columntransformer-pipeline-f09ffb04ca6b) for a more detailed explanation.

Here we would like to apply multiple steps sequentially on a specific set of the original columns in our training data, so we will pass a `Pipeline` to the `ColumnTransformer`. Then, we would like to estimate a model, so we build another `Pipeline` for the whole process.

In [None]:
# Columns that are one-hot encoded and interacted with each other
dummy_features = ["season", "workingday"]

one_hot_encoder = OneHotEncoder(sparse_output=False, drop="first")

# dummies -> pairwise interactions -> drop the constant columns
create_categorical_features = Pipeline([
    ("dummify", one_hot_encoder),
    ("create_interactions", create_interactions),
    ("drop_zero_variance", VarianceThreshold())
])

# the ColumnTransformer picks the dummy columns and runs the inner pipeline on them
select_and_transform = ColumnTransformer(
    [("choose_and_transform_features", create_categorical_features, dummy_features)]
)
pipe_whole_process = Pipeline([
    ("create_features", select_and_transform),
    ("ols", LinearRegression())
])
pipe_whole_process

In [None]:
# Note: we pass the whole X_train, not just the selected columns,
# because the ColumnTransformer does the column selection
pipe_whole_process.fit(X_train, y_train)

In [None]:
# Double-check we got the same result from the plain pipeline and the ColumnTransformer version
rmsle_plain_pipeline = calculateRMSLE(lin_reg_complex_dummies.predict(X_train[['season', 'workingday']]), y_train)
rmsle_column_transformer = calculateRMSLE(pipe_whole_process.predict(X_train), y_train)
(rmsle_plain_pipeline, rmsle_column_transformer)

#### Estimate model #1: group averages by season, workingday and holiday

In [None]:
# estimate & evaluate model #1
dummy_features = ["season", "workingday", "holiday"]

group_avg_features = ColumnTransformer(
    [("choose_and_transform_features", create_categorical_features, dummy_features)]
)
steps = [
    ("create_features", group_avg_features),
    ("ols", LinearRegression())
]

pipe_group_avg = Pipeline(steps)
pipe_group_avg.fit(X_train, y_train)

# in-sample and out-of-sample error
fitted_train = pipe_group_avg.predict(X_train)
fitted_test = pipe_group_avg.predict(X_test)
train_error = calculateRMSLE(fitted_train, y_train)
test_error = calculateRMSLE(fitted_test, y_test)

results.add_model("Group averages", train_error, test_error)

### Model #2: Group averages with weather

In [None]:
# Model #2: Group averages with weather
dummy_features = ['season', 'holiday', 'workingday', 'weather']
numeric_features = ['temp', 'atemp', 'humidity', 'windspeed']

# interacted dummies plus the untouched numeric weather measurements
dummies_and_numerics = ColumnTransformer([
    ("choose_and_transform_dummy_features", create_categorical_features, dummy_features),
    ("keep_numeric_features", "passthrough", numeric_features)
])
steps = [
    ("create_features", dummies_and_numerics),
    ("ols", LinearRegression())
]

pipe_group_avg_with_weather = Pipeline(steps)
pipe_group_avg_with_weather.fit(X_train, y_train)

In [None]:
# Score Model #2 on both samples and add it to the results table
train_error = calculateRMSLE(pipe_group_avg_with_weather.predict(X_train), y_train)
test_error = calculateRMSLE(pipe_group_avg_with_weather.predict(X_test), y_test)
results.add_model('Group avgs with weather', train_error, test_error)

### Model #3: Very flexible linear with polynomial features

In [None]:
# Dummies are created without interactions here: the polynomial step below
# builds the interactions across all features
encoded_and_numeric = ColumnTransformer([
    ("create_dummy_features", one_hot_encoder, dummy_features),
    ("keep_numeric_features", "passthrough", numeric_features)
])
steps = [
    ("create_features", encoded_and_numeric),
    ("4_degree_poly", PolynomialFeatures(degree=4, include_bias=False)),
    ("drop_zero_variance", VarianceThreshold()),
    ("ols", LinearRegression())
]

pipe_flexible_linear = Pipeline(steps)
pipe_flexible_linear

In [None]:
# Fit the flexible model, then score it in and out of sample
pipe_flexible_linear.fit(X_train, y_train)
fitted_train = pipe_flexible_linear.predict(X_train)
fitted_test = pipe_flexible_linear.predict(X_test)
train_error = calculateRMSLE(fitted_train, y_train)
test_error = calculateRMSLE(fitted_test, y_test)

results.add_model("Flexible linear", train_error, test_error)


### Model #4: Improve with Lasso

**TODO**: Improve Model#3 by estimating a cross-validated Lasso on the expanded (flexible) dataset.

In [None]:
# Model #4: improve with Lasso
# Same expanded feature set as the flexible linear model. The expanded features are
# standardized because the L1 penalty depends on the scale of each feature, and
# LassoCV picks the penalty strength by cross-validation.
steps = [
    ("create_features", ColumnTransformer([
        ("create_dummy_features", one_hot_encoder, dummy_features),
        ("keep_numeric_features", "passthrough", numeric_features)
    ])),
    ("4_degree_poly", PolynomialFeatures(degree=4, include_bias=False)),
    ("drop_zero_variance", VarianceThreshold()),
    ("scale", StandardScaler()),
    ("lasso", LassoCV(random_state=prng))
]
pipe_lasso = Pipeline(steps)

pipe_lasso.fit(X_train, y_train)

In [None]:
# Score the Lasso on both samples and add it to the results table
fitted_train = pipe_lasso.predict(X_train)
fitted_test = pipe_lasso.predict(X_test)
train_error, test_error = calculateRMSLE(fitted_train, y_train), calculateRMSLE(fitted_test, y_test)

results.add_model("Flexible LASSO", train_error, test_error)

**Lessons:**

- Gradually adding more of the information present in the training data improves our models' performance.
- Being very flexible without any penalty for the complexity leads to overfitting (test error >> train error).
- Choosing a method with regularization (and tuning the hyperparameter automatically by `LassoCV`) makes it possible to exploit flexibility without overfitting - however, the performance gain from flexibility is small.

## Improve the models

### Diagnostics

In [None]:
# Observed vs. predicted rentals on the test set; the dashed red line marks perfect predictions
linear_predictions = pipe_group_avg_with_weather.predict(X_test)
lasso_predictions = pipe_lasso.predict(X_test)

for model_label, predicted in (('Linear', linear_predictions), ('Lasso', lasso_predictions)):
    plt.scatter(y_test, predicted, label=model_label, alpha=0.5)
plt.axline((1, 1), slope=1, linestyle='dashed', color='red')
plt.xlabel('Observed')
plt.ylabel('Predicted')
plt.legend()

In [None]:
# Inspect the rows with fewer than 10 rentals
bike_data[bike_data['count'] < 10]

### Feature engineering

In [33]:
def extractDtFeatures(df_with_datetime):
    """Add calendar columns derived from the `datetime` column, modifying the frame in place.

    The `datetime` column is parsed with `pd.to_datetime(..., utc=True)` and
    overwritten; then `year`, `month`, `hour` and `dayofweek` columns are added.
    Nothing is returned.
    """
    df_with_datetime['datetime'] = pd.to_datetime(df_with_datetime['datetime'], utc=True)
    timestamp_parts = df_with_datetime['datetime'].dt
    for part in ('year', 'month', 'hour', 'dayofweek'):
        df_with_datetime[part] = getattr(timestamp_parts, part)


# adds the year / month / hour / dayofweek columns to bike_data in place
extractDtFeatures(bike_data)

In [34]:
# Numeric feature matrix including the new calendar columns;
# count, registered and casual are excluded
non_feature_columns = ["count", "registered", "casual"]
feature_matrix = bike_data.drop(columns=non_feature_columns).select_dtypes(include=np.number)

# reuse the earlier calendar-based split
X_train_fe, X_test_fe = feature_matrix[train_indices], feature_matrix[~train_indices]

#### Linear (FE)

We created many new categorical variables. Creating all the interactions would mean lots of parameters and we would be back in the "very flexible" scenario. Let's estimate a simpler linear model instead, where we only include the dummy variables but not their interactions.

In [None]:
# All categorical columns, now including the calendar features
dummy_features = ['season', 'holiday', 'workingday', 'weather', 'year', 'month', 'hour', 'dayofweek']

# plain dummies (no interactions) next to the untouched numeric columns
dummies_and_numerics = ColumnTransformer([
    ("create_dummies", one_hot_encoder, dummy_features),
    ("keep_numeric_features", "passthrough", numeric_features)
])
steps = [
    ("create_features", dummies_and_numerics),
    ("ols", LinearRegression())
]

pipe_linear = Pipeline(steps)
pipe_linear.fit(X_train_fe, y_train)

In [None]:
# Score the feature-engineered linear model on both samples
fitted_train = pipe_linear.predict(X_train_fe)
fitted_test = pipe_linear.predict(X_test_fe)
train_error, test_error = calculateRMSLE(fitted_train, y_train), calculateRMSLE(fitted_test, y_test)

results.add_model("Feature engineered linear", train_error, test_error)


#### Lasso (FE)

In [None]:
# Lasso
# dummies for the categorical columns, standardized numeric columns
encode_and_scale = ColumnTransformer([
    ("dummify", one_hot_encoder, dummy_features),
    ("scale", StandardScaler(), numeric_features)
])
steps = [
    ("dummify_selected_columns", encode_and_scale),
    ("2_degree_poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("lasso", LassoCV(random_state=prng))
]
pipe_lasso = Pipeline(steps)
pipe_lasso.fit(X_train_fe, y_train)

In [None]:
# Score the feature-engineered Lasso on both samples
fitted_train = pipe_lasso.predict(X_train_fe)
fitted_test = pipe_lasso.predict(X_test_fe)
train_error, test_error = calculateRMSLE(fitted_train, y_train), calculateRMSLE(fitted_test, y_test)

results.add_model("Feature engineered Lasso", train_error, test_error)


**Lessons:**

- You should always look for more information hidden in your data.
- Extracting information hidden in the non-numeric `datetime` column resulted in a huge improvement for both OLS and Lasso.

### Collect more data

In [None]:
# Read the larger data set and check its size
BIKE_FULL_URL = "https://raw.githubusercontent.com/divenyijanos/ceu-ml/2023/data/bike_sharing_demand/train.csv"
bike_full = pd.read_csv(BIKE_FULL_URL)
bike_full.shape

In [None]:
# for comparison: size of the sample used so far
bike_data.shape

In [41]:
# feature engineering: add the year / month / hour / dayofweek columns to bike_full in place
extractDtFeatures(bike_full)

In [None]:
# Count the missing values in each column of the larger data set
bike_full.isnull().sum()

In [None]:
# Ensure the test set remains intact -> all the new data goes into the training set
original_test_datetimes = bike_data.filter(X_test.index, axis=0)['datetime']
in_original_test = bike_full.datetime.isin(original_test_datetimes)
full_data_without_original_test = bike_full.loc[~in_original_test]
full_data_without_original_test.shape

In [44]:
# Features and label of the enlarged training set
columns_to_exclude = ["count", "registered", "casual", "datetime"]
y_full = full_data_without_original_test['count']
X_full = full_data_without_original_test.drop(columns=columns_to_exclude)

#### Group averages with weather (full data)

In [None]:

# Refit the Model #2 pipeline on the enlarged training data; the test set is unchanged
pipe_group_avg_with_weather.fit(X_full, y_full)

fitted_full = pipe_group_avg_with_weather.predict(X_full)
fitted_test = pipe_group_avg_with_weather.predict(X_test)
train_error, test_error = calculateRMSLE(fitted_full, y_full), calculateRMSLE(fitted_test, y_test)

results.add_model("Group avgs with weather large n", train_error, test_error)


#### Linear model (FE, full)

In [None]:

# Refit the feature-engineered linear pipeline on the enlarged training data
pipe_linear.fit(X_full, y_full)

In [None]:
# evaluate: error on the enlarged training data and on the original test set
fitted_full = pipe_linear.predict(X_full)
fitted_test = pipe_linear.predict(X_test_fe)
train_error, test_error = calculateRMSLE(fitted_full, y_full), calculateRMSLE(fitted_test, y_test)

results.add_model("Feature engineered linear large n", train_error, test_error)


#### Lasso (FE, Full)

In [None]:

# Refit the feature-engineered Lasso pipeline on the enlarged training data
pipe_lasso.fit(X_full, y_full)

In [None]:
# evaluate: error on the enlarged training data and on the original test set
fitted_full = pipe_lasso.predict(X_full)
fitted_test = pipe_lasso.predict(X_test_fe)
train_error, test_error = calculateRMSLE(fitted_full, y_full), calculateRMSLE(fitted_test, y_test)

results.add_model("Feature engineered Lasso large n", train_error, test_error)

**Lessons:**

- Collecting more samples from the same domain could help
- But only if the model is flexible enough to capture new (more subtle) patterns. A simple average is usually stable enough once you have 20 observations so collecting more won't have much impact. However, models that allow for complexity, such as the lasso on complex transformations, could benefit from the new set of training data.