In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score


df = pd.read_csv("SeoulBikeDataClean.csv")

# Time-based train/val/test split
train_df = df[df['datetime'] < '2018-10-01']
val_df   = df[(df['datetime'] >= '2018-10-01') & (df['datetime'] < '2018-11-01')]
test_df  = df[df['datetime'] >= '2018-11-01']

print("Train:", train_df.shape)
print("Val:  ", val_df.shape)
print("Test: ", test_df.shape)

Train: (6984, 40)
Val:   (665, 40)
Test:  (648, 40)


In [2]:
full_features = ['temperaturec', 'humiditypct',
       'wind_speed_m_s', 'visibility_10m', 'dew_point_temperaturec',
       'solar_radiation_mj_m2', 'Autumn',
       'Spring', 'Summer', 'Winter', 'phase0to6', 'phase7to9', 'phase10to17', 'phase18to19', 'phase20to24', 'is_raining',
       'is_snowing', 'temperaturec_squared', 'is_weekend', 'is_holiday',
       'is_working_day', 'temp_rain', 'temp_snow', 'humid_rain', 'lag_1',
       'lag_24', 'rolling_24_mean', 'rolling_168_mean']

X_train = train_df[full_features]
y_train = train_df['rented_bike_count']

X_test = train_df[full_features]
y_test = train_df['rented_bike_count']

# Standardize
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Use LassoCV to find the best alpha value using cross-validation
#use logspace for the alphas; use an intervept
lasso_cv = LassoCV(alphas=np.logspace(-10, 1, 50), cv=10, fit_intercept=True)  # Adjust the range of alphas as needed
lasso_cv.fit(X_train_scaled, y_train)

# Train Lasso regression with the best alpha
print("Best alpha: ", lasso_cv.alpha_)
y_pred_train = lasso_cv.predict(X_train_scaled)

# Print the coefficients
coefficients = pd.Series(lasso_cv.coef_, index=X_train.columns)
print(coefficients)

# Calculate R^2 for the training set
r2_train = r2_score(y_train, y_pred_train)
print(f"Lasso R^2 (Training): {r2_train}")

# Calculate OSR^2 for the test set
X_test_scaled = scaler.transform(X_test)
y_pred = lasso_cv.predict(X_test_scaled)

rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f"Lasso Train RMSE: {rmse:.2f}")
print(f"Lasso Train R²: {r2:.4f}")

Best alpha:  5.963623316594637
temperaturec               30.329883
humiditypct                -6.319734
wind_speed_m_s              0.179294
visibility_10m              9.727239
dew_point_temperaturec      0.000000
solar_radiation_mj_m2      24.710575
Autumn                      0.000000
Spring                      0.000000
Summer                     -0.000000
Winter                     -5.993087
phase0to6                 -23.043572
phase7to9                  24.284812
phase10to17                 0.000000
phase18to19                20.784086
phase20to24               -14.862558
is_raining                -19.183283
is_snowing                 -0.000000
temperaturec_squared        0.000000
is_weekend                -12.836742
is_holiday                 -3.109217
is_working_day              0.069428
temp_rain                 -22.598147
temp_snow                   0.000000
humid_rain                 -0.000000
lag_1                     444.129938
lag_24                    145.229145
rolling

In [3]:
ridge_cv = RidgeCV(alphas=np.logspace(-10, 1, 50), fit_intercept=True, cv=None)
ridge_cv.fit(X_train_scaled, y_train)

print("Best alpha: ", ridge_cv.alpha_)
y_pred_train = ridge_cv.predict(X_train_scaled)

# Print the coefficients
coefficients = pd.Series(ridge_cv.coef_, index=X_train.columns)
print(coefficients)

# Calculate R^2 for the training set
r2_train = r2_score(y_train, y_pred_train)
print(f"Ridge R^2 (Training): {r2_train}")

# Calculate OSR^2 for the test set
X_test_scaled = scaler.transform(X_test)
y_pred = ridge_cv.predict(X_test_scaled)

rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f"Ridge Train RMSE: {rmse:.2f}")
print(f"Ridge Train R²: {r2:.4f}")

Best alpha:  10.0
temperaturec               62.963454
humiditypct               -20.943122
wind_speed_m_s              2.013855
visibility_10m              8.901012
dew_point_temperaturec     -7.365433
solar_radiation_mj_m2      19.158026
Autumn                      4.770831
Spring                      5.293586
Summer                     -3.050991
Winter                     -5.290784
phase0to6                 -19.699467
phase7to9                  32.955866
phase10to17                -1.619421
phase18to19                22.320421
phase20to24               -19.724190
is_raining                -16.190802
is_snowing                  3.448656
temperaturec_squared       -5.421824
is_weekend                 -9.956622
is_holiday                -10.485700
is_working_day              9.956622
temp_rain                 -30.203749
temp_snow                   2.172125
humid_rain                 -7.000666
lag_1                     440.569473
lag_24                    155.488381
rolling_24_mean     