
# Bike Demand Prediction - Multiple Linear Regression
This notebook reproduces the full analysis end to end: data loading, preprocessing, dummy encoding, VIF checks, backward elimination by p-value, the final OLS model fit, residual diagnostics, and test-set evaluation.


In [None]:

import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import warnings
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Let wide frames (many dummy columns) render up to 200 columns.
pd.options.display.max_columns = 200


In [None]:

# Read day.csv from the working directory (adjust the path if it lives elsewhere)
# and parse the dteday column as day-month-year dates.
df = pd.read_csv('day.csv').assign(
    dteday=lambda frame: pd.to_datetime(frame['dteday'], format='%d-%m-%Y')
)
df.head()


In [None]:

# Lookup tables turning the numeric codes into readable labels.
season_map = {1:'spring', 2:'summer', 3:'fall', 4:'winter'}
weather_map = {1:'clear', 2:'cloudy', 3:'light_rain', 4:'heavy_rain'}

# Remove the serial index, relabel the coded columns, then fix dtypes:
# the binary flags stay integers, weekday/mnth become strings so they can be
# dummy-encoded as categories later on.
df = (
    df.drop(columns=['instant'])
      .assign(
          season=lambda frame: frame['season'].map(season_map),
          # Codes missing from weather_map fall back to 'other'.
          weathersit=lambda frame: frame['weathersit'].map(
              lambda code: weather_map.get(code, 'other')
          ),
      )
      .astype({
          'yr': int,
          'holiday': int,
          'workingday': int,
          'weekday': str,
          'mnth': str,
      })
)
df.shape


In [None]:

# One-hot encode the categorical columns. drop_first=True drops one level per
# category so the remaining dummies are not perfectly collinear with the intercept.
# dtype=int is deliberate: pandas >= 2.0 returns bool dummies by default, and a
# frame mixing bool and float columns becomes an object array that statsmodels
# OLS refuses to fit.
cat_cols = ['season','weathersit','weekday','mnth']
df_dummies = pd.get_dummies(df, columns=cat_cols, drop_first=True, dtype=int)

# Drop columns not used for modelling.
# NOTE(review): casual/registered are presumably components of cnt (target
# leakage) -- confirm against the data dictionary.
df_dummies = df_dummies.drop(columns=['casual','registered','dteday'])

# Drop atemp (when present) because of its collinearity with temp.
if 'atemp' in df_dummies.columns:
    df_dummies = df_dummies.drop(columns=['atemp'])

df_dummies.shape


In [None]:

# Separate predictors from the target (cnt), then hold out 30% of the rows
# for testing with a fixed seed so the split is reproducible.
X = df_dummies.drop(columns='cnt')
y = df_dummies.loc[:, 'cnt']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)
(X_train.shape, X_test.shape)


In [None]:

def compute_vif(X_df):
    """Return the variance inflation factor of every column in ``X_df``.

    The VIFs are computed on the design matrix *with* an intercept, matching
    the OLS models fitted in this notebook. Without the constant the auxiliary
    regressions are forced through the origin and the reported VIFs are
    inflated. The intercept itself is not reported.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Numeric (or bool) predictors, one column per feature, no intercept.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``VIF``, sorted by ``VIF`` descending with a
        fresh 0..n-1 index.
    """
    # Cast to float so bool/int dummy columns do not yield an object array.
    design = sm.add_constant(X_df.astype(float), has_constant='add')
    matrix = design.values

    features = list(X_df.columns)
    # add_constant prepends the intercept, so feature i sits at column i + 1.
    vifs = [variance_inflation_factor(matrix, pos + 1) for pos in range(len(features))]

    vif_data = pd.DataFrame({'feature': features, 'VIF': vifs})
    return vif_data.sort_values('VIF', ascending=False).reset_index(drop=True)

vif_initial = compute_vif(X_train)
vif_initial.head(20)


In [None]:

# Backward elimination based on p-value (train set)
def backward_elimination(X, y, sl=0.05):
    """Drop the least significant feature until all p-values are <= ``sl``.

    Each pass fits an OLS model with an intercept on the remaining features,
    looks at the feature p-values (the intercept is never a removal candidate)
    and removes the feature with the largest p-value if it exceeds ``sl``.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors without an intercept column.
    y : array-like
        Target values aligned with ``X``.
    sl : float, default 0.05
        Significance level; features with p-value above it are removed.

    Returns
    -------
    tuple
        ``(model, X_curr)``: the fitted statsmodels results object of the last
        fit and the frame restricted to the surviving features. The model was
        fitted on exactly the columns of ``X_curr`` plus ``const``.
    """
    X_curr = X.copy()
    while True:
        # has_constant='add' guarantees a 'const' column even when a feature is
        # itself constant; the float cast keeps bool/int dummies from turning
        # the design matrix into an object array.
        X_sm = sm.add_constant(X_curr.astype(float), has_constant='add')
        model = sm.OLS(y, X_sm).fit()
        pvals = model.pvalues.drop('const')
        if pvals.empty:
            # Nothing left to eliminate: the model is intercept-only.
            break
        max_p = pvals.max()
        if max_p > sl:
            remove_feat = pvals.idxmax()
            print(f'Removing {remove_feat} with p-value {max_p:.4f}')
            X_curr = X_curr.drop(columns=[remove_feat])
        else:
            break
    return model, X_curr

model_be, X_train_selected = backward_elimination(X_train, y_train, sl=0.05)
print('\nSelected features count:', X_train_selected.shape[1])


In [None]:

# Re-check multicollinearity on the features that survived elimination.
vif_selected = X_train_selected.pipe(compute_vif)
vif_selected.iloc[:40]


In [None]:

# Evaluate on test set
# Restrict the test frame to the selected features, in training column order.
X_test_selected = X_test[X_train_selected.columns]
# has_constant='add' forces the intercept column: the default ('skip') omits it
# whenever some test column happens to be constant, which would misalign the
# design matrix with the fitted model. The float cast mirrors numeric training
# data and avoids object arrays from bool dummies.
X_test_selected_sm = sm.add_constant(X_test_selected.astype(float), has_constant='add')
y_pred_test = model_be.predict(X_test_selected_sm)
test_r2 = r2_score(y_test, y_pred_test)
print(f'Test R-squared: {test_r2:.4f}')


In [None]:

# Residual diagnostics on the training fit.
# Each figure uses explicit figure/axes handles instead of pyplot's implicit
# "current figure", and statsmodels is taken from the imports cell at the top
# rather than re-imported here.
residuals = model_be.resid
fitted = model_be.fittedvalues

# 1) Distribution of the residuals.
fig_hist, ax_hist = plt.subplots(figsize=(6,4))
ax_hist.hist(residuals, bins=30)
ax_hist.set_title('Histogram of Residuals (Train)')
ax_hist.set_xlabel('Residual')
ax_hist.set_ylabel('Frequency')
ax_hist.grid(True)
plt.show()

# 2) Residuals against fitted values, with a zero reference line.
fig_scatter, ax_scatter = plt.subplots(figsize=(6,4))
ax_scatter.scatter(fitted, residuals, s=10)
ax_scatter.axhline(0, color='k', linewidth=0.8)
ax_scatter.set_title('Residuals vs Fitted (Train)')
ax_scatter.set_xlabel('Fitted values')
ax_scatter.set_ylabel('Residuals')
ax_scatter.grid(True)
plt.show()

# 3) Q-Q plot of the standardized residuals (fit=True) against the 45-degree line.
fig = sm.qqplot(residuals, line='45', fit=True)
fig.axes[0].set_title('Q-Q plot of Residuals (Train)')
plt.show()
