-------------------------------------------------------------<br>
<b> BIKE SHARING DEMAND PREDICTION PROJECT </b><br>
End-to-end Python code: Data Preparation → Model → Evaluation<br>
-------------------------------------------------------------<br>

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

-----------------------------------------
<h5> 1. LOAD DATA </h5>
-----------------------------------------

In [18]:
# Location of the bike-sharing CSV, relative to the notebook's working
# directory. A relative path keeps the notebook runnable on any machine and
# avoids baking a personal user directory into the file.
DATA_PATH = "day.csv"

# Load the raw data and report its dimensions before previewing it.
df = pd.read_csv(DATA_PATH)
print("Dataset Shape :", df.shape)
df.head()

Dataset Shape : (730, 16)


Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,01-01-2018,1,0,1,0,6,0,2,14.110847,18.18125,80.5833,10.749882,331,654,985
1,2,02-01-2018,1,0,1,0,0,0,2,14.902598,17.68695,69.6087,16.652113,131,670,801
2,3,03-01-2018,1,0,1,0,1,1,1,8.050924,9.47025,43.7273,16.636703,120,1229,1349
3,4,04-01-2018,1,0,1,0,2,1,1,8.2,10.6061,59.0435,10.739832,108,1454,1562
4,5,05-01-2018,1,0,1,0,3,1,1,9.305237,11.4635,43.6957,12.5223,82,1518,1600


In [19]:
# Summarise column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 730 entries, 0 to 729
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     730 non-null    int64  
 1   dteday      730 non-null    object 
 2   season      730 non-null    int64  
 3   yr          730 non-null    int64  
 4   mnth        730 non-null    int64  
 5   holiday     730 non-null    int64  
 6   weekday     730 non-null    int64  
 7   workingday  730 non-null    int64  
 8   weathersit  730 non-null    int64  
 9   temp        730 non-null    float64
 10  atemp       730 non-null    float64
 11  hum         730 non-null    float64
 12  windspeed   730 non-null    float64
 13  casual      730 non-null    int64  
 14  registered  730 non-null    int64  
 15  cnt         730 non-null    int64  
dtypes: float64(4), int64(11), object(1)
memory usage: 91.4+ KB


-----------------------------------------
<h5> 2. DATA PREPARATION </h5>
-----------------------------------------

<h6> Map season & weathersit to labels (Categorical) </h6>

In [20]:
# Lookup tables that translate the integer codes (starting at 1) stored in
# the `season` and `weathersit` columns into readable category labels.
season_map = dict(enumerate(("spring", "summer", "fall", "winter"), start=1))
weather_map = dict(
    enumerate(("clear", "mist", "light_precipitation", "severe"), start=1)
)

In [22]:
# Add label-valued copies of the two coded columns; codes that are missing
# from a lookup table come out as NaN.
df = df.assign(
    season_cat=df["season"].map(season_map),
    weather_cat=df["weathersit"].map(weather_map),
)

In [23]:
# Columns removed before modelling:
#   - instant, dteday     : not needed as predictors
#   - season, weathersit  : replaced by the *_cat label columns
#   - casual, registered  : leakage (they add up to cnt in the preview rows)
cols_to_drop = [
    "instant", "dteday",
    "season", "weathersit",
    "casual", "registered",
]
df = df.drop(columns=cols_to_drop)

In [24]:
# One-hot encode the label columns. drop_first=True omits one level per
# column so the dummies are not perfectly collinear with an intercept.
categorical_cols = ["season_cat", "weather_cat"]
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

-----------------------------------------
<h5> 3. SPLIT INTO X AND y </h5>
-----------------------------------------

In [25]:
# Separate the target from the predictors.
target_col = "cnt"
y = df[target_col]
X = df.drop(columns=[target_col])

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.20, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("\nTrain/Test shapes:", X_train.shape, X_test.shape)


Train/Test shapes: (584, 14) (146, 14)


-----------------------------------------
<h5> 5. VIF – Multicollinearity Check </h5>
-----------------------------------------

In [52]:
# Multicollinearity check on the training features.
# Work on an explicit copy so the column assignment below never writes into
# a view of the frame returned by train_test_split.
X_train = X_train.copy()

# 1. Convert boolean columns to integer
bool_cols = X_train.select_dtypes(include=["bool"]).columns
X_train[bool_cols] = X_train[bool_cols].astype(int)

# 2. Convert all values to numeric; anything unparseable becomes NaN
X_train = X_train.apply(pd.to_numeric, errors="coerce")

# 3. Drop any rows with NaN and keep the target aligned with the rows that
#    remain, otherwise a later fit would receive X and y of different lengths
X_train = X_train.dropna()
y_train = y_train.loc[X_train.index]

# 4. Plain float array of the cleaned training features
X_np = X_train.astype(float).values

# 5. Variance inflation factor for every feature. An explicit constant column
#    is added so each auxiliary regression includes an intercept; the
#    constant itself (position 0) is excluded from the report.
X_vif = sm.add_constant(X_train.astype(float), has_constant="add")
vif_df = (
    pd.DataFrame({
        "Feature": X_vif.columns[1:],
        "VIF": [
            variance_inflation_factor(X_vif.values, col_idx)
            for col_idx in range(1, X_vif.shape[1])
        ],
    })
    .sort_values("VIF", ascending=False)
    .reset_index(drop=True)
)

vif_df


-----------------------------------------
<h5> 6. SKLEARN LINEAR REGRESSION (For Prediction) </h5>
-----------------------------------------

In [28]:
# Fit the linear model on the training split.
lr = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = lr.predict(X_test)

# R² on the data the model was fitted on versus the unseen test rows.
train_r2 = lr.score(X_train, y_train)
test_r2 = r2_score(y_test, y_pred)

print("\nTrain R² :", train_r2)
print("Test  R² :", test_r2)


Train R² : 0.8214651977454049
Test  R² : 0.8520828533826305


-----------------------------------------
<h5> 7. COEFFICIENT TABLE </h5>
-----------------------------------------

In [29]:
# Pair each fitted coefficient with its feature name, listing the intercept
# first so the table reads as the full regression equation.
feature_names = ["Intercept", *X_train.columns]
coefficients = [lr.intercept_, *lr.coef_]

coef_df = pd.DataFrame(
    {"Feature": feature_names, "Coefficient": coefficients}
)

print("\nModel Coefficients:")
print(coef_df)


Model Coefficients:
                            Feature  Coefficient
0                         Intercept  2199.517447
1                                yr  1987.528872
2                              mnth   -28.498682
3                           holiday  -491.400139
4                           weekday    67.858245
5                        workingday   134.016348
6                              temp   100.044562
7                             atemp    23.174857
8                               hum   -10.606921
9                         windspeed   -41.981107
10                season_cat_spring  -977.844224
11                season_cat_summer   245.221851
12                season_cat_winter   802.730631
13  weather_cat_light_precipitation -2079.970126
14                 weather_cat_mist  -434.566939
