In [12]:
#importing libraries for Data Manipulation and Machine Learning Model and Evaluation
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [4]:
# Load the dataset from the Excel workbook.
# `df` is always bound once this cell has run: when the file is missing it falls
# back to an empty DataFrame, so the `if not df.empty` guards used throughout the
# notebook skip cleanly instead of failing with NameError.
try:
    df = pd.read_excel('day.xlsx')
except FileNotFoundError:
    print("Error: 'day.xlsx' not found. Please place the dataset file in the correct directory.")
    df = pd.DataFrame()

if not df.empty:
    print(df.head())  # first 5 rows
    df.info()  # column dtypes and non-null counts
    print(df.isnull().sum())  # number of null entries per column

   instant     dteday  season  yr  mnth  holiday  weekday  workingday  \
0        1 2011-01-01       1   0     1        0        6           0   
1        2 2011-01-02       1   0     1        0        0           0   
2        3 2011-01-03       1   0     1        0        1           1   
3        4 2011-01-04       1   0     1        0        2           1   
4        5 2011-01-05       1   0     1        0        3           1   

   weathersit      temp     atemp       hum  windspeed  casual  registered  \
0           2  0.344167  0.363625  0.805833   0.160446     331         654   
1           2  0.363478  0.353739  0.696087   0.248539     131         670   
2           1  0.196364  0.189405  0.437273   0.248309     120        1229   
3           1  0.200000  0.212122  0.590435   0.160296     108        1454   
4           1  0.226957  0.229270  0.436957   0.186900      82        1518   

    cnt  
0   985  
1   801  
2  1349  
3  1562  
4  1600  
<class 'pandas.core.frame.DataFr

In [5]:
# Build the model-ready frame: drop the non-feature columns and one-hot encode
# the categorical ones.
# `df_processed` is bound up front so the `if not df_processed.empty` guards in
# the following cells still work when `df` is empty and the branch below is skipped.
df_processed = pd.DataFrame()
categorical_features = ['season', 'mnth', 'weekday', 'weathersit']

if not df.empty:  # skip preprocessing when no data was loaded
    # convert 'dteday' to datetime (the column itself is dropped just below)
    df['dteday'] = pd.to_datetime(df['dteday'])
    # 'instant' and 'dteday' are dropped along with 'casual' and 'registered';
    # in the rows displayed after loading, casual + registered equals cnt, so
    # keeping them as features would hand the model the target directly
    df_processed = df.drop(columns=['instant', 'dteday', 'casual', 'registered'])

    # drop_first=True omits the first level of each categorical column
    df_processed = pd.get_dummies(df_processed, columns=categorical_features, drop_first=True)
    print(df_processed.columns.tolist())

['yr', 'holiday', 'workingday', 'temp', 'atemp', 'hum', 'windspeed', 'cnt', 'season_2', 'season_3', 'season_4', 'mnth_2', 'mnth_3', 'mnth_4', 'mnth_5', 'mnth_6', 'mnth_7', 'mnth_8', 'mnth_9', 'mnth_10', 'mnth_11', 'mnth_12', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weathersit_2', 'weathersit_3']


In [7]:
if not df_processed.empty:
    # defining features and target
    X = df_processed.drop('cnt', axis=1)
    y = df_processed['cnt']
    # split the raw features first (80% train / 20% test) so the scaler never
    # sees the held-out rows
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # fit the scaler on the training rows only, then apply that same transform
    # to the test rows
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)
    # full feature matrix scaled with the training statistics; kept because a
    # later cell re-splits X_scaled with the same test_size and random_state,
    # which selects the same rows as the split above
    X_scaled = scaler.transform(X)

In [8]:
if not df_processed.empty:
    # create the linear regression model and fit it on the training split
    lr_model = LinearRegression().fit(X_train, y_train)
    # predictions for the held-out split
    y_pred = lr_model.predict(X_test)

    # score the predictions against the held-out targets
    mae, mse, r2 = (
        metric(y_test, y_pred)
        for metric in (mean_absolute_error, mean_squared_error, r2_score)
    )

    # report: one line per metric, under a heading
    print(
        "Linear Regression Model Results: ",
        f"Mean Absolute Error: {mae:.2f}",
        f"Mean Squared Error: {mse:.2f}",
        f"R-Squared: {r2:.3f}",
        sep="\n",
    )

Linear Regression Model Results: 
Mean Absolute Error: 584.85
Mean Squared Error: 647015.48
R-Squared: 0.839


In [11]:
# Two-model approach: one linear regression for casual users and one for
# registered users; their predictions are added to estimate the total count
if not df_processed.empty:
    y_casual, y_registered = df['casual'], df['registered']

    # a single split call keeps the feature rows and all three targets aligned
    (
        X_train_b, X_test_b,
        y_casual_train, y_casual_test,
        y_reg_train, y_reg_test,
        y_total_train, y_total_test,
    ) = train_test_split(
        X_scaled, y_casual, y_registered, df['cnt'],
        test_size=0.2, random_state=42,
    )

    # model for casual users
    model_casual = LinearRegression().fit(X_train_b, y_casual_train)
    y_pred_casual = model_casual.predict(X_test_b)

    # model for registered users
    model_registered = LinearRegression().fit(X_train_b, y_reg_train)
    y_pred_registered = model_registered.predict(X_test_b)

    # negative predictions are floored at zero before the two parts are summed
    y_pred_total = np.maximum(0, y_pred_casual) + np.maximum(0, y_pred_registered)

    # score the summed prediction against the actual total count
    mae_combined, mse_combined, r2_combined = (
        metric(y_total_test, y_pred_total)
        for metric in (mean_absolute_error, mean_squared_error, r2_score)
    )

    print(
        "\nPerformance of the Two-Model Approach :",
        f"Mean Absolute Error : {mae_combined:.2f}",
        f"Mean Squared Error : {mse_combined:.2f}",
        f"R-squared : {r2_combined:.4f}",
        sep="\n",
    )


Performance of the Two-Model Approach :
Mean Absolute Error : 567.58
Mean Squared Error : 633101.86
R-squared : 0.8421
