In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import statsmodels
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
import calendar
import warnings
warnings.filterwarnings('ignore')

In [2]:
bike_df_ref = pd.read_csv('C:/Asheesh/upgrade/bikeRent/day.csv')
bike_df_ref.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,01-01-2018,1,0,1,0,6,0,2,14.110847,18.18125,80.5833,10.749882,331,654,985
1,2,02-01-2018,1,0,1,0,0,0,2,14.902598,17.68695,69.6087,16.652113,131,670,801
2,3,03-01-2018,1,0,1,0,1,1,1,8.050924,9.47025,43.7273,16.636703,120,1229,1349
3,4,04-01-2018,1,0,1,0,2,1,1,8.2,10.6061,59.0435,10.739832,108,1454,1562
4,5,05-01-2018,1,0,1,0,3,1,1,9.305237,11.4635,43.6957,12.5223,82,1518,1600


In [3]:
bike_df_ref.shape

(730, 16)

In [4]:
bike_df_ref.describe()

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
count,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0
mean,365.5,2.49863,0.5,6.526027,0.028767,2.99726,0.683562,1.394521,20.319259,23.726322,62.765175,12.76362,849.249315,3658.757534,4508.006849
std,210.877136,1.110184,0.500343,3.450215,0.167266,2.006161,0.465405,0.544807,7.506729,8.150308,14.237589,5.195841,686.479875,1559.758728,1936.011647
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,2.424346,3.95348,0.0,1.500244,2.0,20.0,22.0
25%,183.25,2.0,0.0,4.0,0.0,1.0,0.0,1.0,13.811885,16.889713,52.0,9.04165,316.25,2502.25,3169.75
50%,365.5,3.0,0.5,7.0,0.0,3.0,1.0,1.0,20.465826,24.368225,62.625,12.125325,717.0,3664.5,4548.5
75%,547.75,3.0,1.0,10.0,0.0,5.0,1.0,2.0,26.880615,30.445775,72.989575,15.625589,1096.5,4783.25,5966.0
max,730.0,4.0,1.0,12.0,1.0,6.0,1.0,3.0,35.328347,42.0448,97.25,34.000021,3410.0,6946.0,8714.0


In [5]:
bike_df_ref.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 730 entries, 0 to 729
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     730 non-null    int64  
 1   dteday      730 non-null    object 
 2   season      730 non-null    int64  
 3   yr          730 non-null    int64  
 4   mnth        730 non-null    int64  
 5   holiday     730 non-null    int64  
 6   weekday     730 non-null    int64  
 7   workingday  730 non-null    int64  
 8   weathersit  730 non-null    int64  
 9   temp        730 non-null    float64
 10  atemp       730 non-null    float64
 11  hum         730 non-null    float64
 12  windspeed   730 non-null    float64
 13  casual      730 non-null    int64  
 14  registered  730 non-null    int64  
 15  cnt         730 non-null    int64  
dtypes: float64(4), int64(11), object(1)
memory usage: 91.4+ KB


In [6]:
bike_df_ref['season'] = bike_df_ref['season'].astype(str)
bike_df_ref['weathersit'] = bike_df_ref['weathersit'].astype(str)

In [7]:
bike_df_ref.columns

Index(['instant', 'dteday', 'season', 'yr', 'mnth', 'holiday', 'weekday',
       'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
       'casual', 'registered', 'cnt'],
      dtype='object')

In [16]:
df_train, df_test = train_test_split(bike_df_ref, test_size=0.3, random_state=100)
num_vars=['windspeed', 'cnt','temp', 'atemp', 'hum']
scalar= MinMaxScaler()
df_train[num_vars]=scalar.fit_transform(df_train[num_vars])

(511, 29)
(219, 29)


In [18]:
y_train=df_train.pop('cnt')
X_train=df_train
lm= LinearRegression()
ref= RFE(lm, step=25)
ref.fit(X_train, y_train)

In [19]:
list(zip(X_train.columns, ref.support_, ref.ranking_))

[('yr', True, 1),
 ('holiday', True, 1),
 ('temp', True, 1),
 ('atemp', True, 1),
 ('hum', True, 1),
 ('windspeed', True, 1),
 ('spring', True, 1),
 ('summer', False, 2),
 ('winter', True, 1),
 ('light', True, 1),
 ('mist', True, 1),
 ('Aug', False, 2),
 ('Dec', True, 1),
 ('Feb', False, 2),
 ('Jan', True, 1),
 ('Jul', False, 2),
 ('Jun', False, 2),
 ('Mar', False, 2),
 ('May', False, 2),
 ('Nov', False, 2),
 ('Oct', False, 2),
 ('Sep', True, 1),
 ('Mon', True, 1),
 ('Sat', False, 2),
 ('Sun', False, 2),
 ('Thu', False, 2),
 ('Tue', False, 2),
 ('Wed', False, 2)]

In [20]:
col= X_train.columns[ref.support_]
col

Index(['yr', 'holiday', 'temp', 'atemp', 'hum', 'windspeed', 'spring',
       'winter', 'light', 'mist', 'Dec', 'Jan', 'Sep', 'Mon'],
      dtype='object')

In [21]:
X_train.columns[~ref.support_]

Index(['summer', 'Aug', 'Feb', 'Jul', 'Jun', 'Mar', 'May', 'Nov', 'Oct', 'Sat',
       'Sun', 'Thu', 'Tue', 'Wed'],
      dtype='object')

In [22]:
X_train_ref=df_train[col]
X_train_ref= sm.add_constant(X_train_ref)
lr = sm.OLS(y_train,X_train_ref)
lr_model = lr.fit()
lr_model.summary()

0,1,2,3
Dep. Variable:,cnt,R-squared:,0.84
Model:,OLS,Adj. R-squared:,0.836
Method:,Least Squares,F-statistic:,186.3
Date:,"Sun, 10 Mar 2024",Prob (F-statistic):,3.61e-187
Time:,23:07:04,Log-Likelihood:,506.86
No. Observations:,511,AIC:,-983.7
Df Residuals:,496,BIC:,-920.2
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.3650,0.032,11.520,0.000,0.303,0.427
yr,0.2338,0.008,28.554,0.000,0.218,0.250
holiday,-0.1044,0.026,-4.029,0.000,-0.155,-0.053
temp,0.2973,0.129,2.297,0.022,0.043,0.552
atemp,0.1072,0.137,0.785,0.433,-0.161,0.376
hum,-0.1225,0.038,-3.218,0.001,-0.197,-0.048
windspeed,-0.1701,0.026,-6.446,0.000,-0.222,-0.118
spring,-0.1015,0.015,-6.558,0.000,-0.132,-0.071
winter,0.0564,0.013,4.419,0.000,0.031,0.082

0,1,2,3
Omnibus:,74.034,Durbin-Watson:,2.068
Prob(Omnibus):,0.0,Jarque-Bera (JB):,199.874
Skew:,-0.715,Prob(JB):,3.96e-44
Kurtosis:,5.71,Cond. No.,75.9


In [23]:
vif=pd.DataFrame()
vif['Features']=X_train.columns
vif['VIF']= [variance_inflation_factor(X_train.values,i) for i in range(X_train.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

Unnamed: 0,Features,VIF
2,temp,438.42
3,atemp,384.17
4,hum,38.24
6,spring,11.89
8,winter,11.86
7,summer,6.97
20,Oct,6.38
19,Nov,6.02
11,Aug,5.95
5,windspeed,5.36


In [24]:
df_test[num_vars]= scalar.transform(df_test[num_vars])
df_test.head()
y_test=df_test['cnt']
X_test=df_test[col]
X_test_sm= sm.add_constant(X_test)
y_test_pred= lr_model.predict(X_test_sm)
r2_score(y_true=y_test, y_pred=y_test_pred)

0.8156120322982177

In [25]:
col=col.drop('temp')
X_train_ref=df_train[col]
X_train_ref= sm.add_constant(X_train_ref)
lr = sm.OLS(y_train,X_train_ref)
lr_model = lr.fit()
lr_model.summary()

0,1,2,3
Dep. Variable:,cnt,R-squared:,0.839
Model:,OLS,Adj. R-squared:,0.834
Method:,Least Squares,F-statistic:,198.5
Date:,"Sun, 10 Mar 2024",Prob (F-statistic):,3.42e-187
Time:,23:07:04,Log-Likelihood:,504.15
No. Observations:,511,AIC:,-980.3
Df Residuals:,497,BIC:,-921.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.3688,0.032,11.607,0.000,0.306,0.431
yr,0.2344,0.008,28.524,0.000,0.218,0.251
holiday,-0.1023,0.026,-3.934,0.000,-0.153,-0.051
atemp,0.4122,0.032,12.816,0.000,0.349,0.475
hum,-0.1238,0.038,-3.238,0.001,-0.199,-0.049
windspeed,-0.1617,0.026,-6.162,0.000,-0.213,-0.110
spring,-0.1064,0.015,-6.912,0.000,-0.137,-0.076
winter,0.0506,0.013,4.028,0.000,0.026,0.075
light,-0.2598,0.027,-9.670,0.000,-0.313,-0.207

0,1,2,3
Omnibus:,73.148,Durbin-Watson:,2.074
Prob(Omnibus):,0.0,Jarque-Bera (JB):,204.168
Skew:,-0.695,Prob(JB):,4.63e-45
Kurtosis:,5.767,Cond. No.,18.5


In [26]:
vif=pd.DataFrame()
X_train =X_train[col]
vif['Features']=X_train.columns
vif['VIF']= [variance_inflation_factor(X_train.values,i) for i in range(X_train.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

Unnamed: 0,Features,VIF
3,hum,24.13
2,atemp,14.11
4,windspeed,3.95
5,spring,2.88
8,mist,2.28
6,winter,2.16
0,yr,2.06
10,Jan,1.72
9,Dec,1.34
7,light,1.24


In [27]:
col=col.drop('hum')
X_train_ref=df_train[col]
X_train_ref= sm.add_constant(X_train_ref)
lr = sm.OLS(y_train,X_train_ref)
lr_model = lr.fit()
lr_model.summary()

0,1,2,3
Dep. Variable:,cnt,R-squared:,0.835
Model:,OLS,Adj. R-squared:,0.831
Method:,Least Squares,F-statistic:,210.2
Date:,"Sun, 10 Mar 2024",Prob (F-statistic):,4.04e-186
Time:,23:07:05,Log-Likelihood:,498.82
No. Observations:,511,AIC:,-971.6
Df Residuals:,498,BIC:,-916.6
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.3045,0.025,12.165,0.000,0.255,0.354
yr,0.2373,0.008,28.772,0.000,0.221,0.253
holiday,-0.1036,0.026,-3.947,0.000,-0.155,-0.052
atemp,0.3873,0.032,12.285,0.000,0.325,0.449
windspeed,-0.1406,0.026,-5.478,0.000,-0.191,-0.090
spring,-0.1084,0.016,-6.977,0.000,-0.139,-0.078
winter,0.0437,0.013,3.491,0.001,0.019,0.068
light,-0.2945,0.025,-11.834,0.000,-0.343,-0.246
mist,-0.0796,0.009,-9.067,0.000,-0.097,-0.062

0,1,2,3
Omnibus:,72.926,Durbin-Watson:,2.062
Prob(Omnibus):,0.0,Jarque-Bera (JB):,197.482
Skew:,-0.703,Prob(JB):,1.3100000000000001e-43
Kurtosis:,5.701,Cond. No.,14.6


In [28]:
vif=pd.DataFrame()
X_train =X_train[col]
vif['Features']=X_train.columns
vif['VIF']= [variance_inflation_factor(X_train.values,i) for i in range(X_train.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

Unnamed: 0,Features,VIF
3,windspeed,3.88
2,atemp,3.8
4,spring,2.35
0,yr,2.06
9,Jan,1.64
5,winter,1.62
7,mist,1.52
8,Dec,1.29
11,Mon,1.18
10,Sep,1.16


In [29]:
y_test=df_test['cnt']
X_test=df_test[col]
X_test_sm= sm.add_constant(X_test)
y_test_pred= lr_model.predict(X_test_sm)
r2_score(y_true=y_test, y_pred=y_test_pred)

0.8089300545079784