In [None]:
# Notebook banner
print("Bike Assignment")
import warnings
# Suppress every warning for the rest of the session (this also hides deprecation notices)
warnings.filterwarnings('ignore')

Importing required Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
from sklearn.metrics import r2_score

In [None]:
# Load the dataset from day.csv (path is relative to the working directory)
df = pd.read_csv("day.csv")
# Preview the first rows
print(df.head())

Finding shape, size and details of data set

In [None]:
# Dimensions of the dataset as (rows, columns)
print(df.shape)
# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print() appends a stray "None" line to the output.
df.info()


In [None]:
# Count of missing values in each column
df.isnull().sum()

In [None]:
# Summary statistics of the numeric columns
print(df.describe())


Dropping columns that are not used for modelling, then renaming category codes for better readability

In [None]:
# Remove the 'instant' column (presumably just a record index; confirm against the data dictionary)
df.drop(columns=['instant'], inplace=True)

In [None]:
# Remove the raw date column 'dteday'
df.drop(columns=['dteday'], inplace=True)

In [None]:
# Remove 'casual' and 'registered' (presumably the two components of the target 'cnt'; confirm)
df.drop(columns=['casual', 'registered'], inplace=True)

In [None]:
# Only the last expression of a cell is rendered, so the preview is printed explicitly;
# otherwise the head() result is silently discarded.
print(df.head())
# Column names, dtypes and non-null counts after the drops
df.info()

In [None]:
# Summary statistics of the remaining columns
df.describe()

In [None]:
# Pairwise correlation between the continuous variables and the target 'cnt'
df[['temp','atemp','hum','windspeed','cnt']].corr()

In [None]:
# Drop the column "atemp" from the DataFrame 'df'
# NOTE(review): presumably removed because it is highly correlated with 'temp' in the table above; confirm
df.drop(columns=["atemp"], inplace=True)
# Confirm the remaining columns and their dtypes
df.info()

In [None]:
# Replace the integer codes of the categorical columns with readable labels.
# Series.map turns any code missing from a dictionary into NaN, so this cell
# must only be run once on the raw codes.
category_labels = {
    'season': {1: 'Spring', 2: 'Summer', 3: 'Fall', 4: 'Winter'},
    'mnth': {
        1: 'January', 2: 'February', 3: 'March', 4: 'April',
        5: 'May', 6: 'June', 7: 'July', 8: 'August',
        9: 'September', 10: 'October', 11: 'November', 12: 'December',
    },
    'weekday': {
        0: 'Sunday', 1: 'Monday', 2: 'Tuesday', 3: 'Wednesday',
        4: 'Thursday', 5: 'Friday', 6: 'Saturday',
    },
    'weathersit': {
        1: 'Clear',
        2: 'Cloudy_mist',
        3: 'Light_Rain_Thunder',
        4: 'Heavy_Rain_thunder',
    },
}

for column_name, labels in category_labels.items():
    df[column_name] = df[column_name].map(labels)

In [None]:
# Preview the rows after the category codes were replaced with labels
df.head()

In [None]:
# Check dtypes and non-null counts after the label mapping
df.info()

In [None]:
# Box plot of the rental count against every categorical predictor
cat_vars = ['season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit']

fig = plt.figure(figsize=(15, 12))
for i, cat_var in enumerate(cat_vars, 1):
    ax = fig.add_subplot(3, 3, i)
    # Cast the grouping column to 'category' on a two-column copy so df itself is untouched
    df_boxplot = df[['cnt', cat_var]].astype({cat_var: 'category'})

    sns.boxplot(x=cat_var, y='cnt', data=df_boxplot, width=0.4, ax=ax)
    ax.set_title(f'Boxplot of Bike Rentals by {cat_var}')
    ax.set_xlabel(cat_var)
    ax.set_ylabel('Count of Bike Rentals')
    ax.tick_params(axis='x', labelrotation=90)

fig.tight_layout()
plt.show()

Checking the outlier in Spring that we saw in the boxplot. The median is around 2000, so a count above 7000 looks like an outlier and we can drop this one row.


In [None]:
# Select rows from DataFrame 'df' where the season is "Spring" and cnt is greater than 7000
df_filtered = df[(df['season'] == 'Spring') & (df['cnt'] > 7000)]
# Shape before the outlier is removed, for comparison with the next cell
print(df.shape)
# Show the candidate outlier rows; previously they were selected but never displayed
df_filtered

In [None]:
# Remove the Spring rows whose rental count exceeds 7000 (the outlier seen in the boxplot)
spring_outlier_index = df.index[(df['season'] == "Spring") & (df['cnt'] > 7000)]
df.drop(index=spring_outlier_index, inplace=True)
# Shape after the removal
df.shape

Understanding Correlation among categorical and numerical values


In [None]:
# Column dtypes and non-null counts after the outlier removal
df.info()

In [None]:
# Preview the data that goes into the correlation analysis
df.head()

In [None]:
# Checking correlation
# 'season', 'mnth', 'weekday' and 'weathersit' hold string labels at this point.
# numeric_only=True restricts the correlation to numeric columns; without it
# DataFrame.corr() raises on the string columns in pandas >= 2.0.
sns.heatmap(df.corr(numeric_only=True),cmap="YlGnBu",annot=True)

In [None]:
# Pairwise scatter plots of the continuous predictors and the target
sns.pairplot(df,vars=["temp","hum","windspeed","cnt"])

In [None]:
# One-hot encode each categorical column; drop_first=True omits the first level
# of every column so the dummies are not perfectly collinear with the intercept.
season_dum, mnth_dum, weekday_dum, weathersit_dum = (
    pd.get_dummies(df[column_name], drop_first=True)
    for column_name in ('season', 'mnth', 'weekday', 'weathersit')
)

In [None]:
# Append the indicator columns and remove the categorical columns they replace
dummy_frames = [season_dum, mnth_dum, weekday_dum, weathersit_dum]
df = (
    pd.concat([df, *dummy_frames], axis=1)
    .drop(columns=['season', 'mnth', 'weekday', 'weathersit'])
)

In [None]:
# Only the last expression of a cell is rendered, so the shape is printed explicitly
print(df.shape)
df.head()


In [None]:
# The dummy columns come back as booleans; locate them by dtype
bool_columns = df.select_dtypes(include=['bool']).columns

# Convert boolean columns to uint8
df[bool_columns] = df[bool_columns].astype('uint8')
# Print the preview explicitly: a bare head() that is not the last expression is discarded
print(df.head())
df.info()

In [None]:
# Preview the fully encoded frame that is about to be split
df.head()

In [None]:
# Hold out 30% of the rows for testing (train_test_split is imported in the import cell).
# random_state pins the shuffle, so the train and test sets contain the same rows
# on every run; the global NumPy seed is kept for any other NumPy randomness.
np.random.seed(30)
df_train, df_test = train_test_split(df, train_size = 0.7, test_size = 0.3, random_state = 100)

# Both frames have columns reassigned in place later on; explicit copies keep those
# writes independent of df and avoid SettingWithCopyWarning.
df_train = df_train.copy()
df_test = df_test.copy()

In [None]:
# Only the last expression of a cell is rendered, so the test shape is printed explicitly
print(df_test.shape)
df_train.shape

In [None]:
# Create an instance of the MinMaxScaler class; it is fitted on the training data in the next cell
scaler = MinMaxScaler()

In [None]:
# Rescale only the continuous predictors; the 'yes-no' and 'dummy' variables are left as they are.
# The scaler is fitted on the training rows here.
num_vars = ['temp', 'hum', 'windspeed']
scaled_train_values = scaler.fit_transform(df_train[num_vars])
df_train[num_vars] = scaled_train_values

In [None]:
# Preview the training rows after scaling
df_train.head()

In [None]:
# Summary statistics of the training set after scaling
df_train.describe()

In [None]:
# Correlation matrix of the training set, to spot strongly correlated variables
fig, ax = plt.subplots(figsize=(40, 30))
sns.heatmap(df_train.corr(), annot=True, cmap="YlGnBu", ax=ax)
plt.show()

In [None]:
# Separating the target variable "cnt" from the features in the training data.
# pop() removes the column from df_train itself, so this cell cannot be run twice.
y_train = df_train.pop("cnt")
# x_train is the same object as df_train (no copy is made)
x_train = df_train

Model 1

In [None]:
# Model 1: simple regression of the rental count on temperature alone
# (statsmodels is already imported as sm in the import cell).
temp_feature = x_train[["temp"]]

# statsmodels' OLS has no implicit intercept, so a constant column is added first
x_train_lm = sm.add_constant(temp_feature)
lr = sm.OLS(endog=y_train, exog=x_train_lm).fit()

In [None]:
# Fitted intercept and slope of the temperature-only model
lr.params


In [None]:
# Full regression report for Model 1
print(lr.summary())

In [None]:
# List every candidate predictor available for the multiple regression
x_train.columns

Model 2: Multiple linear regression model using all variables


In [None]:
# Model 2: regress the rental count on every available predictor.
# A constant column is added so the model has an intercept.
x_train_lm = sm.add_constant(x_train)
mlr_2 = sm.OLS(endog=y_train, exog=x_train_lm).fit()

# Fitted coefficients
mlr_2.params

In [None]:
# Full regression report for Model 2
print(mlr_2.summary())

Checking VIF

In [None]:
# Variance inflation factor of every predictor in x_train, rounded to two
# decimals and listed from the most to the least collinear.
vif = (
    pd.DataFrame({
        'Features': x_train.columns,
        'VIF': [variance_inflation_factor(x_train.values, i) for i in range(x_train.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by="VIF", ascending=False)
)
vif

In [None]:
# Running RFE with the output number of variables equal to 20
lm = LinearRegression()
lm.fit(x_train, y_train)

# Select 20 of the training features with recursive feature elimination
rfe = RFE(lm,n_features_to_select=20)              
rfe = rfe.fit(x_train, y_train)

In [None]:
# One row per feature: whether RFE kept it and its elimination rank (kept features have rank 1)
rfe_table = pd.DataFrame({
    "Feature": x_train.columns,
    "support": rfe.support_,
    "Rank": rfe.ranking_,
})
rfe_table.sort_values(by="Rank", ascending=True)

In [None]:
# Selecting columns based on RFE: rfe.support_ is a boolean mask over x_train's columns
selected_features= x_train.columns[rfe.support_]
selected_features


In [None]:
# Keep only the RFE-selected features for model building.
# The later cells remove columns from x_train_rfe with drop(..., inplace=True);
# an explicit copy makes it an independent frame, so those drops neither touch
# x_train nor trigger SettingWithCopyWarning.
x_train_rfe = x_train[selected_features].copy()

In [None]:
# Box plots of the continuous predictors to look for outliers.
# 'atemp' was dropped from df earlier in the notebook, so listing it here
# would make seaborn fail on a missing column.
cols = ['temp', 'hum', 'windspeed']
plt.figure(figsize=(18,4))

# One panel per variable; the grid width follows the number of variables
for i, col in enumerate(cols, 1):
    plt.subplot(1, len(cols), i)
    sns.boxplot(y=col, data=df)

In [None]:
# Preview the RFE-selected training features
x_train_rfe.head()

Model 3: using RFE
Adding Selected variables to the model

In [None]:
# Model 3: OLS on the RFE-selected features; a constant column provides the intercept
x_train_lm = sm.add_constant(x_train_rfe)
mlr_3 = sm.OLS(endog=y_train, exog=x_train_lm).fit()

# Regression report used to decide which variable to drop next
print(mlr_3.summary())

Checking VIF

In [None]:
# Variance inflation factor of every feature in x_train_rfe, rounded to two
# decimals and listed from the most to the least collinear.
vif = (
    pd.DataFrame({
        'Features': x_train_rfe.columns,
        'VIF': [variance_inflation_factor(x_train_rfe.values, i) for i in range(x_train_rfe.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by="VIF", ascending=False)
)
vif

Model 4:
Removing the variable hum based on its very high VIF value.

In [None]:
# Remove "hum" from the working feature set (the frame is modified in place)
x_train_rfe.drop(columns=["hum"], inplace=True)

In [None]:
# Model 4: refit OLS on the remaining features; a constant column provides the intercept
x_train_lm = sm.add_constant(x_train_rfe)
mlr_4 = sm.OLS(endog=y_train, exog=x_train_lm).fit()

# Regression report used to decide which variable to drop next
print(mlr_4.summary())

In [None]:
# Variance inflation factor of every remaining feature, rounded to two
# decimals and listed from the most to the least collinear.
vif = (
    pd.DataFrame({
        'Features': x_train_rfe.columns,
        'VIF': [variance_inflation_factor(x_train_rfe.values, i) for i in range(x_train_rfe.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by="VIF", ascending=False)
)
vif

Model 5:
Removing the variable workingday.

In [None]:
# Remove "workingday" from the working feature set (the frame is modified in place)
x_train_rfe.drop(columns=["workingday"], inplace=True)

In [None]:
# Model 5: refit OLS on the remaining features; a constant column provides the intercept
x_train_lm = sm.add_constant(x_train_rfe)
mlr_5 = sm.OLS(endog=y_train, exog=x_train_lm).fit()

# Regression report used to decide which variable to drop next
print(mlr_5.summary())

In [None]:
# Variance inflation factor of every remaining feature, rounded to two
# decimals and listed from the most to the least collinear.
vif = (
    pd.DataFrame({
        'Features': x_train_rfe.columns,
        'VIF': [variance_inflation_factor(x_train_rfe.values, i) for i in range(x_train_rfe.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by="VIF", ascending=False)
)
vif

Model 6:
Removing the variable Summer based on its Very High 'p' value.



In [None]:
# Model 6: remove "Summer" from the working feature set (in place) and refit
x_train_rfe.drop(columns=["Summer"], inplace=True)

# A constant column provides the intercept
x_train_lm = sm.add_constant(x_train_rfe)
mlr_6 = sm.OLS(endog=y_train, exog=x_train_lm).fit()

# Regression report used to decide which variable to drop next
print(mlr_6.summary())

In [None]:
# Variance inflation factor of every remaining feature, rounded to two
# decimals and listed from the most to the least collinear.
vif = (
    pd.DataFrame({
        'Features': x_train_rfe.columns,
        'VIF': [variance_inflation_factor(x_train_rfe.values, i) for i in range(x_train_rfe.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by="VIF", ascending=False)
)
vif

Model 7:
Removing the variable Monday based on its Very High 'p' value.



In [None]:
# Model 7: remove "Monday" from the working feature set (in place) and refit
x_train_rfe.drop(columns=["Monday"], inplace=True)

# A constant column provides the intercept
x_train_lm = sm.add_constant(x_train_rfe)
mlr_7 = sm.OLS(endog=y_train, exog=x_train_lm).fit()

# Regression report used to decide which variable to drop next
print(mlr_7.summary())


In [None]:
# Variance inflation factor of every remaining feature, rounded to two
# decimals and listed from the most to the least collinear.
vif = (
    pd.DataFrame({
        'Features': x_train_rfe.columns,
        'VIF': [variance_inflation_factor(x_train_rfe.values, i) for i in range(x_train_rfe.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by="VIF", ascending=False)
)
vif

Model 8:
Removing the variable Saturday based on its Very High 'p' value.

In [None]:
# Model 8: remove "Saturday" from the working feature set (in place) and refit
x_train_rfe.drop(columns=["Saturday"], inplace=True)

# A constant column provides the intercept
x_train_lm = sm.add_constant(x_train_rfe)
mlr_8 = sm.OLS(endog=y_train, exog=x_train_lm).fit()

# Regression report used to decide which variable to drop next
print(mlr_8.summary())

In [None]:
# Variance inflation factor of every remaining feature, rounded to two
# decimals and listed from the most to the least collinear.
vif = (
    pd.DataFrame({
        'Features': x_train_rfe.columns,
        'VIF': [variance_inflation_factor(x_train_rfe.values, i) for i in range(x_train_rfe.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by="VIF", ascending=False)
)
vif

Model 9:
Removing the variable Sunday based on its Very High 'p' value.

In [None]:
# Model 9: remove "Sunday" from the working feature set (in place) and refit
x_train_rfe.drop(columns=["Sunday"], inplace=True)

# A constant column provides the intercept
x_train_lm = sm.add_constant(x_train_rfe)
mlr_9 = sm.OLS(endog=y_train, exog=x_train_lm).fit()

# Regression report used to decide which variable to drop next
print(mlr_9.summary())

In [None]:
# Variance inflation factor of every remaining feature, rounded to two
# decimals and listed from the most to the least collinear.
vif = (
    pd.DataFrame({
        'Features': x_train_rfe.columns,
        'VIF': [variance_inflation_factor(x_train_rfe.values, i) for i in range(x_train_rfe.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by="VIF", ascending=False)
)
vif

Model 10:
Removing the variables May and February based on their very high 'p' values.



In [None]:
# Model 10: remove "May" and "February" from the working feature set (in place) and refit
x_train_rfe.drop(columns=["May", "February"], inplace=True)

# A constant column provides the intercept
x_train_lm = sm.add_constant(x_train_rfe)
mlr_10 = sm.OLS(endog=y_train, exog=x_train_lm).fit()

# Regression report used to decide which variable to drop next
print(mlr_10.summary())

In [None]:
# Variance inflation factor of every remaining feature, rounded to two
# decimals and listed from the most to the least collinear.
vif = (
    pd.DataFrame({
        'Features': x_train_rfe.columns,
        'VIF': [variance_inflation_factor(x_train_rfe.values, i) for i in range(x_train_rfe.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by="VIF", ascending=False)
)
vif

Model 11:
Removing the variable January based on its Very High 'p' value.



In [None]:
# Model 11: remove "January" from the working feature set (in place) and refit
x_train_rfe.drop(columns=["January"], inplace=True)

# A constant column provides the intercept
x_train_lm = sm.add_constant(x_train_rfe)
mlr_11 = sm.OLS(endog=y_train, exog=x_train_lm).fit()

# Regression report of the model used for the residual analysis and predictions below
print(mlr_11.summary())


Step 7: Residual Analysis of the train data

In [None]:
# Add a constant column to the DataFrame 'x_train_rfe' using statsmodels
x_train_sm = sm.add_constant(x_train_rfe)
# Predict the target variable 'y_train' using the trained regression model 'mlr_11' and the DataFrame 'x_train_sm'
y_train_pred = mlr_11.predict(x_train_sm)
# Plot the histogram of the error terms.
# sns.distplot is deprecated; histplot with a KDE overlay and density scaling
# produces the equivalent figure.
fig = plt.figure()
sns.histplot((y_train - y_train_pred), bins = 20, kde = True, stat = "density")
fig.suptitle('Error Terms', fontsize = 20)                  # Plot heading 
plt.xlabel('Errors', fontsize = 18)

Step 8: Making Predictions Using the Final Model

In [None]:
# Apply the scaler to all the columns except the 'yes-no' and 'dummy' variables
num_vars = ['temp','hum','windspeed']

# Use transform(), not fit_transform(): the scaler must keep the minimum and maximum
# learned from the training set. Re-fitting on the test rows would put the test
# features on a different scale from the one the model was trained on.
df_test[num_vars] = scaler.transform(df_test[num_vars])
df_test.describe()

In [None]:
# Split the target off the test frame; pop() removes 'cnt' from df_test itself
y_test = df_test.pop('cnt')
# x_test is the same object as df_test (no copy is made)
x_test = df_test
# Adding constant variable to test dataframe
x_test_mlr_11 = sm.add_constant(x_test)
# Show the training features of the final model for comparison
x_train_rfe.head()

In [None]:

# Checking x_test_mlr_11.
# Only the last expression of a cell is rendered, so the preview is printed explicitly.
print(x_test_mlr_11.head())
x_test_mlr_11.info()
x_test_mlr_11.shape


In [None]:
# Adding constant variable to test dataframe
x_test_mlr_11 = sm.add_constant(x_test)
# NOTE(review): this repeats the add_constant call made two cells earlier; the prediction itself happens in a later cell


In [None]:
# Show the features kept in the final model so the test frame can be reduced to the same columns
x_train_rfe.head()

In [None]:
# Keep only the features the final model was trained on, in the same order.
# The column list is taken from x_train_rfe instead of being typed out by hand,
# so it cannot drift out of sync if the elimination steps above change.
x_test_mlr_11_new = x_test_mlr_11[list(x_train_rfe.columns)]

In [None]:
# Checking x_test_mlr_11 and the reduced frame.
# Only the last expression of a cell is rendered, so the preview is printed explicitly.
print(x_test_mlr_11.head())
x_test_mlr_11_new.shape

In [None]:
# Adding the constant to the reduced test frame.
# The final model was fitted on the constant plus the selected features only, so the
# prediction input must be built from x_test_mlr_11_new; adding the constant to the
# unfiltered x_test_mlr_11 leaves extra columns that do not match the model.
x_test_mlr_11 = sm.add_constant(x_test_mlr_11_new)

In [None]:
# Inspect the columns of the frame that is passed to predict()
x_test_mlr_11.info()


In [None]:
# Making predictions on the test set using the final model (mlr_11)

y_test_pred_mlr_11 = mlr_11.predict(x_test_mlr_11)


Model Evaluation

In [None]:
# Scatter of actual against predicted test values to see the spread
fig, ax = plt.subplots()
ax.scatter(y_test, y_test_pred_mlr_11)
fig.suptitle('y_test vs y_test_pred_mlr_11', fontsize = 20)
ax.set_xlabel('y_test', fontsize = 18)
ax.set_ylabel('y_pred_mlr_11', fontsize = 16)

In [None]:
# Actual and predicted rental counts drawn as two lines over the test rows
fig, ax = plt.subplots(figsize=(20, 8))
actual_line, = ax.plot(range(len(y_test)), y_test)
predicted_line, = ax.plot(range(len(y_test_pred_mlr_11)), y_test_pred_mlr_11)
ax.legend([actual_line, predicted_line], ['Actual', 'Predicted'])
ax.set_title('Predicted vs Actual No of CNT')
ax.set_ylabel('No of Bike Rentals')
# The x positions are only row order, so the ticks are hidden
ax.set_xticks([])
plt.show()

In [None]:
# Regression plot of predicted against actual test values, with a fitted line
fig, ax = plt.subplots()
sns.regplot(
    x=y_test,
    y=y_test_pred_mlr_11,
    ci=68,
    fit_reg=True,
    scatter_kws={"color": "blue"},
    line_kws={"color": "red"},
    ax=ax,
)
ax.set_title('y_test vs y_test_pred_mlr_11', fontsize=20)
ax.set_xlabel('y_test', fontsize=18)
ax.set_ylabel('y_pred', fontsize=16)
plt.show()

In [None]:
# CHECKING R SQUARED VALUE FOR TRAIN AND TEST DATA
# mean_squared_error is not part of the notebook's import cell, so it is imported here.
from sklearn.metrics import mean_squared_error,r2_score

# mean_squared_error returns the MSE; its square root (RMSE) is in the units of 'cnt'.
# The value printed below is the RMSE, so it is labelled as such.
mse = mean_squared_error(y_test, y_test_pred_mlr_11)
rmse = np.sqrt(mse)
rsquared_test = r2_score(y_test, y_test_pred_mlr_11)
rsquared_train = r2_score(y_train, y_train_pred)
print('R-squared for train data:',rsquared_train)
print('R-squared for test data:',rsquared_test)
print('Root Mean Squared Error',round(rmse,3))


R-squared for train data: 0.8421579512778632
R-squared for test data: 0.7912052419702504
Mean Squared Error 844.87


In [None]:
# R-squared of the final model's predictions on the test set

r_squared = r2_score(y_test, y_test_pred_mlr_11)
r_squared

Value comes out to 0.7912052419702504

In [None]:
# Coefficients of the final model. Predictions and evaluation above use mlr_11,
# so its parameters are the ones reported (mlr_10 still includes "January").
mlr_11.params.to_frame()