## This is the assignment on multiple linear regression to predict the sale of bikes based on the available data.

## Let's follow the steps below to arrive at the final model.


## 1. Import Library

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

import warnings
warnings.filterwarnings('ignore')

## 2. Reading, understanding, cleaning & visualizing the data

In [None]:
## Step 1 - Reading, understanding & visualizing the data
# Load day.csv into a DataFrame.
df = pd.read_csv('day.csv')

In [None]:
# Preview the first rows
df.head()

In [None]:
# Summary statistics of the numeric columns
df.describe()

In [None]:
# Column dtypes and non-null counts
df.info()
# There are no null values, which is good.

In [None]:
# Remove the columns that are not required:
# - casual / registered: only cnt is used as the output field.
# - instant: it is just an index.
# - dteday: the year, month and day details have already been taken out of it.
df.drop(['registered', 'casual', 'dteday', 'instant'], axis=1, inplace=True)

### Correcting the data with proper name

In [None]:
# update name of categorical value to proper names-

#Season
def map_seasons(df, column_name):
    """Return ``df[column_name]`` with season codes 1-4 replaced by names.

    Codes that are not in the lookup come back as NaN; ``df`` is not modified.
    """
    season_names = ("spring", "summer", "fall", "winter")
    code_to_name = dict(enumerate(season_names, start=1))
    return df[column_name].map(code_to_name)

#Month
def map_month(df, column_name):
    """Return ``df[column_name]`` with month numbers 1-12 replaced by abbreviations.

    Numbers outside 1-12 come back as NaN; ``df`` is not modified.
    """
    abbreviations = (
        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
    )
    number_to_abbr = dict(zip(range(1, 13), abbreviations))
    return df[column_name].map(number_to_abbr)

#Weekday
def map_weekday(df, column_name):
    """Return ``df[column_name]`` with weekday codes 0-6 replaced by day names.

    Code 0 is Sunday and code 6 is Saturday. Other codes come back as NaN;
    ``df`` is not modified.
    """
    day_names = (
        "Sunday", "Monday", "Tuesday", "Wednesday",
        "Thursday", "Friday", "Saturday",
    )
    code_to_day = dict(enumerate(day_names))
    return df[column_name].map(code_to_day)

#weathersit
def map_weathersit(df, column_name):
    """Return ``df[column_name]`` with weather codes 1-4 replaced by short labels.

    Codes that are not in the lookup come back as NaN; ``df`` is not modified.
    """
    labels = ("Clear", "Mist", "Snow", "Rain")
    code_to_label = dict(enumerate(labels, start=1))
    return df[column_name].map(code_to_label)

# Replace each coded categorical column with its readable labels, in place.
for _column, _mapper in (
    ('season', map_seasons),
    ('mnth', map_month),
    ('weekday', map_weekday),
    ('weathersit', map_weathersit),
):
    df[_column] = _mapper(df, _column)


df.head()

### EDA

In [None]:
#box plot for categorical columns - Season, month , weekday & weathersit   
def display_boxplot(df, x_column, y_column):
    """Draw and show a box plot of ``df[y_column]`` grouped by ``df[x_column]``."""
    groups = df[x_column]
    values = df[y_column]
    sns.boxplot(x=groups, y=values)
    plt.show()

In [None]:
# One box plot of cnt for each categorical / binary column, in the original order.
for _predictor in ("season", "mnth", "weekday", "weathersit", "yr", "holiday", "workingday"):
    display_boxplot(df, _predictor, "cnt")

### Findings-
#### Spring has the least sales, followed by winter; summer and fall have the maximum sale of bikes.
#### Nov Dec Jan & Feb has comparatively low sales.
#### There are no sales recorded during Rain, and sales are lowest when it Snows.
#### 2019 was great in term of sales.
#### During non holiday we have better sales.



In [None]:
# Pairwise plots of the numerical columns, including the target cnt.
_pairplot_columns = ["temp", "atemp", "hum", "windspeed", "cnt"]
sns.pairplot(df[_pairplot_columns])
plt.show()

### Findings-
#### temp & atemp are positively related to sales: the higher the temperature, the higher the sales.
#### humidity & windspeed are not impacting the sale based on this graph.

### Actions-
#### We can remove temp & keep only atemp as both are highly correlated, so keeping both will not help much.

### Creating Dummy Variables

In [None]:
# Convert the categorical columns (season, mnth, weekday, weathersit) into dummy variables for the model.
# drop_first=True drops the first level of each column, so k levels produce k-1 dummy columns.
seasons = pd.get_dummies(df.season , drop_first=True, dtype=int)
mnth = pd.get_dummies(df.mnth , drop_first=True, dtype=int)
weekday = pd.get_dummies(df.weekday , drop_first=True, dtype=int)
weathersit = pd.get_dummies(df.weathersit , drop_first=True, dtype=int)


seasons.head()

In [None]:
weekday.head()

In [None]:
weathersit.head()

In [None]:
# Concatenate the new dummy columns onto df; the existing categorical columns are removed in the next cell.
df = pd.concat([df, seasons,mnth,weekday,weathersit] ,axis=1)
df.head()

In [None]:
# Drop the original categorical columns (season, weathersit, weekday, mnth)
# now that their dummy columns have been added to df.
df.drop(['season', 'weathersit', 'weekday', 'mnth'], axis=1, inplace=True)
df.head()

## 3. Preparing the data for modelling (train- test split, rescaling)

In [None]:
# Split the data into train (70%) and test sets; random_state makes the split reproducible.
df_train, df_test = train_test_split(df,train_size=0.7,random_state=100)
print(df_train.shape)
print(df_test.shape)


In [None]:
# The continuous columns are on a different scale from the 0/1 columns, so they are rescaled to match.
scaler = MinMaxScaler()
# Apply the scaler to the numerical columns: temp, atemp, hum, windspeed, cnt.
# The scaler is fitted on the training data only.
num_vars = ['temp','atemp', 'hum', 'windspeed','cnt']
df_train[num_vars] = scaler.fit_transform(df_train[num_vars])
df_train.head()

In [None]:
df_train.describe()

### Training the model

In [None]:
# Correlation heatmap of all training columns (the target cnt is still part of df_train here).
plt.figure(figsize=(30,20))
sns.heatmap(df_train.corr(), annot=True, cmap="YlGnBu")
plt.show()

In [None]:
df_train.head(2)

In [None]:
# Dividing into X & y sets for model building.
# pop() removes cnt from df_train in place; X_train is then the same object as df_train, not a copy.
y_train = df_train.pop('cnt')
X_train = df_train


In [None]:
X_train.head()

In [None]:
# Building the model using statsmodels

## 4. Model Generation

### Model 1 with all the fields.

In [None]:
# Add the intercept column ('const') explicitly, then fit OLS on all predictors.
X_train_model1 = sm.add_constant(X_train)
lm_model1 = sm.OLS(y_train,X_train_model1).fit()
lm_model1.summary()

In [None]:
# In model 1 many fields have high p-values. Let's reduce the number of fields using RFE.

### Model 2 - RFE


In [None]:
# Importing RFE and LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Run RFE on top of a LinearRegression estimator.
# NOTE(review): an earlier comment here said the number of output variables is 15, but
# n_features_to_select is never passed; per the scikit-learn docs RFE then keeps half
# of the features by default.
lm = LinearRegression()
lm.fit(X_train, y_train)

# NOTE(review): despite its name, this value is passed as RFE's `step`, i.e. the number
# of features removed at each iteration - it is NOT the number of features selected.
# If a fixed feature count is intended, pass n_features_to_select instead; the later
# model cells drop columns by name, so re-check them if the selection changes.
steps_to_select = 10
rfe = RFE(estimator=lm, step=steps_to_select)
rfe = rfe.fit(X_train, y_train)

In [None]:
# (column name, selected by RFE?, RFE ranking) for every candidate column
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

In [None]:
# Columns selected by RFE
col = X_train.columns[rfe.support_]
col

In [None]:
# Columns rejected by RFE; model 2 is built on the remaining (selected) columns.
X_train.columns[~rfe.support_]

In [None]:
X_train_model2 = X_train[col]

In [None]:
# Add the intercept column and fit OLS on the RFE-selected columns.
X_train_model2 = sm.add_constant(X_train_model2)

lm_model2 = sm.OLS(y_train,X_train_model2).fit()

lm_model2.summary()


### VIF

In [None]:
def calculate_VIF(X_train_new):
    """Display the variance inflation factor of every column of ``X_train_new``.

    The table has one row per column ('Features', 'VIF'), with VIF rounded to
    two decimals and rows sorted from highest to lowest VIF. Nothing is
    returned; the table is shown with the notebook's ``display``.
    """
    design = X_train_new.values
    scores = [
        variance_inflation_factor(design, position)
        for position in range(design.shape[1])
    ]
    vif = pd.DataFrame({'Features': X_train_new.columns, 'VIF': scores})
    vif['VIF'] = vif['VIF'].round(2)
    display(vif.sort_values(by='VIF', ascending=False))

In [None]:
# For calculating VIF, remove the constant column that was added for the model.
X_train_new = X_train_model2.drop(['const'], axis=1)
calculate_VIF(X_train_new)

In [None]:
# The decision on which variable to drop next uses these VIF values alongside the p-values in the model summary.


### Model 3 - Remove Saturday, which has a high p-value

In [None]:
X_train_model3 = X_train_model2.drop(['Saturday'], axis=1)

In [None]:
X_train_model3 = sm.add_constant(X_train_model3)

lm_model3 = sm.OLS(y_train,X_train_model3).fit()

lm_model3.summary()

In [None]:
#for calculating VIF lets remove the constant that is required for the model
X_train_new = X_train_model3.drop(['const'], axis=1)
calculate_VIF(X_train_new)

### Model 4 - Remove the highest VIF - Humidity

In [None]:
X_train_model4 = X_train_model3.drop(['hum'], axis=1)
X_train_model4 = sm.add_constant(X_train_model4)

lm_model4 = sm.OLS(y_train,X_train_model4).fit()

lm_model4.summary()

In [None]:
#for calculating VIF lets remove the constant that is required for the model
X_train_new = X_train_model4.drop(['const'], axis=1)
calculate_VIF(X_train_new)

### Model 5 - Remove December having high p-value

In [None]:
X_train_model5 = X_train_model4.drop(['Dec'], axis=1)
X_train_model5 = sm.add_constant(X_train_model5)

lm_model5 = sm.OLS(y_train,X_train_model5).fit()

lm_model5.summary()

In [None]:
X_train_new = X_train_model5.drop(['const'], axis=1)

calculate_VIF(X_train_new)

### Model 6 - Remove January having high p-value

In [None]:
X_train_model6 = X_train_model5.drop(['Jan'], axis=1)
X_train_model6 = sm.add_constant(X_train_model6)

lm_model6 = sm.OLS(y_train,X_train_model6).fit()

lm_model6.summary()

In [None]:
X_train_new = X_train_model6.drop(['const'], axis=1)

calculate_VIF(X_train_new)

## 5. Residual Analysis

In [None]:
y_train_pred = lm_model6.predict(X_train_model6)

In [None]:
res = y_train - y_train_pred
sns.distplot(res)

## 6. Prediction & evaluation of the test set

In [None]:
# Apply the scaler to the numerical columns: temp, atemp, hum, windspeed, cnt.
# Only transform() is called here, so the scaling parameters are the ones fitted on the training data.
num_vars = ['temp','atemp', 'hum', 'windspeed','cnt']
df_test[num_vars] = scaler.transform(df_test[num_vars])
df_test.head()


In [None]:
# Dividing into X & y sets for evaluation.
# pop() removes cnt from df_test in place; X_test is the same object as df_test.
y_test = df_test.pop('cnt')
X_test = df_test

In [None]:
# Keep only the predictors used by model 6 (its columns minus the constant).
X_test_new = X_test[X_train_model6.drop(['const'], axis=1).columns]

In [None]:
X_test_new.head()

In [None]:
# Adding a constant variable
X_test_new = sm.add_constant(X_test_new)
# Making predictions
y_test_pred = lm_model6.predict(X_test_new)

In [None]:
# Distribution of the errors on the test set (note: this overwrites the training `res`).
res = y_test - y_test_pred
# sns.distplot is deprecated and scheduled for removal; histplot with a KDE overlay
# on a density scale is the documented replacement and draws the same kind of plot.
sns.histplot(res, kde=True, stat="density")

## 7. Model Evaluation

In [None]:

# Scatter of actual vs. predicted test values to understand the spread.
fig = plt.figure()
_ax = fig.add_subplot(111)
_ax.scatter(y_test, y_test_pred)
fig.suptitle('y_test vs y_test_pred', fontsize=20)  # plot heading
_ax.set_xlabel('y_test', fontsize=18)                # x-axis label
_ax.set_ylabel('y_test_pred', fontsize=16)           # y-axis label

## 8. R2 Score

In [None]:
from sklearn.metrics import r2_score
# R-squared of model 6 on the test set; y_test and y_test_pred are both in scaled cnt units.
r2_score(y_test, y_test_pred)