# In [None]:
#SECTION-1 :: EDA - Exploratory data analysis
#Importing required libraries:
import pandas as pd;
import sklearn;
import statsmodels.api as sm;
import matplotlib.pyplot as plt;
import seaborn as sns;
from statsmodels.stats.outliers_influence import variance_inflation_factor;
from sklearn.model_selection import train_test_split;
from sklearn.preprocessing import MinMaxScaler;
from sklearn.metrics import r2_score;

# Load the daily bike-sharing data set.
data = pd.read_csv('/content/day.csv')

# Columns removed before modelling:
#   - 'instant', 'dteday', 'mnth', 'weekday': not needed for this analysis
#     (business/domain decision). 'dteday' is dropped outright, so it is not
#     reformatted first.
#   - 'casual', 'registered': in the bike-sharing schema these are the two
#     components of the target ('cnt' = casual + registered). Keeping them as
#     predictors leaks the target into the features and yields a meaningless
#     R-squared of 1.
# NOTE(review): any p-value/VIF based feature selection that was derived from
# a fit which still contained 'casual'/'registered' should be re-validated.
columns_to_drop = ['instant', 'dteday', 'mnth', 'weekday', 'casual', 'registered']
data = data.drop(columns=columns_to_drop)

# Map the numeric codes of 'season' and 'weathersit' to readable labels so
# that the dummy columns created below get meaningful names.
replacement_map_season = {1: 'spring', 2: 'summer', 3: 'fall', 4: 'winter'}
replacement_map_weathersit = {1: 'clear', 2: 'cloudy', 3: 'light_rain'}
data['season'] = data['season'].replace(replacement_map_season)
data['weathersit'] = data['weathersit'].replace(replacement_map_weathersit)

#SECTION-2 :: Dummy variable creation for 'season' and 'weathersit'
# drop_first=True removes one redundant level per variable.
# dtype=int keeps the dummies numeric: recent pandas versions return boolean
# columns by default, and a design matrix mixing bool and float columns is
# converted to object dtype, which statsmodels cannot fit.
status_season = pd.get_dummies(data['season'], drop_first=True, dtype=int)
status_weathersit = pd.get_dummies(data['weathersit'], drop_first=True, dtype=int)
# Append the dummy columns and drop the categorical columns they replace.
data = pd.concat([data, status_season, status_weathersit], axis=1)
data = data.drop(columns=['season', 'weathersit'])

#SECTION-3: Train-Test data split:
# 70 % of the rows train the model, 30 % are held out for testing.
df_train, df_test = train_test_split(data, train_size=0.7, random_state=100)
# Take explicit copies so the in-place scaling of these frames does not
# trigger pandas' SettingWithCopyWarning.
df_train = df_train.copy()
df_test = df_test.copy()

#SECTION-4: Feature scaling
# Min-Max scaling maps every listed column to [0, 1] using the minimum and
# maximum seen in the training data. (It does not dampen outliers; an extreme
# value simply becomes the new 0 or 1.)
scaler = MinMaxScaler()
# Continuous columns to scale; the 0/1 indicator columns are left untouched.
num_vars = ['temp', 'atemp', 'hum', 'windspeed', 'cnt']
# Fit on the training split only, so no test-set information reaches the scaler.
df_train[num_vars] = scaler.fit_transform(df_train[num_vars])

#SECTION-5: Model building i.e Training the model
def _fit_and_report(features, target):
    """Fit an OLS model with an intercept and print its diagnostics.

    Prints the statsmodels regression summary, then a table with the
    variance inflation factor (VIF) of every column in ``features``,
    rounded to two decimals and sorted from highest to lowest.

    Args:
        features: DataFrame with the predictor columns (no intercept column).
        target: Series with the response variable.

    Returns:
        Tuple ``(model, results, design, vif_table)``: the unfitted OLS model,
        its fitted results, the design matrix produced by ``sm.add_constant``
        and the VIF DataFrame.
    """
    # Cast to float up front: boolean dummy columns mixed with numeric ones
    # would otherwise turn the design matrix into an object-dtype array,
    # which statsmodels cannot fit.
    numeric = features.astype(float)
    design = sm.add_constant(numeric)
    model = sm.OLS(target, design)
    results = model.fit()
    print(results.summary())

    # NOTE(review): as in the original analysis, the VIFs are computed on the
    # predictors without the intercept column - confirm this is intended.
    values = numeric.values
    vif_table = pd.DataFrame({
        'Features': features.columns,
        'VIF': [variance_inflation_factor(values, i) for i in range(values.shape[1])],
    })
    vif_table['VIF'] = vif_table['VIF'].round(2)
    vif_table = vif_table.sort_values(by="VIF", ascending=False)
    print(vif_table)
    return model, results, design, vif_table


#Build the model with all the variables:
y_train = df_train.pop('cnt')
X_train = df_train
lr, lr_model, X_train_sm, vif = _fit_and_report(X_train, y_train)

# Selection rule: ideally every retained variable has p-value < 0.05 and VIF < 5.
# Observations recorded by the analyst from the output of the full model
# (data-dependent; re-check whenever the feature set changes):
# --SET-1 Variables that can be dropped because of high p-value & high VIF ::
#      temp, atemp, hum, workingday, yr
# --SET-2 Variables that can be dropped because of high p-value & low VIF ::
#      spring, summer, winter, cloudy, light_rain
# --SET-3 Variables that can be dropped because of low p-value & high VIF ::
#      windspeed, casual, registered
# --SET-4 Variables that are to be kept because of low p-value & low VIF ::
#      holiday

# Variables are removed one at a time; after each removal the model is refitted
# and the p-values / VIFs are inspected again before choosing the next one.

# dropping 'temp' as it has high p-value and high VIF
X1 = X_train.drop('temp', axis=1)
lr, lr_model, X1_train_sm, vif = _fit_and_report(X1, y_train)

# dropping 'workingday' as it has high p-value & high VIF from previous iteration
X2 = X1.drop('workingday', axis=1)
lr, lr_model, X2_train_sm, vif = _fit_and_report(X2, y_train)

# dropping 'yr' as it has high p-value & high VIF from previous iteration
X3 = X2.drop('yr', axis=1)
lr, lr_model, X3_train_sm, vif = _fit_and_report(X3, y_train)

# dropping 'atemp' as it has high p-value & high VIF from previous iteration
X4 = X3.drop('atemp', axis=1)
lr, lr_model, X4_train_sm, vif = _fit_and_report(X4, y_train)

# dropping 'windspeed' as it has high p-value & low VIF from previous iteration
X5 = X4.drop('windspeed', axis=1)
lr, lr_model, X5_train_sm, vif = _fit_and_report(X5, y_train)

# dropping 'hum' as it has high p-value & high VIF from previous iteration
X6 = X5.drop('hum', axis=1)
lr, lr_model, X6_train_sm, vif = _fit_and_report(X6, y_train)

# dropping 'winter' as it has high p-value & low VIF from previous iteration
X7 = X6.drop('winter', axis=1)
lr, lr_model, X7_train_sm, vif = _fit_and_report(X7, y_train)

# dropping 'holiday' as it has high p-value & low VIF from previous iteration
# (lr_model / X8_train_sm from this step are the final model and its design matrix)
X8 = X7.drop('holiday', axis=1)
lr, lr_model, X8_train_sm, vif = _fit_and_report(X8, y_train)

#Residual distribution curve:
# In-sample predictions of the final model on its own training design matrix.
y_train_pred = lr_model.predict(X8_train_sm)
print(y_train_pred)

# Residuals should look roughly normal and centred on zero if the linear
# regression assumptions hold.
res = y_train - y_train_pred
# sns.distplot is deprecated and scheduled for removal; this is the equivalent
# histogram + KDE call given in seaborn's migration guide.
sns.histplot(res, kde=True, stat="density", kde_kws=dict(cut=3))

#SECTION-6: Making prediction from the model on the test dataset:
# Work on an explicit copy so the in-place scaling below does not trigger
# pandas' SettingWithCopyWarning on the frame returned by train_test_split.
df_test = df_test.copy()
# Reuse the scaler fitted on the training data (transform only - never re-fit
# on the test split) for the same list of numeric columns.
df_test[num_vars] = scaler.transform(df_test[num_vars])
y_test = df_test.pop('cnt')
X_test = df_test
# has_constant='add' forces the intercept column even if some test column
# happens to be constant (the default would silently skip adding it).
# The float cast keeps boolean dummy columns from producing an object-dtype matrix.
X_test_sm = sm.add_constant(X_test.astype(float), has_constant='add')
# Keep exactly the columns the final model was fitted on, in the same order,
# instead of maintaining a second hard-coded list of eliminated variables.
X_test_sm = X_test_sm[lr_model.model.exog_names]

#Make predictions:
y_test_pred = lr_model.predict(X_test_sm)
print(y_test_pred)

#SECTION-7: Evaluating the model
r2_test = r2_score(y_true=y_test, y_pred=y_test_pred)
print(r2_test)
r2_train = r2_score(y_true=y_train, y_pred=y_train_pred)
print(r2_train)
# A test R-squared close to the training R-squared suggests the model
# generalises. A value of (almost) exactly 1.0 on both splits is NOT evidence
# of a good model: it is a warning sign of target leakage, i.e. a predictor
# that encodes the target itself.

