In [None]:
#importing required libraries
import os
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
import seaborn as sns
import datetime as dt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from  matplotlib import pyplot
%matplotlib inline

In [None]:
#set working directory
# The original unconditional chdir raised FileNotFoundError on every machine
# except the author's. Only switch when the folder actually exists; otherwise
# stay in the current directory, so "day.csv" is resolved next to the notebook.
DATA_DIR = "C:/Users/Shriyan/Desktop"
if os.path.isdir(DATA_DIR):
    os.chdir(DATA_DIR)

In [None]:
# Load the dataset from "day.csv" in the current working directory.
data_train = pd.read_csv("day.csv")

In [None]:
# Preview the first five rows of the loaded frame.
data_train.head()

In [None]:
# Check the dimensions of the data as a (rows, columns) tuple.
data_train.shape

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data_train.describe()

In [None]:
# Print column names, non-null counts, dtypes and memory usage.
data_train.info()

In [None]:
# Inspect the dtype pandas inferred for each column on load.
data_train.dtypes

# Exploratory Data Analysis

In [None]:
# Cast the coded, discrete columns to pandas' categorical dtype.
categorical_cols = ['season', 'yr', 'mnth', 'holiday', 'workingday', 'weekday', 'weathersit']
for col in categorical_cols:
    data_train[col] = data_train[col].astype('category')

# Parse the date column (ISO "YYYY-MM-DD" strings) into datetime64.
data_train["dteday"] = pd.to_datetime(data_train["dteday"], format="%Y-%m-%d")

In [None]:
# Re-inspect the dtypes of every column at this point in the notebook.
data_train.dtypes

In [None]:
# Missing value analysis: count the null entries in each column.
data_train.isnull().sum()  # author's note: no missing values were found in their run

In [None]:
#check the distribution of count
# histplot replaces distplot, which seaborn deprecated in v0.11 and plans to remove.
# stat="density" keeps the density-normalised bars that distplot drew under its KDE curve.
sns.histplot(data_train['cnt'], kde=True, stat="density")

In [None]:
#Check the distribution of numerical data using histogram
# histplot replaces distplot, which seaborn deprecated in v0.11 and plans to remove.
# stat="density" keeps the density-normalised bars that distplot drew under its KDE curve.
sns.histplot(data_train['atemp'], kde=True, stat="density")

In [None]:
# Distribution of temp. histplot replaces the deprecated distplot;
# stat="density" keeps the bars on the same scale as the KDE curve.
sns.histplot(data_train['temp'], kde=True, stat="density")

In [None]:
# Distribution of hum. histplot replaces the deprecated distplot;
# stat="density" keeps the bars on the same scale as the KDE curve.
sns.histplot(data_train['hum'], kde=True, stat="density")

In [None]:
# Distribution of windspeed. histplot replaces the deprecated distplot;
# stat="density" keeps the bars on the same scale as the KDE curve.
sns.histplot(data_train['windspeed'], kde=True, stat="density")

# Outlier Analysis

In [None]:
# Side-by-side box plots of the four continuous predictors, to spot outliers.
# The figure that seaborn drew on is then enlarged to 8x8 inches.
fig = sns.boxplot(data=data_train[['atemp', 'temp', 'windspeed', 'hum']]).get_figure()
fig.set_size_inches(8, 8)

In [None]:
#Remove outliers in Humidity
# Fences at 1.5 * IQR below the 25th and above the 75th percentile.
hum_q75, hum_q25 = np.percentile(data_train['hum'], [75, 25])
print(hum_q75, hum_q25)
hum_iqr = hum_q75 - hum_q25
print(hum_iqr)
# Dedicated names: the original bound these to `min`/`max`, shadowing the
# Python builtins for every cell that runs afterwards.
hum_lower = hum_q25 - (hum_iqr * 1.5)
hum_upper = hum_q75 + (hum_iqr * 1.5)
print(hum_lower)
print(hum_upper)

# Select the column by name instead of position 11, so the filter still hits
# 'hum' if the column order of the frame ever changes.
hum_is_outlier = (data_train['hum'] < hum_lower) | (data_train['hum'] > hum_upper)
data_train = data_train.drop(data_train.index[hum_is_outlier])

In [None]:
#Remove outliers in Windspeed
# Fences at 1.5 * IQR below the 25th and above the 75th percentile.
wind_q75, wind_q25 = np.percentile(data_train['windspeed'], [75, 25])
print(wind_q75, wind_q25)
wind_iqr = wind_q75 - wind_q25
print(wind_iqr)
# Dedicated names: the original bound these to `min`/`max`, shadowing the
# Python builtins for every cell that runs afterwards.
wind_lower = wind_q25 - (wind_iqr * 1.5)
wind_upper = wind_q75 + (wind_iqr * 1.5)
print(wind_lower)
print(wind_upper)

# Select the column by name instead of position 12, so the filter still hits
# 'windspeed' if the column order of the frame ever changes.
wind_is_outlier = (data_train['windspeed'] < wind_lower) | (data_train['windspeed'] > wind_upper)
data_train = data_train.drop(data_train.index[wind_is_outlier])

# Correlation Analysis

In [None]:
# Check the relationship between the variables with scatter plots.

# Temperature against count, drawn from the first 200 rows only.
# NOTE(review): the head of the frame is not a random sample; confirm the subset is intended.
sns.relplot(data = data_train.head(200), x = 'cnt', y = 'temp')
plt.ylim(bottom = 0)

# Author's reading: the graph shows a strong relation between count and temperature.

In [None]:
# atemp against count, drawn from the first 200 rows only.
# NOTE(review): the head of the frame is not a random sample; confirm the subset is intended.
sns.relplot(data = data_train.head(200), x = 'cnt', y = 'atemp')
plt.ylim(bottom = 0)

# Author's reading: the graph shows a strong relation between count and atemp.

In [None]:
# Humidity against count, drawn from the first 200 rows only.
# NOTE(review): the head of the frame is not a random sample; confirm the subset is intended.
sns.relplot(data = data_train.head(200), x = 'cnt', y = 'hum')
plt.ylim(bottom = 0)

# Author's reading: the graph shows a relation between count and humidity.

In [None]:
# Windspeed against count, drawn from the first 200 rows only.
# NOTE(review): the head of the frame is not a random sample; confirm the subset is intended.
sns.relplot(data = data_train.head(200), x = 'cnt', y = 'windspeed')
plt.ylim(bottom = 0)

# Author's reading: the graph shows a very weak relation between count and windspeed.

In [None]:
#check the relation between count and weathersit
# Use every row: the original plotted only the first 600 rows, so the bar
# heights (mean cnt per category) ignored the later part of the data.
sns.barplot(x = 'weathersit', y = 'cnt', data = data_train)

# Author's reading: count depends strongly on weathersit. Rentals are highest on
# 1 = clear / few clouds, very low on 3 = light snow / light rain, and there are
# no rentals on 4 = heavy rain / ice pellets.

In [None]:
#check the relation between count and weekday
# Use every row: the original plotted only the first 200 rows, so each
# weekday's mean was estimated from a small leading slice of the data.
sns.barplot(x = 'weekday', y = 'cnt', data = data_train)

# Author's reading: rentals are similar on all days; they do not depend on the weekday.

In [None]:
# Mean count for holiday vs non-holiday, over the first 800 rows.
sns.barplot(data = data_train.head(800), x = 'holiday', y = 'cnt')

# Author's reading: more rentals happened on non-holidays.

In [None]:
#check the relation between count and season
# Use every row: the original plotted only the first 400 rows, so the seasons
# were not represented by equal stretches of the data.
sns.barplot(x = 'season', y = 'cnt', data = data_train)

# 1 = spring, 2 = summer, 3 = fall, 4 = winter
# Author's reading: season is highly related to count; rentals were best in 3 = fall.

In [None]:
# Mean count for working vs non-working days, over the first 800 rows.
sns.barplot(data = data_train.head(800), x = 'workingday', y = 'cnt')

In [None]:
#check the relation between count and year
# Use every row: the original plotted only the first 400 rows, which for
# date-ordered data leaves the later year with just a handful of early rows.
sns.barplot(x = 'yr', y = 'cnt', data = data_train)

In [None]:
#Bike Rentals Monthly
# Sum `cnt` per month. The original used .size(), which counts rows per month
# rather than bikes rented, contradicting the axis label below.
sales_by_month = data_train.groupby('mnth')['cnt'].sum()
# 'mnth' may be categorical at this point; a plain integer index makes pandas
# plot against the month numbers, so the 1..12 ticks line up with the data.
sales_by_month.index = sales_by_month.index.astype(int)
print(sales_by_month)
#Plotting the Graph
plot_by_month = sales_by_month.plot(title='Monthly Sales', xticks=range(1, 13))
plot_by_month.set_xlabel('Months')
plot_by_month.set_ylabel('Total Bikes Rented')

# Feature Engineering

In [None]:
#checking the multicollinearity between the variables

#Feature selection on the basis of various features like correlation, multicollinearity.

#Correlation Plot
cnames = ["dteday","yr","mnth","workingday","weekday","weathersit","temp","atemp","hum","windspeed", 'season']
df_corr = data_train.loc[:,cnames]

#Set the width and height of the plot
f, ax = plt.subplots(figsize=(7, 5))

#Generate correlation matrix
# Restrict to numeric columns explicitly: datetime and categorical columns
# cannot be correlated, and recent pandas raises on them instead of silently
# skipping them as older versions did.
corr = df_corr.select_dtypes(include='number').corr()

#Plot using seaborn library
# dtype=bool: the `np.bool` alias was removed from NumPy (AttributeError on 1.24+).
# The all-False mask hides no cells.
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)

#Author's reading: the heatmap shows multicollinearity between temp and atemp, so atemp can be dropped

In [None]:
# Drop the columns that are duplicates or add no value to the model.
redundant_cols = ['instant', 'dteday', 'atemp', 'casual', 'registered']
data_train = data_train.drop(columns = redundant_cols)

# Author's rationale: casual and registered add up to the count, which is the
# target variable, so keeping them as predictors would leak the target.

In [None]:
# Preview the first five rows of the frame as it stands at this point.
data_train.head()

# Model Development

In [None]:
#split data into train and test randomly
from sklearn.linear_model import LinearRegression
# Fixed seed: without it every run produces a different 70/30 split, so the
# error figures computed in the cells below cannot be reproduced.
train, test = train_test_split(data_train, test_size = 0.3, random_state = 42)

In [None]:
######     Linear Regression    #######

# Fit on the training rows: columns 0-9 are the predictors, column 10 the target
# (presumably cnt -- verify against the column order of data_train).
lm = LinearRegression().fit(train.iloc[:, :10], train.iloc[:, 10])
predictions_lm = lm.predict(test.iloc[:, :10])

# Figures noted by the author for their run:
# Error = 11.29%
# Accuracy = 88.71%
# R Sq = 0.8116
# Corr = 0.9009

In [None]:
#Calculate MAPE

def MAPE(y_true, y_pred):
    """Return the mean absolute percentage error between two sequences, in percent.

    Parameters
    ----------
    y_true : array-like
        Observed values. A zero entry makes its term infinite (or NaN), because
        each error is divided by the observed value.
    y_pred : array-like
        Predicted values, paired with ``y_true`` by position.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100``.
    """
    # Convert to plain float arrays first so values are paired by position.
    # Subtracting two pandas Series aligns them on index labels instead, which
    # yields NaNs when the labels differ (e.g. a shuffled test split against
    # freshly indexed predictions). This also lets plain lists be passed in.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    mape = np.mean(np.abs((actual - predicted) / actual))*100
    return mape

# MAPE (in percent) of the linear-regression predictions against column 10 of the test split.

MAPE(test.iloc[:,10], predictions_lm)

In [None]:
import math
# R^2 of the linear model, scored on the *training* rows; its square root is
# reported as the correlation. Scored once and reused for both lines.
r_squared = lm.score(train.iloc[:, :10], train.iloc[:, 10])
print('R sq: ', r_squared)
print('Correlation: ', math.sqrt(r_squared))

In [None]:
##########    Decision tree    ########

# Depth-2 regression tree, trained on columns 0-9 to predict column 10.
fit_DT = DecisionTreeRegressor(max_depth=2)
fit_DT.fit(train.iloc[:, :10], train.iloc[:, 10])
predictions_DT = fit_DT.predict(test.iloc[:, :10])

# Figures noted by the author for their run:
# Error = 17.40%
# Accuracy = 82.60%

In [None]:
# MAPE (in percent) of the decision-tree predictions against column 10 of the test split.

MAPE(test.iloc[:,10], predictions_DT)

In [None]:
#######    Random forest    #######

# 200-tree forest trained on columns 0-9 to predict column 10.
# random_state pins the bootstrap sampling and feature selection, so the
# predictions exported later come out the same on every run.
RFmodel = RandomForestRegressor(n_estimators = 200, random_state = 42).fit(train.iloc[:,0:10], train.iloc[:,10])
RF_predictions = RFmodel.predict(test.iloc[:,0:10])

# Figures noted by the author for an earlier, unseeded run:
# Error = 11.16%
# Accuracy = 88.84%

In [None]:
# MAPE (in percent) of the random-forest predictions against column 10 of the test split.

MAPE(test.iloc[:,10], RF_predictions)

# Extracting Data after model evaluation

In [None]:
# First 11 columns of the test split plus the random-forest predictions,
# written to CSV without the index column.
result = test.iloc[:, :11].assign(pred_cnt = RF_predictions)

result.to_csv("Random forest output python.csv", index = False)