In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

import warnings
warnings.filterwarnings('ignore')

### Reading, Understanding and Quality Check of the data

In [None]:
df = pd.read_csv('./day.csv')

In [None]:
df.head()

In [None]:
df.info()

- There are no missing values, as the output above shows.

In [None]:
# Checking for the missing values

df.isnull().sum()

In [None]:
# Check the statistical description of the numerical columns of the dataframe

df.describe()

#### Visualizing Data with Continuous Variables

In [None]:
df = df.drop(columns=['instant','dteday','casual','registered'])

These variables were removed because they didn’t add unique insights for the analysis:

- Instant: It’s simply a serial number, so it doesn't contribute meaningful information.
- Dteday: This variable is redundant as month and year (mnth, yr) already capture the necessary date details.
- Casual and Registered: These variables were combined into cnt, which represents the total count, making them unnecessary individually.

In [None]:
# summary statistics of numerical variables
df[['temp','atemp','hum','windspeed']].describe()

In [None]:
# Scatter plots: each continuous predictor on the x-axis against the target 'cnt'
continuous_predictors = ['temp', 'atemp', 'hum', 'windspeed']
sns.set_style("whitegrid")
sns.pairplot(
    data=df,
    x_vars=continuous_predictors,
    y_vars='cnt',
    kind='scatter',
    height=5,
    aspect=1,
);

Analyzing Outliers in Continuous Variables Relative to cnt

In [None]:

# Drop rows flagged as outliers in the continuous variables.
# Every rule below is evaluated row by row, so a single combined mask removes
# exactly the rows that applying the rules one after another would remove.
# NOTE(review): the thresholds were presumably read off the scatter plots — confirm.

# temp: very low counts inside two temperature bands
temp_outliers = (
    ((df['temp'] > 15) & (df['temp'] < 20) & (df['cnt'] < 100))
    | ((df['temp'] > 25) & (df['temp'] < 30) & (df['cnt'] < 2000))
)

# atemp: very low counts inside two feeling-temperature bands
atemp_outliers = (
    ((df['atemp'] > 20) & (df['atemp'] < 25) & (df['cnt'] < 100))
    | ((df['atemp'] > 30) & (df['atemp'] < 35) & (df['cnt'] < 2000))
)

# hum / windspeed: extreme readings regardless of count
hum_outliers = df['hum'] < 20
windspeed_outliers = df['windspeed'] > 30

outlier_rows = temp_outliers | atemp_outliers | hum_outliers | windspeed_outliers
df = df.drop(index=df[outlier_rows].index)

In [None]:
# Correlation of each continuous predictor with the target 'cnt',
# rounded to four decimals and sorted in descending order.
cnt_corr = df[['temp','atemp','hum','windspeed','cnt']].corr()['cnt'].round(4)
correlation = (
    cnt_corr.to_frame()
    .sort_values(by='cnt', ascending=False)
    .drop(index=['cnt'])  # the target's correlation with itself is not informative
)
correlation.style.background_gradient(cmap='GnBu')

In [None]:
# correlation between temp and atemp
df[['temp','atemp']].corr()

In [None]:
df = df.drop(columns=['atemp'])

In [None]:
df[['temp','hum','windspeed']].corr()

- Since temp does not show a strong correlation with hum or windspeed, these variables are kept in the dataset for now.

### Exploring Visual Representations of Categorical Variables

In [None]:
# Changing Variables to Categorical Data Type
df[['season','weathersit','mnth']] = df[['season','weathersit','mnth']].astype('category')

In [None]:
# Check for disguised missing values by listing the distinct values of each categorical column
cat_vars = ['season','yr','mnth','holiday','weekday','workingday','weathersit']
for i in cat_vars:
    distinct_values = df[i].unique()
    print('Unique values in ', i, distinct_values)

In [None]:
# Replacing numeric codes with readable labels.
# Each mapping is built by numbering an ordered list of labels.
season_labels = dict(enumerate(['spring', 'summer', 'fall', 'winter'], start=1))

mnth_labels = dict(enumerate(
    ['january', 'february', 'march', 'april', 'may', 'june',
     'july', 'august', 'september', 'october', 'november', 'december'],
    start=1,
))

# considering the first row of dteday to be 01-01-2011
weekday_labels = dict(enumerate(
    ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'],
    start=0,
))

weathersit_labels = dict(enumerate(['clear', 'cloudy', 'light snow/rain'], start=1))

# Apply every mapping to its column, in the same order as before
label_maps = {
    'season': season_labels,
    'mnth': mnth_labels,
    'weekday': weekday_labels,
    'weathersit': weathersit_labels,
}
for column, labels in label_maps.items():
    df[column] = df[column].replace(labels)

df.head()

In [None]:
# Box plots of the rental count 'cnt' for every categorical variable.
cat_vars = ['season','yr','mnth','holiday','weekday','workingday','weathersit']

# Work on an explicit copy: adding and casting columns on a bare slice of df
# is chained assignment and may silently write into (or miss) the original frame.
df1 = df[cat_vars].copy()
df1['cnt'] = df['cnt'].values
df1[['yr','holiday','workingday']] = df1[['yr','holiday','workingday']].astype('category')

plot_dim = [3,3]
fig, axs = plt.subplots(*plot_dim)
fig.set_figheight(15)
fig.set_figwidth(20)

# One panel per categorical variable, titled with the variable's name so each
# panel can be read on its own.
for panel, cat_var in zip(axs.flat, cat_vars):
    panel.set(title=cat_var)
    sns.boxplot(data=df1, x='cnt', y=cat_var, width=0.4, ax=panel)

# The grid has more panels than there are variables; hide whatever is left over.
for panel in axs.flat[len(cat_vars):]:
    panel.set_axis_off()

Seasonal Trends:

- Fall emerges as the peak season for rentals, surpassing even the summer months.
- September stands out as the month with the highest rental volume, closely followed by its neighboring months.

Year-over-Year Comparison:

- 2019 witnessed a substantial increase in rentals compared to 2018, with a median growth of approximately 2000 units.

Weekday vs. Holiday Rentals:

- Weekdays generally exhibit more consistent rental patterns, with relatively stable demand throughout the week.
- Holidays tend to have lower overall rental counts, but with greater fluctuations in demand. Thursdays and Sundays show slightly higher variability in rental numbers compared to other weekdays.

Overall Trend:

- The observed trends suggest a significant influence of seasonal factors on rental demand, particularly the impact of fall and September.
- Additionally, the analysis highlights the importance of weekdays and holidays in shaping rental patterns.

In [None]:

# Drop rows where season is 'spring' and cnt is greater than 7000
spring_high_cnt = (df['season'] == 'spring') & (df['cnt'] > 7000)
df = df.drop(index=df[spring_high_cnt].index)


In [None]:
# Correlation heatmap restricted to the float64 / int64 columns
numeric_df = df.select_dtypes(include=['float64', 'int64'])
corr_matrix = numeric_df.corr()

plt.figure(figsize=[10,10])
sns.heatmap(corr_matrix, cmap='GnBu', center=0, annot=True)
plt.show()


Preparing the Data for Analysis

Creating Binary Variables

In [None]:
# Creating indicator (dummy) variable columns; the first level of each variable
# is dropped to serve as the baseline.
# dtype=int forces 0/1 integer columns. Recent pandas versions return boolean
# dummies by default, and mixing those with float predictors yields an object
# array that statsmodels' OLS and the VIF computation cannot handle.
season_indicators = pd.get_dummies(df['season'], drop_first=True, dtype=int)
mnth_indicators = pd.get_dummies(df['mnth'], drop_first=True, dtype=int)
weekday_indicators = pd.get_dummies(df['weekday'], drop_first=True, dtype=int)
weathersit_indicators = pd.get_dummies(df['weathersit'], drop_first=True, dtype=int)

In [None]:

# Append the indicator columns, then remove the categorical columns they encode
indicator_frames = [season_indicators, mnth_indicators, weekday_indicators, weathersit_indicators]
df = pd.concat([df, *indicator_frames], axis=1).drop(
    columns=['season','mnth','weekday','weathersit']
)

In [None]:
df.head()

In [None]:
df.columns

Dividing Data into Training and Testing Sets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
dtrain,dtest = train_test_split(df,train_size=0.7,test_size=0.3,random_state=120)

Scaling Numerical Features (Min-Max Normalization)

In [None]:
# Min-max normalization of the continuous variables in the training split
from sklearn.preprocessing import MinMaxScaler
num_vars = ['temp','hum','windspeed']
numerical_scaler = MinMaxScaler()

# fit_transform fits the scaler on dtrain and rescales those same columns in one call
dtrain[num_vars] = numerical_scaler.fit_transform(dtrain[num_vars])

Separating the Target (y_train) from the Predictors (X_train)

In [None]:
# Separate the target from the predictors without mutating dtrain.
# dtrain.pop('cnt') removed the column in place, so running this cell a second
# time raised KeyError; selecting and dropping keeps the cell re-runnable.
y_train = dtrain['cnt']
X_train = dtrain.drop(columns=['cnt'])

In [None]:
y_train.head()

In [None]:
X_train.head()

In [None]:
X_train.columns


In [None]:
# Recursive feature elimination with a linear-regression estimator:
# keep 15 features, removing one feature per iteration.
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

lr_estimator = LinearRegression()
rfe = RFE(estimator=lr_estimator, n_features_to_select=15, step=1)
selector = rfe.fit(X_train, y_train)

In [None]:
# RFE feature ranking: one row per feature, lowest rank first
rfe_columns = {
    'rank': selector.ranking_,
    'support': selector.support_,
    'features': X_train.columns,
}
rfe_ranking = pd.DataFrame(rfe_columns).sort_values(by='rank', ascending=True)
rfe_ranking

In [None]:
# Selected features: the names of all features that RFE ranked 1
is_top_ranked = rfe_ranking['rank'] == 1
selected_features = rfe_ranking.loc[is_top_ranked, 'features'].values
selected_features

In [None]:
# Following a stepwise elimination
import statsmodels.api as sm
def ols_fit(y,X) :
    """Fit an OLS regression of y on X, print the summary and return the fit.

    A constant column is added to X with sm.add_constant before fitting, so
    the caller passes the predictors only.

    Parameters: y is the response, X holds the predictor columns.
    Returns: the fitted statsmodels results object.
    """
    design = sm.add_constant(X)
    fitted = sm.OLS(y, design).fit()
    print(fitted.summary())
    return fitted
def vif(X) :
    """Print the variance inflation factor of each design-matrix column, largest first.

    The design matrix is X after sm.add_constant, so the printed table also
    contains a row for the constant. Nothing is returned.
    """
    design = sm.add_constant(X)
    n_columns = design.shape[1]
    factors = [variance_inflation_factor(design.values, position) for position in range(n_columns)]
    vif_frame = pd.DataFrame({'vif' : factors}, index=design.columns).reset_index()
    print(vif_frame.sort_values(by='vif', ascending=False))


Model 
 * Using features selected by RFE : 'yr', 'Sunday', 'Saturday', 'november', 'january', 'december',
 'winter', 'july', 'spring', 'holiday', 'workingday', 'hum', 'temp',
 'windspeed', 'light snow/rain'

In [None]:
# Baseline model: OLS on the full set of RFE-selected features
features_1 = selected_features
ols_fit(y_train, X_train.loc[:, features_1])

In [None]:
# Elimination step: remove 'holiday' from the candidate set and refit
del_feature = 'holiday'
keep = selected_features != del_feature
selected_features = selected_features[keep]
ols_fit(y_train, X_train[selected_features])

In [None]:
# Elimination step: remove 'Sunday' from the candidate set and refit
del_feature = 'Sunday'
keep = selected_features != del_feature
selected_features = selected_features[keep]
ols_fit(y_train, X_train[selected_features])

In [None]:
# Elimination step: remove 'january' from the candidate set and refit
del_feature = 'january'
keep = selected_features != del_feature
selected_features = selected_features[keep]
ols_fit(y_train, X_train[selected_features])

In [None]:
# Elimination step: remove 'december' from the candidate set and refit
del_feature = 'december'
keep = selected_features != del_feature
selected_features = selected_features[keep]
ols_fit(y_train, X_train[selected_features])

In [None]:
# Last elimination step: remove 'november', refit, and keep the result as the final model
del_feature = 'november'
keep = selected_features != del_feature
selected_features = selected_features[keep]
final_model = ols_fit(y_train, X_train[selected_features])

In [None]:
vif(X_train[selected_features])

In [None]:
final_model = ols_fit(y_train,X_train[selected_features])

In [None]:
# Residual analysis on the training data
X_train_sm = sm.add_constant(X_train[selected_features])
y_train_pred = final_model.predict(X_train_sm)
residuals = y_train - y_train_pred

fig, ax = plt.subplots(1, 2)
fig.set_figheight(8)
fig.set_figwidth(16)

# Left panel: distribution of the residuals as a density histogram with a KDE
# overlay. histplot is the replacement for the deprecated distplot.
sns.histplot(residuals, bins=30, kde=True, stat='density', ax=ax[0])
ax[0].set(title='Frequency Distribution of Residuals', xlabel='Residual')

# Right panel: residuals against fitted values. x and y must be passed by
# keyword; current seaborn releases reject them as positional arguments.
sns.regplot(x=y_train_pred, y=residuals, ax=ax[1])
ax[1].set(title='Predicted Values vs Residuals', xlabel='Predicted cnt', ylabel='Residual')
plt.show()


In [None]:
# Mean of Residuals
(y_train-y_train_pred).mean()

In [None]:
# Verifying the normality of the residuals with a Q-Q plot against a
# reference normal sample that has the residuals' mean and standard deviation.
train_residuals = y_train - y_train_pred
mean = train_residuals.mean()
std = train_residuals.std()

# Seeded generator: without a fixed seed the reference sample, and therefore
# the plot, would differ on every run of the notebook.
rng = np.random.default_rng(0)
ref_normal = rng.normal(mean, std, train_residuals.shape[0])

# Compare the two distributions at every 5th percentile (0, 5, ..., 100)
percs = np.linspace(0, 100, 21)
qn_ref_normal = np.percentile(ref_normal, percs)
qn_residual = np.percentile(train_residuals, percs)

plt.plot(qn_ref_normal, qn_residual, ls="", marker="o")

# Dashed y = x reference line spanning the full range of both quantile sets
x = np.linspace(
    min(qn_ref_normal.min(), qn_residual.min()),
    max(qn_ref_normal.max(), qn_residual.max()),
)
plt.plot(x, x, color="k", ls="--")
plt.title('Q-Q Plot : Reference Normal vs Distribution of Residuals ')
plt.show()


Conclusion:-

Based on the analysis, the following factors have a significant impact on shared bike demand:

- Seasonality: Seasonal variations, particularly the preference for bike rentals during warmer months, play a crucial role.
- Weather Conditions: Adverse weather conditions, such as heavy rain, snow, or strong winds, can significantly reduce demand.
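- Holidays and Working Days: In the exploratory box plots, holidays showed lower overall rental counts with greater variability. The holiday indicator was removed during stepwise elimination, while workingday remains in the final model.
- Specific Months: September had the highest rental volume in the exploratory analysis. Of the month indicators selected by RFE, january, december and november were removed during stepwise elimination, while july remains in the final model.
- Day of the Week: Of the weekday indicators selected by RFE, Sunday was removed during stepwise elimination, while Saturday remains in the final model.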
- Temperature and Humidity: Optimal temperature and humidity levels can positively influence bike rentals.

By understanding these factors, bike-sharing companies can make informed decisions regarding bike deployment, pricing strategies, and marketing campaigns to optimize their operations and meet customer demand effectively.