# Program bike1_alan

Kaggle File:
https://www.kaggle.com/datasets/hmavrodiev/london-bike-sharing-dataset/data


## Setup

In [178]:
# Libraries and settings
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn import linear_model

# Use this to turn warnings off (needed for the sklearn regression)
import warnings

# import statsmodels.api as sm
# import seaborn as sns
# import matplotlib.pyplot as plt

pd.set_option('display.max_rows', 1000)
pd.options.display.float_format = "{:,.2f}".format
cr="\n\n"

In [179]:

df= pd.read_csv('Data/london_merged.csv')
#df.head(25)

In [180]:
#df.describe()

In [181]:
# Count number of unique values
input_uniqvals = pd.DataFrame(df.nunique(axis=0),columns=["Unique Values"])
input_uniqvals

Unnamed: 0,Unique Values
timestamp,17414
cnt,3781
t1,73
t2,82
hum,143
wind_speed,103
weather_code,7
is_holiday,2
is_weekend,2
season,4


In [182]:
input_dtypes = pd.DataFrame(df.dtypes, columns=["Data Type"])
input_dtypes

Unnamed: 0,Data Type
timestamp,object
cnt,int64
t1,float64
t2,float64
hum,float64
wind_speed,float64
weather_code,float64
is_holiday,float64
is_weekend,float64
season,float64


## Data Cleansing & Underlying Variables

In [183]:
def extract_substring(value):
    return int(value[11:13])

df['hour'] = df['timestamp'].apply(extract_substring)
#df.head(100)

## Formation of Modelling Variables

### Group bike shares by Temperature and join back into the file the mean shares by group 

In [231]:
df['t1B'] = pd.qcut(df["t1"],q=20,precision=2).copy()
# out=df.groupby("t1B")["cnt"].agg(['count','mean','std','sem'])
out=df.groupby("t1B").agg(t1_mean=('t1', 'mean'),
                          count=('cnt','count'),
                          t1_cnt_mean=('cnt', 'mean'),                                                  
                          sem=('cnt','sem'))                               
#list(out)
out['lower']=np.subtract(out['t1_cnt_mean'],out['sem'])
out['upper']=np.add(out['t1_cnt_mean'],out['sem'])
out


Unnamed: 0_level_0,t1_mean,count,t1_cnt_mean,sem,lower,upper
t1B,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"(-1.51, 4.0]",2.69,1216,611.24,24.04,587.2,635.28
"(4.0, 5.0]",4.89,608,676.61,31.75,644.86,708.35
"(5.0, 6.5]",6.03,932,808.02,28.66,779.36,836.69
"(6.5, 7.5]",7.13,925,832.92,28.68,804.24,861.6
"(7.5, 8.0]",8.0,714,826.06,30.73,795.33,856.79
"(8.0, 9.0]",8.87,1078,852.62,25.97,826.65,878.59
"(9.0, 10.0]",9.87,1014,913.06,29.95,883.12,943.01
"(10.0, 11.0]",10.87,1063,932.45,27.92,904.53,960.36
"(11.0, 11.93]",11.5,286,1001.24,54.15,947.09,1055.39
"(11.93, 12.5]",12.13,1115,1019.24,30.45,988.78,1049.69


In [186]:
df.head()
input_dtypes = pd.DataFrame(df.dtypes, columns=["Data Type"])
input_dtypes


Unnamed: 0,Data Type
timestamp,object
cnt,int64
t1,float64
t2,float64
hum,float64
wind_speed,float64
weather_code,float64
is_holiday,float64
is_weekend,float64
season,float64


In [188]:
fig = px.scatter(out, x='t1_mean', y='t1_cnt_mean',
                 trendline='lowess',trendline_color_override = 'gray',
                 color_discrete_sequence=['brown'])
fig.update_layout(title=dict(text="Mean value of bike shares: variable temperature 't1'"))
fig.update_traces(marker={'size': 13})
fig.show()


In [189]:
out1=out[['t1_cnt_mean']]
out1

Unnamed: 0_level_0,t1_cnt_mean
t1B,Unnamed: 1_level_1
"(-1.51, 4.0]",611.24
"(4.0, 5.0]",676.61
"(5.0, 6.5]",808.02
"(6.5, 7.5]",832.92
"(7.5, 8.0]",826.06
"(8.0, 9.0]",852.62
"(9.0, 10.0]",913.06
"(10.0, 11.0]",932.45
"(11.0, 11.93]",1001.24
"(11.93, 12.5]",1019.24


In [190]:
#df.head()    

In [191]:
# pd.merge(df,out1,left_on="t1B",right_index=True)   - this does not work
df1=df.join(out1,on='t1B')  # this appears to do the correct left join
df1.head(100)

Unnamed: 0,timestamp,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season,hour,t1B,t1_cnt_mean
0,2015-01-04 00:00:00,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0,0,"(-1.51, 4.0]",611.24
1,2015-01-04 01:00:00,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0,1,"(-1.51, 4.0]",611.24
2,2015-01-04 02:00:00,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0,2,"(-1.51, 4.0]",611.24
3,2015-01-04 03:00:00,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0,3,"(-1.51, 4.0]",611.24
4,2015-01-04 04:00:00,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0,4,"(-1.51, 4.0]",611.24
5,2015-01-04 05:00:00,46,2.0,2.0,93.0,4.0,1.0,0.0,1.0,3.0,5,"(-1.51, 4.0]",611.24
6,2015-01-04 06:00:00,51,1.0,-1.0,100.0,7.0,4.0,0.0,1.0,3.0,6,"(-1.51, 4.0]",611.24
7,2015-01-04 07:00:00,75,1.0,-1.0,100.0,7.0,4.0,0.0,1.0,3.0,7,"(-1.51, 4.0]",611.24
8,2015-01-04 08:00:00,131,1.5,-1.0,96.5,8.0,4.0,0.0,1.0,3.0,8,"(-1.51, 4.0]",611.24
9,2015-01-04 09:00:00,301,2.0,-0.5,100.0,9.0,3.0,0.0,1.0,3.0,9,"(-1.51, 4.0]",611.24


In [192]:
#df.describe()

In [193]:
# Verification that it is an inner join and has worked
#df1.describe()

### Group bike shares by Hour of Day and join back into the file the mean shares by group 

In [198]:
df1['hourB'] = pd.qcut(df1["hour"],q=23,precision=2).copy()
#out=df1.groupby("hour")["cnt"].agg(['count','mean','std','sem'])
out=df1.groupby("hourB").agg(hour_mean=('hour', 'mean'),
                          count=('cnt','count'),
                          hour_cnt_mean=('cnt', 'mean'),                                                  
                          sem=('cnt','sem'))                               
#list(out)
out['lower']=np.subtract(out['hour_cnt_mean'],out['sem'])
out['upper']=np.add(out['hour_cnt_mean'],out['sem'])
out

Unnamed: 0_level_0,hour_mean,count,hour_cnt_mean,sem,lower,upper
hourB,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"(-0.01, 1.0]",0.5,1448,245.62,4.58,241.04,250.21
"(1.0, 2.0]",2.0,721,136.3,4.16,132.14,140.46
"(2.0, 3.0]",3.0,721,94.25,3.02,91.23,97.26
"(3.0, 4.0]",4.0,721,73.31,1.72,71.59,75.04
"(4.0, 5.0]",5.0,721,110.71,1.3,109.41,112.01
"(5.0, 6.0]",6.0,726,466.63,9.93,456.7,476.55
"(6.0, 7.0]",7.0,726,1468.75,34.64,1434.11,1503.38
"(7.0, 8.0]",8.0,724,2882.82,66.09,2816.73,2948.91
"(8.0, 9.0]",9.0,727,1653.02,25.88,1627.14,1678.9
"(9.0, 10.0]",10.0,725,1064.66,12.26,1052.4,1076.92


In [199]:
out1=out[['hour_cnt_mean']]
#out1

In [200]:
df2=df1.join(out1,on='hourB')
df2.head(100)

Unnamed: 0,timestamp,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season,hour,t1B,t1_cnt_mean,hourB,hour_cnt_mean
0,2015-01-04 00:00:00,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0,0,"(-1.51, 4.0]",611.24,"(-0.01, 1.0]",245.62
1,2015-01-04 01:00:00,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0,1,"(-1.51, 4.0]",611.24,"(-0.01, 1.0]",245.62
2,2015-01-04 02:00:00,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0,2,"(-1.51, 4.0]",611.24,"(1.0, 2.0]",136.3
3,2015-01-04 03:00:00,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0,3,"(-1.51, 4.0]",611.24,"(2.0, 3.0]",94.25
4,2015-01-04 04:00:00,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0,4,"(-1.51, 4.0]",611.24,"(3.0, 4.0]",73.31
5,2015-01-04 05:00:00,46,2.0,2.0,93.0,4.0,1.0,0.0,1.0,3.0,5,"(-1.51, 4.0]",611.24,"(4.0, 5.0]",110.71
6,2015-01-04 06:00:00,51,1.0,-1.0,100.0,7.0,4.0,0.0,1.0,3.0,6,"(-1.51, 4.0]",611.24,"(5.0, 6.0]",466.63
7,2015-01-04 07:00:00,75,1.0,-1.0,100.0,7.0,4.0,0.0,1.0,3.0,7,"(-1.51, 4.0]",611.24,"(6.0, 7.0]",1468.75
8,2015-01-04 08:00:00,131,1.5,-1.0,96.5,8.0,4.0,0.0,1.0,3.0,8,"(-1.51, 4.0]",611.24,"(7.0, 8.0]",2882.82
9,2015-01-04 09:00:00,301,2.0,-0.5,100.0,9.0,3.0,0.0,1.0,3.0,9,"(-1.51, 4.0]",611.24,"(8.0, 9.0]",1653.02


In [201]:
fig = px.scatter(out, x='hour_mean', y='hour_cnt_mean',
#                 trendline='lowess',trendline_color_override = 'gray',
                 color_discrete_sequence=['brown']).update_traces(mode="lines+markers")
fig.update_layout(title=dict(text="Mean value of bike shares: variable 'hour'"))
fig.update_traces(marker={'size': 13})
fig.show()

## Is Weekend

In [202]:
#df2['is_weekendB'] = pd.qcut(df2["is_weekend"],q=2,precision=2).copy()
df2['is_weekendB'] = df2["is_weekend"].copy()

out=df2.groupby("is_weekendB").agg(is_weekend_mean=('is_weekend', 'mean'),
                          count=('cnt','count'),
                          is_weekend_cnt_mean=('cnt', 'mean'),                                                  
                          sem=('cnt','sem'))                               
#list(out)
out['lower']=np.subtract(out['is_weekend_cnt_mean'],out['sem'])
out['upper']=np.add(out['is_weekend_cnt_mean'],out['sem'])
out

Unnamed: 0_level_0,is_weekend_mean,count,is_weekend_cnt_mean,sem,lower,upper
is_weekendB,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,0.0,12444,1209.27,10.18,1199.09,1219.46
1.0,1.0,4970,977.42,13.12,964.29,990.54


In [203]:
out1=out[['is_weekend_cnt_mean']]
out1

Unnamed: 0_level_0,is_weekend_cnt_mean
is_weekendB,Unnamed: 1_level_1
0.0,1209.27
1.0,977.42


In [204]:
df3=df2.join(out1,on='is_weekendB')
df3.head(100)

Unnamed: 0,timestamp,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season,hour,t1B,t1_cnt_mean,hourB,hour_cnt_mean,is_weekendB,is_weekend_cnt_mean
0,2015-01-04 00:00:00,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0,0,"(-1.51, 4.0]",611.24,"(-0.01, 1.0]",245.62,1.0,977.42
1,2015-01-04 01:00:00,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0,1,"(-1.51, 4.0]",611.24,"(-0.01, 1.0]",245.62,1.0,977.42
2,2015-01-04 02:00:00,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0,2,"(-1.51, 4.0]",611.24,"(1.0, 2.0]",136.3,1.0,977.42
3,2015-01-04 03:00:00,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0,3,"(-1.51, 4.0]",611.24,"(2.0, 3.0]",94.25,1.0,977.42
4,2015-01-04 04:00:00,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0,4,"(-1.51, 4.0]",611.24,"(3.0, 4.0]",73.31,1.0,977.42
5,2015-01-04 05:00:00,46,2.0,2.0,93.0,4.0,1.0,0.0,1.0,3.0,5,"(-1.51, 4.0]",611.24,"(4.0, 5.0]",110.71,1.0,977.42
6,2015-01-04 06:00:00,51,1.0,-1.0,100.0,7.0,4.0,0.0,1.0,3.0,6,"(-1.51, 4.0]",611.24,"(5.0, 6.0]",466.63,1.0,977.42
7,2015-01-04 07:00:00,75,1.0,-1.0,100.0,7.0,4.0,0.0,1.0,3.0,7,"(-1.51, 4.0]",611.24,"(6.0, 7.0]",1468.75,1.0,977.42
8,2015-01-04 08:00:00,131,1.5,-1.0,96.5,8.0,4.0,0.0,1.0,3.0,8,"(-1.51, 4.0]",611.24,"(7.0, 8.0]",2882.82,1.0,977.42
9,2015-01-04 09:00:00,301,2.0,-0.5,100.0,9.0,3.0,0.0,1.0,3.0,9,"(-1.51, 4.0]",611.24,"(8.0, 9.0]",1653.02,1.0,977.42


In [222]:
fig = px.scatter(out, x='is_weekend_mean', y='is_weekend_cnt_mean',
#                 trendline='lowess',trendline_color_override = 'gray',
                 color_discrete_sequence=['brown'])
fig.update_layout(title=dict(text="Mean value of bike shares: variable 'is_weekend'"))
fig.update_traces(marker={'size': 50})
fig.show()

## Multiple Linear Regression

In [230]:
# Perform a Multiple  Linear Regression

# Settings the warnings to be ignored 
warnings.filterwarnings('ignore') 

X=df3[['t1_cnt_mean','hour_cnt_mean','is_weekend_cnt_mean']]
#X=df3[['hour_cnt_mean']]
#X=df3[['hour_cnt_mean','is_weekend_cnt_mean']]
#X=df3[['t1_cnt_mean','hour_cnt_mean']]

y=df3['cnt']
regr = linear_model.LinearRegression()
regr.fit(X, y)
print(regr.coef_)
print(f'r_sqr value: {regr.score(X, y)}')


[0.65143989 0.92932475 0.9982346 ]
r_sqr value: 0.6856576796544995


In [219]:
# Correlations
df2['hour_cnt_mean'].corr(df2['t1_cnt_mean'])

0.2016804926402105