In [15]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from datetime import datetime, date

In [49]:
"""
    Data Source : https://archive.ics.uci.edu/ml/datasets/Seoul+Bike+Sharing+Demand
    Dataset link : https://archive.ics.uci.edu/ml/machine-learning-databases/00560/SeoulBikeData.csv
"""

bikeData = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/00560/SeoulBikeData.csv", encoding='latin1')

In [50]:
bikeData.head()

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,01/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,01/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes


In [51]:
# Overview of Data
bikeData.describe()

Unnamed: 0,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm)
count,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0
mean,704.602055,11.5,12.882922,58.226256,1.724909,1436.825799,4.073813,0.569111,0.148687,0.075068
std,644.997468,6.922582,11.944825,20.362413,1.0363,608.298712,13.060369,0.868746,1.128193,0.436746
min,0.0,0.0,-17.8,0.0,0.0,27.0,-30.6,0.0,0.0,0.0
25%,191.0,5.75,3.5,42.0,0.9,940.0,-4.7,0.0,0.0,0.0
50%,504.5,11.5,13.7,57.0,1.5,1698.0,5.1,0.01,0.0,0.0
75%,1065.25,17.25,22.5,74.0,2.3,2000.0,14.8,0.93,0.0,0.0
max,3556.0,23.0,39.4,98.0,7.4,2000.0,27.2,3.52,35.0,8.8


In [52]:
# Counting null values
bikeData.isna().sum()

Date                         0
Rented Bike Count            0
Hour                         0
Temperature(°C)              0
Humidity(%)                  0
Wind speed (m/s)             0
Visibility (10m)             0
Dew point temperature(°C)    0
Solar Radiation (MJ/m2)      0
Rainfall(mm)                 0
Snowfall (cm)                0
Seasons                      0
Holiday                      0
Functioning Day              0
dtype: int64

In [53]:
# inspecting column names
bikeData.columns

Index(['Date', 'Rented Bike Count', 'Hour', 'Temperature(°C)', 'Humidity(%)',
       'Wind speed (m/s)', 'Visibility (10m)', 'Dew point temperature(°C)',
       'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Snowfall (cm)', 'Seasons',
       'Holiday', 'Functioning Day'],
      dtype='object')

In [54]:
# rename the column names, which is more convinient
bikeData.rename(columns={
    "Rented Bike Count": "rented_bike_count", "Temperature(∞C)": "celcious_temperature", "Humidity(%)": "percentage_humidity", 
    "Wind speed (m/s)": "wind_speed", "Visibility (10m)": "visibility", "Dew point temperature(∞C)": "dew_point_temperature_celcious",
    "Solar Radiation (MJ/m2)": "solar_radiation", "Rainfall(mm)": "rainfall", "Snowfall (cm)": "snowfall_cm", "Seasons": "seasons",
    "Holiday": "holiday", "Hour":"hour", "Date": "date", "Functioning Day": "functional_day"
}, inplace=True)

bikeData.columns

Index(['date', 'rented_bike_count', 'hour', 'Temperature(°C)',
       'percentage_humidity', 'wind_speed', 'visibility',
       'Dew point temperature(°C)', 'solar_radiation', 'rainfall',
       'snowfall_cm', 'seasons', 'holiday', 'functional_day'],
      dtype='object')

In [59]:
bikeData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 14 columns):
date                         8760 non-null object
rented_bike_count            8760 non-null int64
hour                         8760 non-null int64
Temperature(°C)              8760 non-null float64
percentage_humidity          8760 non-null int64
wind_speed                   8760 non-null float64
visibility                   8760 non-null int64
Dew point temperature(°C)    8760 non-null float64
solar_radiation              8760 non-null float64
rainfall                     8760 non-null float64
snowfall_cm                  8760 non-null float64
seasons                      8760 non-null object
holiday                      8760 non-null object
functional_day               8760 non-null object
dtypes: float64(6), int64(4), object(4)
memory usage: 958.2+ KB


In [66]:
# Analysing daily average - request for the bike

# collecting daily data
hourlyBikeRequest = bikeData[['date', "rented_bike_count"]]
monthlyBikeRequest = hourlyBikeRequest.groupby("date", as_index=False).sum()  # as_index=False > don't let the date column as the index of genrating dataframe
monthlyBikeRequest["date"] = pd.to_datetime(monthlyBikeRequest['date'])
monthlyBikeRequest2018 = mon

# # ploting entire data looks messy, lets split the data into years
# plt.figure(figsize=(20, 12))
# sns.lineplot(data=dailyBikeRequest, x='date', y='rented_bike_count', legend='brief')
# plt.show()



Unnamed: 0,date,rented_bike_count
0,2018-01-01,4290
1,2018-01-02,5377
2,2018-01-03,5132
3,2018-01-04,17388
4,2018-01-05,26820
...,...,...
360,2018-05-31,31681
361,2018-07-31,22897
362,2018-08-31,27817
363,2018-10-31,21545
