In [1]:
# imports
import pandas as pd
import numpy as np
import datetime as dt


In [2]:
# load the raw bike-rental records from bike_rentals.csv into a DataFrame
df = pd.read_csv(filepath_or_buffer="bike_rentals.csv")

In [3]:
# preview the first five rows of the data
df.head(n=5)

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1.0,0.0,1.0,0.0,6.0,0.0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1.0,0.0,1.0,0.0,0.0,0.0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1.0,0.0,1.0,0.0,1.0,1.0,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1.0,0.0,1.0,0.0,2.0,1.0,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1.0,0.0,1.0,0.0,3.0,1.0,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


# correcting null values

In [4]:
# count null values 1) per row, 2) per column, 3) over the whole dataset
# (the null mask is built once and reused for all three counts)
null_mask = df.isna()
for null_counts in (
    null_mask.sum(axis=1),    # per row
    null_mask.sum(axis=0),    # per column
    null_mask.sum().sum(),    # grand total
):
    print(null_counts)

0      0
1      0
2      0
3      0
4      0
      ..
726    0
727    0
728    0
729    0
730    2
Length: 731, dtype: int64
instant       0
dteday        0
season        0
yr            1
mnth          1
holiday       0
weekday       0
workingday    0
weathersit    0
temp          1
atemp         1
hum           3
windspeed     5
casual        0
registered    0
cnt           0
dtype: int64
12


In [5]:
# show only the rows that contain at least one null value
rows_with_nulls = df.isna().any(axis=1)
df[rows_with_nulls]
# column-wise alternative: df.loc[:, df.isna().any(axis=0)]

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
56,57,2011-02-26,1.0,0.0,2.0,0.0,6.0,0.0,1,0.2825,0.282192,0.537917,,424,1545,1969
81,82,2011-03-23,2.0,0.0,3.0,0.0,3.0,1.0,2,0.346957,0.337939,0.839565,,203,1918,2121
128,129,2011-05-09,2.0,0.0,5.0,0.0,1.0,1.0,1,0.5325,0.525246,0.58875,,664,3698,4362
129,130,2011-05-10,2.0,0.0,5.0,0.0,2.0,1.0,1,0.5325,0.522721,,0.115671,694,4109,4803
213,214,2011-08-02,3.0,0.0,8.0,0.0,2.0,1.0,1,0.783333,0.707071,,0.20585,801,4044,4845
298,299,2011-10-26,4.0,0.0,10.0,0.0,3.0,1.0,2,0.484167,0.472846,0.720417,,404,3490,3894
388,389,2012-01-24,1.0,1.0,1.0,0.0,2.0,1.0,1,0.3425,0.349108,,0.123767,439,3900,4339
528,529,2012-06-12,2.0,1.0,6.0,0.0,2.0,1.0,2,0.653333,0.597875,0.833333,,477,4495,4972
701,702,2012-12-02,4.0,1.0,12.0,0.0,0.0,0.0,2,,,0.823333,0.124379,892,3757,4649
730,731,2012-12-31,1.0,,,0.0,1.0,0.0,2,0.215833,0.223487,0.5775,0.154846,439,2290,2729


## replace by median/mean

In [6]:
# replace the null values in the "windspeed" column by the column median
# (assign the result back instead of calling fillna(inplace=True) on the
#  selected column: the in-place form acts on an intermediate object and
#  does not update df under pandas Copy-on-Write)
df["windspeed"] = df["windspeed"].fillna(df["windspeed"].median())

In [7]:
# check that the windspeed of rows 56 and 81 is no longer null
rows_to_check = [56, 81]
df.loc[rows_to_check, ["instant", "windspeed"]]

Unnamed: 0,instant,windspeed
56,57,0.180971
81,82,0.180971


## Groupby with the median/mean

In [8]:
# group the dataset by season and take the mean of every numeric column
# (numeric_only=True skips the string-typed "dteday" column, which would
#  otherwise make mean() raise a TypeError on pandas >= 2.0)
df.groupby(by="season").mean(numeric_only=True)

Unnamed: 0_level_0,instant,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1.0,262.685083,0.5,3.044444,0.038674,3.0,0.657459,1.40884,0.297748,0.296914,0.581498,0.214506,334.928177,2269.20442,2604.132597
2.0,308.5,0.5,4.652174,0.021739,2.98913,0.695652,1.402174,0.544405,0.520307,0.627701,0.203062,1106.097826,3886.233696,4992.331522
3.0,401.5,0.5,7.691489,0.021277,3.031915,0.696809,1.297872,0.706309,0.655898,0.634243,0.172095,1202.611702,4441.691489,5644.303191
4.0,493.0,0.5,10.696629,0.033708,2.966292,0.679775,1.477528,0.423332,0.415857,0.668719,0.172176,729.11236,3999.050562,4728.162921


In [9]:
# replace the hum (humidity) null values by the seasonal median

# transform("median") returns one value per row: the median humidity of
# that row's season, aligned with df's index
seasonal_hum_median = df.groupby(by="season")["hum"].transform("median")

# perform the replacement; assign back rather than using inplace=True on
# the selected column, which does not update df under Copy-on-Write
df["hum"] = df["hum"].fillna(seasonal_hum_median)

In [10]:
# check that the hum values of rows 129, 213 and 388 have been filled
rows_to_check = [129, 213, 388]
df.loc[rows_to_check, ["instant", "hum", "season"]]

Unnamed: 0,instant,hum,season
129,130,0.646667,2.0
213,214,0.635833,3.0
388,389,0.54375,1.0


## obtaining the median/mean from specific rows

In [11]:
# list the entries of the temp column that are still null
temp_col = df["temp"]
temp_col[temp_col.isna()]

701   NaN
Name: temp, dtype: float64

In [12]:
# list the entries of the atemp column that are still null
atemp_col = df["atemp"]
atemp_col[atemp_col.isna()]

701   NaN
Name: atemp, dtype: float64

In [13]:
# replace the temp and atemp null values by the average of the day before
# and the day after

# label of the single row with a missing temp; .item() raises a ValueError
# unless exactly one such row exists
index = df.loc[df.temp.isna()].index.item()

# write the neighbour average straight into the missing cell; this avoids
# fillna(inplace=True) on a selected column, which does not update df
# under pandas Copy-on-Write
# NOTE: Series.mean() skips a null neighbour instead of propagating NaN
for col in ["temp", "atemp"]:
    df.loc[index, col] = df.loc[[index - 1, index + 1], col].mean()

In [14]:
# check that the values have been replaced: show the filled row together
# with the day before and the day after (label slices are inclusive)
neighbour_rows = slice(index - 1, index + 1)
df.loc[neighbour_rows, ["instant", "temp", "atemp"]]


Unnamed: 0,instant,temp,atemp
700,701,0.298333,0.316904
701,702,0.375417,0.38635
702,703,0.4525,0.455796


## extrapolate dates

In [15]:
# check which column contains dates and what type of value it stores:
# df.info() lists every column with its non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     731 non-null    int64  
 1   dteday      731 non-null    object 
 2   season      731 non-null    float64
 3   yr          730 non-null    float64
 4   mnth        730 non-null    float64
 5   holiday     731 non-null    float64
 6   weekday     731 non-null    float64
 7   workingday  731 non-null    float64
 8   weathersit  731 non-null    int64  
 9   temp        731 non-null    float64
 10  atemp       731 non-null    float64
 11  hum         731 non-null    float64
 12  windspeed   731 non-null    float64
 13  casual      731 non-null    int64  
 14  registered  731 non-null    int64  
 15  cnt         731 non-null    int64  
dtypes: float64(10), int64(5), object(1)
memory usage: 91.5+ KB


In [16]:
# convert the dteday values from object (string) to "datetime" type
# (infer_datetime_format is deprecated since pandas 2.0, where consistent
#  format inference is already the default, so it is no longer passed)
df["dteday"] = pd.to_datetime(df["dteday"])

In [17]:
# verify the conversion: the displayed dtype should now be datetime64
df["dteday"]

0     2011-01-01
1     2011-01-02
2     2011-01-03
3     2011-01-04
4     2011-01-05
         ...    
726   2012-12-27
727   2012-12-28
728   2012-12-29
729   2012-12-30
730   2012-12-31
Name: dteday, Length: 731, dtype: datetime64[ns]

In [18]:
# overwrite "mnth" with the month taken from dteday so that both columns
# carry consistent information
# note: datetime parts are reached through the .dt accessor
#       (df["dteday"].datetime.month doesn't work)
dteday_month = df["dteday"].dt.month
df["mnth"] = dteday_month

In [19]:
# check that no null value is left in "mnth"
# (an assertion is used because a bare expression that is not the last
#  line of a cell is evaluated but never displayed)
assert not df["mnth"].isna().any(), "mnth still contains null values"

# check the last 5 observations in the dataframe
df.tail()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
726,727,2012-12-27,1.0,1.0,12,0.0,4.0,1.0,2,0.254167,0.226642,0.652917,0.350133,247,1867,2114
727,728,2012-12-28,1.0,1.0,12,0.0,5.0,1.0,2,0.253333,0.255046,0.59,0.155471,644,2451,3095
728,729,2012-12-29,1.0,1.0,12,0.0,6.0,0.0,2,0.253333,0.2424,0.752917,0.124383,159,1182,1341
729,730,2012-12-30,1.0,1.0,12,0.0,0.0,0.0,1,0.255833,0.2317,0.483333,0.350754,364,1432,1796
730,731,2012-12-31,1.0,,12,0.0,1.0,0.0,2,0.215833,0.223487,0.5775,0.154846,439,2290,2729


In [20]:
# correct the missing year flag of the last observation (row 730, dated
# 2012-12-31) by setting it to 1, the value used by the other 2012 rows
df.at[730, "yr"] = 1

In [21]:
# confirm the yr value of the last row has been filled in
df.tail(n=5)

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
726,727,2012-12-27,1.0,1.0,12,0.0,4.0,1.0,2,0.254167,0.226642,0.652917,0.350133,247,1867,2114
727,728,2012-12-28,1.0,1.0,12,0.0,5.0,1.0,2,0.253333,0.255046,0.59,0.155471,644,2451,3095
728,729,2012-12-29,1.0,1.0,12,0.0,6.0,0.0,2,0.253333,0.2424,0.752917,0.124383,159,1182,1341
729,730,2012-12-30,1.0,1.0,12,0.0,0.0,0.0,1,0.255833,0.2317,0.483333,0.350754,364,1432,1796
730,731,2012-12-31,1.0,1.0,12,0.0,1.0,0.0,2,0.215833,0.223487,0.5775,0.154846,439,2290,2729


In [22]:
# list the column dtypes to find out which columns are not numerical
df.info()

# drop the date column so that only numerical columns remain
df = df.drop(columns="dteday")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   instant     731 non-null    int64         
 1   dteday      731 non-null    datetime64[ns]
 2   season      731 non-null    float64       
 3   yr          731 non-null    float64       
 4   mnth        731 non-null    int64         
 5   holiday     731 non-null    float64       
 6   weekday     731 non-null    float64       
 7   workingday  731 non-null    float64       
 8   weathersit  731 non-null    int64         
 9   temp        731 non-null    float64       
 10  atemp       731 non-null    float64       
 11  hum         731 non-null    float64       
 12  windspeed   731 non-null    float64       
 13  casual      731 non-null    int64         
 14  registered  731 non-null    int64         
 15  cnt         731 non-null    int64         
dtypes: datetime64[ns](1), floa

In [23]:
# drop "casual" and "registered": total rentals = casual + registered, so
# keeping them as inputs would hand the model the answer directly
leaky_columns = ["casual", "registered"]
df = df.drop(columns=leaky_columns)

In [None]:
# save data for future use
#df.to_csv("clean_data.csv",index=False)

# Linear regression

In [25]:
# split the data into predictors X and target y

# the target is the "cnt" column (the last column of df); both X and y are
# selected by name so the split does not depend on column order
X = df.drop(columns="cnt")
y = df["cnt"]


## use scikit-learn

In [30]:
# import relevant modules
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [33]:
# hold out 25% of the rows (the default share) as a test set; the fixed
# random_state makes the shuffle reproducible
split_parts = train_test_split(X, y, test_size=0.25, random_state=2)
X_train, X_test, y_train, y_test = split_parts

## build and fit model

In [31]:
# initialize an (unfitted) linear regression model with default settings
model = LinearRegression()

In [34]:
# train the linear regression on the training split
model.fit(X=X_train, y=y_train)

LinearRegression()

In [35]:
# make predictions for the held-out test set with the fitted model
y_pred = model.predict(X_test)

## calculate loss with RMSE

In [36]:
# import mean_square_error to compute mean squared loss
from sklearn.metrics import mean_squared_error

# mean squared error between the true test targets and the predictions
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

# the root of the MSE puts the error back in the units of cnt
rmse = np.sqrt(mse)

# report as "RMSE: value" with two decimals
print(f"RMSE: {rmse:.2f}")

RMSE: 898.21


In [37]:
# summary statistics of cnt, to judge whether the RMSE is big or small
# relative to the scale of the target
df["cnt"].describe()

count     731.000000
mean     4504.348837
std      1937.211452
min        22.000000
25%      3152.000000
50%      4548.000000
75%      5956.000000
max      8714.000000
Name: cnt, dtype: float64

# XGBoosting

In [43]:
# import the XGBoost regressor
from xgboost import XGBRegressor

In [48]:
# build an XGBoost model with the same steps used for linear regression:
# fit on the training split, predict the test split, compute the RMSE
model2 = XGBRegressor().fit(X_train, y_train)
y_pred2 = model2.predict(X_test)

mse2 = mean_squared_error(y_true=y_test, y_pred=y_pred2)
rmse2 = np.sqrt(mse2)

# compare both models; the last line is the relative RMSE reduction
print(f"RMSE from Linear Regression: {rmse:.2f}")
print(f"RMSE from XGBoosting: {rmse2:.2f}")
print(f"Wee! XGBoosting reduced error by {(rmse-rmse2)/rmse*100:.0f}%")

RMSE from Linear Regression: 898.21
RMSE from XGBoosting: 705.11
Wee! XGBoosting reduced error by 21%


# Cross-validation

In [52]:
# import cross validation score library
from sklearn.model_selection import cross_val_score

In [56]:
# 10-fold cross-validation of the linear regression model on all of X, y

# initialize the model
model1 = LinearRegression()

# the scorer returns the negated MSE of each fold
scores1 = cross_val_score(
    estimator=model1,
    X=X,
    y=y,
    scoring="neg_mean_squared_error",
    cv=10,
)

# flip the sign back and take the root to get one RMSE per fold
rmse1 = np.sqrt(-scores1)

# display the per-fold results and their average
print("results for Linear Regression: ", rmse1.round(2))
print("the average is %0.2f" % (rmse1.mean()))

results for Linear Regression:  [ 504.01  840.55 1140.88  728.39  640.2   969.95 1133.45 1252.85 1084.64
 1425.33]
the average is 972.02


In [59]:
# 10-fold cross-validation of XGBoost on all of X, y
# note: model2 and rmse2 are rebound here; rmse2 becomes an array of
#       per-fold RMSEs

model2 = XGBRegressor()

# the scorer returns the negated MSE of each fold
scores2 = cross_val_score(
    estimator=model2,
    X=X,
    y=y,
    scoring="neg_mean_squared_error",
    cv=10,
)

# flip the sign back and take the root to get one RMSE per fold
rmse2 = np.sqrt(-scores2)

print("results for XGBoosting: \n", rmse2.round(2))
print("average RMSE: %.2f" % rmse2.mean())

results for XGBoosting: 
 [ 717.65  692.8   520.7   737.68  835.96 1006.24  991.34  747.61  891.99
 1731.13]
average RMSE: 887.31
