In [1]:
# !pip install matplotlib


In [2]:
from pandas import read_csv
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from numpy import sqrt, log1p
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
from math import sin, cos, pi
import warnings
import pickle
warnings.filterwarnings('ignore')
from pathlib import Path

In [3]:
base_path = Path(r'C:\Users\user\Documents\Ariel\mlops_course\Bike_sharing\data')

For more details about this dataset, see [Bike Sharing Demand][1] competition page on Kaggle.

[1]: https://www.kaggle.com/c/bike-sharing-demand "Bike Sharing Demand"

In [4]:
dataset = read_csv(base_path/'bike sharing train.csv', parse_dates=[0], index_col=0)
dataset.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


Create datetime columns

In [5]:
def explode_dt(df):
    df['year'] = df.index.year
    df['month'] = df.index.month
    df['hour'] = df.index.hour
    df['weekday'] = df.index.weekday
    return df

In [6]:
dataset = explode_dt(dataset)
dataset.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,hour,weekday
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,2011,1,0,5
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2011,1,1,5
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2011,1,2,5
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,2011,1,3,5
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,2011,1,4,5


Convert hours to circle data

In [7]:
#x=sin(2pi*hour/24)
#y=cos(2pi*hour/24)

dataset['hour_x'] = dataset['hour'].apply(lambda x: sin(2*pi*x/24))
dataset['hour_y'] = dataset['hour'].apply(lambda x: cos(2*pi*x/24))

drop uneeded columns

In [8]:
drop_columns = ['atemp','season','holiday','weekday','hour']
dataset.drop(drop_columns, axis=1, inplace=True)

In [9]:
dataset.head()

Unnamed: 0_level_0,workingday,weather,temp,humidity,windspeed,casual,registered,count,year,month,hour_x,hour_y
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2011-01-01 00:00:00,0,1,9.84,81,0.0,3,13,16,2011,1,0.0,1.0
2011-01-01 01:00:00,0,1,9.02,80,0.0,8,32,40,2011,1,0.258819,0.965926
2011-01-01 02:00:00,0,1,9.02,80,0.0,5,27,32,2011,1,0.5,0.866025
2011-01-01 03:00:00,0,1,9.84,75,0.0,3,10,13,2011,1,0.707107,0.707107
2011-01-01 04:00:00,0,1,9.84,75,0.0,0,1,1,2011,1,0.866025,0.5


Build 2 models, 1 for casual and one for regritered

In [10]:
prdict_colunms = ['year','month','hour_x','hour_y','temp','humidity','windspeed']

In [11]:
X = dataset[prdict_colunms]
y_casual = dataset['casual']
y_registered = dataset['registered']



casual

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y_casual, test_size=0.2)

In [13]:
X_train.columns

Index(['year', 'month', 'hour_x', 'hour_y', 'temp', 'humidity', 'windspeed'], dtype='object')

In [14]:
model_casual = RandomForestRegressor(n_estimators=100, random_state=0)
model_casual.fit(X_train, y_train)

In [15]:
mean_squared_error(y_test, model_casual.predict(X_test))

851.7025979633282

registered

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y_registered, test_size=0.2)

In [17]:
model_registered = RandomForestRegressor(n_estimators=100, random_state=0)
model_registered.fit(X_train, y_train)

In [18]:
mean_squared_error(y_test, model_registered.predict(X_test))


5654.919762947174

In [19]:

# final model
def prdict_count(data):
  temp_dataset = data[prdict_colunms]

  casual = model_casual.predict(temp_dataset)
  registered = model_registered.predict(temp_dataset)

  return casual+registered

In [20]:
final_result = prdict_count(dataset)


In [21]:
mean_squared_error(dataset['count'],final_result)


2269.780716624963

Save result

In [22]:
# Save the models using pickle
with open(base_path/'model_registered.pkl', 'wb') as f:
    pickle.dump(model_registered, f)

with open(base_path/'model_casual.pkl', 'wb') as f:
    pickle.dump(model_casual, f)
