In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [40]:
import pandas as pd
# loading the dataset
day_ds = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/day.csv')
hour_ds = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/hour.csv')

In [41]:
hour_ds

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0000,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.80,0.0000,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.80,0.0000,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0000,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,17375,2012-12-31,1,1,12,19,0,1,1,2,0.26,0.2576,0.60,0.1642,11,108,119
17375,17376,2012-12-31,1,1,12,20,0,1,1,2,0.26,0.2576,0.60,0.1642,8,81,89
17376,17377,2012-12-31,1,1,12,21,0,1,1,1,0.26,0.2576,0.60,0.1642,7,83,90
17377,17378,2012-12-31,1,1,12,22,0,1,1,1,0.26,0.2727,0.56,0.1343,13,48,61


In [42]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn import preprocessing

# preprocessing the data

# extracting day of week
hour_ds['dteday'] = pd.to_datetime(hour_ds['dteday'])
hour_ds['day of week'] = hour_ds['dteday'].dt.dayofweek

# extracting cyclical features using sine and cosine transformations
cyclical_features = ['season', 'mnth', 'weekday', 'hr']
for feature in cyclical_features:
  hour_ds[f'{feature}_sin'] = np.sin(2 * np.pi * hour_ds[feature] / hour_ds[feature].max())
  hour_ds[f'{feature}_cos'] = np.cos(2 * np.pi * hour_ds[feature] / hour_ds[feature].max())

# creating binary variables for time indicators
hour_ds['afternoon'] = np.where((hour_ds['hr'] >= 12) & (hour_ds['hr'] <= 18), 1, 0)
hour_ds['working hour'] = np.where((hour_ds['hr'] >= 9) & (hour_ds['hr'] <= 17), 1, 0)
hour_ds['holiday'] = np.where(hour_ds['holiday'] == 1, 1, 0)
hour_ds['working day'] = np.where(hour_ds['workingday'] == 1, 1, 0)
hour_ds['month start'] = np.where(hour_ds['dteday'].dt.is_month_start, 1, 0)
hour_ds['quarter start'] = np.where(hour_ds['dteday'].dt.is_quarter_start, 1, 0)

# creating lagged values
target = ['cnt', 'casual', 'registered'] # target variables

# lagged (recent)
for feature in target:
  for i in range(1, 4):
    hour_ds[f'{feature}_lag{i}'] = hour_ds[feature].shift(i)

# lagged (distant)
for feature in target:
  for i in [24, 48, 168]:
    hour_ds[f'{feature}_lag{i}'] = hour_ds[feature].shift(i)

print(hour_ds.shape)
hour_df = hour_ds.dropna()
print(hour_df.shape)

(17379, 49)
(17211, 49)


In [43]:
hour_df

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,...,registered_lag3,cnt_lag24,cnt_lag48,cnt_lag168,casual_lag24,casual_lag48,casual_lag168,registered_lag24,registered_lag48,registered_lag168
168,169,2011-01-08,1,0,1,7,0,6,0,2,...,1.0,84.0,36.0,16.0,8.0,0.0,3.0,76.0,36.0,13.0
169,170,2011-01-08,1,0,1,8,0,6,0,3,...,5.0,210.0,95.0,40.0,20.0,0.0,8.0,190.0,95.0,32.0
170,171,2011-01-08,1,0,1,9,0,6,0,3,...,2.0,134.0,219.0,32.0,9.0,3.0,5.0,125.0,216.0,27.0
171,172,2011-01-08,1,0,1,10,0,6,0,2,...,8.0,63.0,122.0,13.0,16.0,6.0,3.0,47.0,116.0,10.0
172,173,2011-01-08,1,0,1,11,0,6,0,2,...,15.0,67.0,45.0,1.0,19.0,3.0,0.0,48.0,42.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,17375,2012-12-31,1,1,12,19,0,1,1,2,...,184.0,102.0,97.0,26.0,16.0,7.0,6.0,86.0,90.0,20.0
17375,17376,2012-12-31,1,1,12,20,0,1,1,2,...,150.0,72.0,66.0,18.0,9.0,2.0,4.0,63.0,64.0,14.0
17376,17377,2012-12-31,1,1,12,21,0,1,1,1,...,112.0,47.0,60.0,23.0,5.0,4.0,6.0,42.0,56.0,17.0
17377,17378,2012-12-31,1,1,12,22,0,1,1,1,...,108.0,36.0,54.0,22.0,6.0,3.0,13.0,30.0,51.0,9.0


In [44]:
# scaling
scaler = preprocessing.MinMaxScaler()
scaled_features = [
    'season_sin',
    'season_cos',
    'mnth_sin',
    'mnth_cos',
    'weekday_sin',
    'weekday_cos',
    'hr_sin',
    'hr_cos',
    'cnt',
    'casual',
    'registered',
    'cnt_lag1',
    'cnt_lag2',
    'cnt_lag3',
    'cnt_lag24',
    'cnt_lag48',
    'cnt_lag168'
]

scaled_df = pd.DataFrame(scaler.fit_transform(hour_df[scaled_features]), columns=scaled_features)
hour_df = pd.concat([hour_df, scaled_df], axis=1)

# selecting relevant features
final_cols = [
    'season_sin',
    'season_cos',
    'day of week',
    'mnth_sin',
    'mnth_cos',
    'weekday_sin',
    'weekday_cos',
    'hr_sin',
    'hr_cos',
    'holiday',
    'afternoon',
    'working hour',
    'working day',
    'month start',
    'quarter start',
    'cnt_lag1',
    'cnt_lag2',
    'cnt_lag3',
    'cnt_lag24',
    'cnt_lag48',
    'cnt_lag168'
]

target = 'cnt'

hour_df_final = hour_df[final_cols + [target]]
hour_df_final

Unnamed: 0,season_sin,season_sin.1,season_cos,season_cos.1,day of week,mnth_sin,mnth_sin.1,mnth_cos,mnth_cos.1,weekday_sin,...,cnt_lag3,cnt_lag3.1,cnt_lag24,cnt_lag24.1,cnt_lag48,cnt_lag48.1,cnt_lag168,cnt_lag168.1,cnt,cnt.1
168,1.0,1.0,6.123234e-17,0.5,5.0,0.5,0.75,0.866025,0.933013,-2.449294e-16,...,1.0,0.032787,84.0,0.061475,36.0,0.036885,16.0,0.008197,9.0,0.090164
169,1.0,1.0,6.123234e-17,0.5,5.0,0.5,0.75,0.866025,0.933013,-2.449294e-16,...,5.0,0.059426,210.0,0.077869,95.0,0.052254,40.0,0.014344,15.0,0.102459
170,1.0,1.0,6.123234e-17,0.5,5.0,0.5,0.75,0.866025,0.933013,-2.449294e-16,...,2.0,0.072746,134.0,0.064549,219.0,0.084016,32.0,0.019467,20.0,0.119877
171,1.0,1.0,6.123234e-17,0.5,5.0,0.5,0.75,0.866025,0.933013,-2.449294e-16,...,9.0,0.090164,63.0,0.068648,122.0,0.042008,13.0,0.061475,61.0,0.131148
172,1.0,1.0,6.123234e-17,0.5,5.0,0.5,0.75,0.866025,0.933013,-2.449294e-16,...,15.0,0.102459,67.0,0.091189,45.0,0.060451,1.0,0.062500,62.0,0.130123
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,,1.0,,0.5,,,0.75,,0.933013,,...,,0.004098,,0.070697,,0.027664,,0.015369,,0.009221
164,,1.0,,0.5,,,0.75,,0.933013,,...,,0.000000,,0.160861,,0.072746,,0.006148,,0.022541
165,,1.0,,0.5,,,0.75,,0.933013,,...,,0.002049,,0.118852,,0.205943,,0.000000,,0.032787
166,,1.0,,0.5,,,0.75,,0.933013,,...,,0.009221,,0.044057,,0.141393,,0.004098,,0.059426


In [46]:
print(hour_df_final.shape)
hour_final = hour_df_final.dropna()
print(hour_final.shape)

(17379, 37)
(17043, 37)


In [50]:
from sklearn.model_selection import train_test_split

# splitting the data into train and test
train_sets = []
test_sets = []

# bootstrap sampling
bootstrap_samples = 50
for _ in range(bootstrap_samples):
  X = hour_final[final_cols]
  y = hour_final[target]

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=np.random.randint(1, 1000))

  train_sets.append((X_train, y_train))
  test_sets.append((X_test, y_test))

In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.optimizers import Adam

# building the baseline LSTM model
