In [1]:
import numpy as np
import pandas as pd
import torch
import torch.optim as optim
from torch import nn as nn
import matplotlib.pyplot as plt
import functools as Fntl
from functools import reduce
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

In [2]:
# Read the hourly records from the CSV file into a DataFrame.
df1 = pd.read_csv(filepath_or_buffer="data/from_book/hour.csv")

In [3]:
df1

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0000,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.80,0.0000,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.80,0.0000,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0000,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,17375,2012-12-31,1,1,12,19,0,1,1,2,0.26,0.2576,0.60,0.1642,11,108,119
17375,17376,2012-12-31,1,1,12,20,0,1,1,2,0.26,0.2576,0.60,0.1642,8,81,89
17376,17377,2012-12-31,1,1,12,21,0,1,1,1,0.26,0.2576,0.60,0.1642,7,83,90
17377,17378,2012-12-31,1,1,12,22,0,1,1,1,0.26,0.2727,0.56,0.1343,13,48,61


In [4]:
df1.columns

Index(['instant', 'dteday', 'season', 'yr', 'mnth', 'hr', 'holiday', 'weekday',
       'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
       'casual', 'registered', 'cnt'],
      dtype='object')

#### make 1-hot input features

In [9]:
pd.unique(df1['season'])

array([1, 2, 3, 4])

In [11]:
pd.unique(df1['mnth'])

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [12]:
pd.unique(df1['hr'])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23])

In [13]:
pd.unique(df1['holiday'])

array([0, 1])

`holiday` is already binary (0/1), so it is kept as is.

In [14]:
pd.unique(df1['weekday'])

array([6, 0, 1, 2, 3, 4, 5])

In [17]:
pd.unique(df1['workingday'])

array([0, 1])

`workingday` is already binary (0/1), so it is kept as is.

In [16]:
pd.unique(df1['weathersit'])

array([1, 2, 3, 4])

In [19]:
# Categorical columns that are expanded into one indicator column per level.
dummy_fields = [
    'season',
    'mnth',
    'hr',
    'weekday',
    'weathersit',
]

In [31]:
# One-hot encode every categorical field and place the resulting indicator
# columns side by side, in the order given by `dummy_fields`.
# The dtype is pinned to uint8 so the indicators are numeric 0/1 on every
# pandas version: newer releases default to bool, and a frame that mixes bool
# with int/float columns yields an object array from `.values`, which
# torch.tensor cannot convert.
df_inp = pd.concat([pd.get_dummies(df1[field],
                                   prefix=field,
                                   drop_first=False,
                                   dtype=np.uint8)
                    for field in dummy_fields],
                   axis=1)

In [32]:
df_inp

Unnamed: 0,season_1,season_2,season_3,season_4,mnth_1,mnth_2,mnth_3,mnth_4,mnth_5,mnth_6,...,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weathersit_1,weathersit_2,weathersit_3,weathersit_4
0,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
1,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
2,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
3,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
4,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
17375,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
17376,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
17377,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0


In [33]:
df_inp.columns

Index(['season_1', 'season_2', 'season_3', 'season_4', 'mnth_1', 'mnth_2',
       'mnth_3', 'mnth_4', 'mnth_5', 'mnth_6', 'mnth_7', 'mnth_8', 'mnth_9',
       'mnth_10', 'mnth_11', 'mnth_12', 'hr_0', 'hr_1', 'hr_2', 'hr_3', 'hr_4',
       'hr_5', 'hr_6', 'hr_7', 'hr_8', 'hr_9', 'hr_10', 'hr_11', 'hr_12',
       'hr_13', 'hr_14', 'hr_15', 'hr_16', 'hr_17', 'hr_18', 'hr_19', 'hr_20',
       'hr_21', 'hr_22', 'hr_23', 'weekday_0', 'weekday_1', 'weekday_2',
       'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weathersit_1',
       'weathersit_2', 'weathersit_3', 'weathersit_4'],
      dtype='object')

#### include binary features

In [34]:
# Columns that are already 0/1 and are passed through without encoding.
binary_fields = [
    'holiday',
    'workingday',
]

In [35]:
# Append the already-binary columns unchanged, in the order of `binary_fields`.
# Copies left over from an earlier run of this cell are dropped first, so
# executing the cell again does not add duplicate columns and silently widen
# the feature matrix.
df_inp = pd.concat([df_inp.drop(columns=binary_fields, errors='ignore'),
                    df1[binary_fields]],
                   axis=1)

In [36]:
df_inp

Unnamed: 0,season_1,season_2,season_3,season_4,mnth_1,mnth_2,mnth_3,mnth_4,mnth_5,mnth_6,...,weekday_3,weekday_4,weekday_5,weekday_6,weathersit_1,weathersit_2,weathersit_3,weathersit_4,holiday,workingday
0,1,0,0,0,1,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
1,1,0,0,0,1,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
2,1,0,0,0,1,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
3,1,0,0,0,1,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
4,1,0,0,0,1,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
17375,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
17376,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
17377,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1


In [37]:
df_inp.columns

Index(['season_1', 'season_2', 'season_3', 'season_4', 'mnth_1', 'mnth_2',
       'mnth_3', 'mnth_4', 'mnth_5', 'mnth_6', 'mnth_7', 'mnth_8', 'mnth_9',
       'mnth_10', 'mnth_11', 'mnth_12', 'hr_0', 'hr_1', 'hr_2', 'hr_3', 'hr_4',
       'hr_5', 'hr_6', 'hr_7', 'hr_8', 'hr_9', 'hr_10', 'hr_11', 'hr_12',
       'hr_13', 'hr_14', 'hr_15', 'hr_16', 'hr_17', 'hr_18', 'hr_19', 'hr_20',
       'hr_21', 'hr_22', 'hr_23', 'weekday_0', 'weekday_1', 'weekday_2',
       'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weathersit_1',
       'weathersit_2', 'weathersit_3', 'weathersit_4', 'holiday',
       'workingday'],
      dtype='object')

#### normalize the quantitative data

In [38]:
# Continuous-valued columns that are standardized (inputs and count columns alike).
qnt_fields = [
    'temp',
    'atemp',
    'hum',
    'windspeed',
    'casual',
    'registered',
    'cnt',
]

In [39]:
# Standardize each quantitative column to zero mean / unit standard deviation.
# The (mean, std) pair of every column is kept in `feature_statistics`, keyed
# by column name (presumably so the scaling can be undone later -- confirm).
# NOTE(review): the statistics are taken over every row of df1, including the
# rows that are later split off as the test set.
feature_statistics = {}
df_qnt = pd.DataFrame()
for k in qnt_fields:
    column = df1[k]
    mean = column.mean()
    std = column.std()
    feature_statistics[k] = (mean, std)
    df_qnt[k] = column.sub(mean).div(std)

In [40]:
df_qnt

Unnamed: 0,temp,atemp,hum,windspeed,casual,registered,cnt
0,-1.334609,-1.093249,0.947345,-1.553844,-0.662736,-0.930162,-0.956312
1,-1.438475,-1.181698,0.895513,-1.553844,-0.561326,-0.804632,-0.823998
2,-1.438475,-1.181698,0.895513,-1.553844,-0.622172,-0.837666,-0.868103
3,-1.334609,-1.093249,0.636351,-1.553844,-0.662736,-0.949983,-0.972851
4,-1.334609,-1.093249,0.636351,-1.553844,-0.723582,-1.009445,-1.039008
...,...,...,...,...,...,...,...
17374,-1.230743,-1.269565,-0.141133,-0.211685,-0.500481,-0.302509,-0.388467
17375,-1.230743,-1.269565,-0.141133,-0.211685,-0.561326,-0.480894,-0.553859
17376,-1.230743,-1.269565,-0.141133,-0.211685,-0.581608,-0.467681,-0.548346
17377,-1.230743,-1.181698,-0.348463,-0.456086,-0.459917,-0.698922,-0.708224


#### add quantitative data into df_inp & df_tgt

In [41]:
# Standardized columns used as model inputs.
qnt_features = [
    'temp',
    'atemp',
    'hum',
    'windspeed',
]
# Standardized count columns (candidate targets).
qnt_tgts = [
    'casual',
    'registered',
    'cnt',
]

In [42]:
# Append the standardized input columns to the feature frame.
# Copies left over from an earlier run of this cell are dropped first, so
# executing the cell again does not add duplicate columns and silently widen
# the feature matrix.
df_inp = pd.concat([df_inp.drop(columns=qnt_features, errors='ignore'),
                    df_qnt[qnt_features]],
                   axis=1)

In [43]:
df_inp

Unnamed: 0,season_1,season_2,season_3,season_4,mnth_1,mnth_2,mnth_3,mnth_4,mnth_5,mnth_6,...,weathersit_1,weathersit_2,weathersit_3,weathersit_4,holiday,workingday,temp,atemp,hum,windspeed
0,1,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,-1.334609,-1.093249,0.947345,-1.553844
1,1,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,-1.438475,-1.181698,0.895513,-1.553844
2,1,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,-1.438475,-1.181698,0.895513,-1.553844
3,1,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,-1.334609,-1.093249,0.636351,-1.553844
4,1,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,-1.334609,-1.093249,0.636351,-1.553844
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,-1.230743,-1.269565,-0.141133,-0.211685
17375,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,-1.230743,-1.269565,-0.141133,-0.211685
17376,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,-1.230743,-1.269565,-0.141133,-0.211685
17377,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,-1.230743,-1.181698,-0.348463,-0.456086


In [44]:
df_inp.columns

Index(['season_1', 'season_2', 'season_3', 'season_4', 'mnth_1', 'mnth_2',
       'mnth_3', 'mnth_4', 'mnth_5', 'mnth_6', 'mnth_7', 'mnth_8', 'mnth_9',
       'mnth_10', 'mnth_11', 'mnth_12', 'hr_0', 'hr_1', 'hr_2', 'hr_3', 'hr_4',
       'hr_5', 'hr_6', 'hr_7', 'hr_8', 'hr_9', 'hr_10', 'hr_11', 'hr_12',
       'hr_13', 'hr_14', 'hr_15', 'hr_16', 'hr_17', 'hr_18', 'hr_19', 'hr_20',
       'hr_21', 'hr_22', 'hr_23', 'weekday_0', 'weekday_1', 'weekday_2',
       'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weathersit_1',
       'weathersit_2', 'weathersit_3', 'weathersit_4', 'holiday', 'workingday',
       'temp', 'atemp', 'hum', 'windspeed'],
      dtype='object')

In [45]:
# The target is the standardized `cnt` column only.
df_tgt = df_qnt.loc[:, 'cnt']

In [46]:
df_tgt

0       -0.956312
1       -0.823998
2       -0.868103
3       -0.972851
4       -1.039008
           ...   
17374   -0.388467
17375   -0.553859
17376   -0.548346
17377   -0.708224
17378   -0.774381
Name: cnt, Length: 17379, dtype: float64

#### make the train/test inp/tgt tensors

In [47]:
# Hold out the final 21 * 24 rows as the test split (presumably about three
# weeks of hourly records -- confirm); every earlier row is training data.
n_test_rows = 21 * 24
# Cast to float32 inside pandas so the array is purely numeric even when the
# frame mixes indicator, integer and float columns; torch.tensor then yields a
# float32 tensor directly.
inp_array = df_inp.to_numpy(dtype=np.float32)
train_x = torch.tensor(inp_array[:-n_test_rows])
test_x = torch.tensor(inp_array[-n_test_rows:])

In [48]:
train_x.shape

torch.Size([16875, 57])

In [49]:
test_x.shape

torch.Size([504, 57])

In [51]:
# Convert the target Series to a float32 tensor with one column,
# i.e. shape (len(df_tgt), 1).
tsr_tgt = torch.tensor(df_tgt.values).float().unsqueeze(1)

In [54]:
# Training targets: every row except the last 21 * 24 (same cut as train_x).
train_y = tsr_tgt[:-21 * 24]

In [55]:
# Test targets: the last 21 * 24 rows (same cut as test_x).
test_y = tsr_tgt[-21 * 24:]

In [56]:
train_y.shape

torch.Size([16875, 1])

In [57]:
test_y.shape

torch.Size([504, 1])