In [316]:
import pandas as pd
# Convert the data to (0) per-two-hour (1) per-hour, (2) per-30-minute, (3) per-15-minute bin
# ONLY on pickup
d = pd.read_csv('trips_simpler.csv')
d['pickup_datetime'] = pd.to_datetime(d['pickup_datetime'])
d = d.set_index(['pickup_datetime'])
d = d['n']
# 2-hour bin
d2h = d.resample('2H').count()

# 1-hour bin
d1h = d.resample('1H').count()

# 30-min bin
d30m = d.resample('30T').count()

# 15-min bin
d15m = d.resample('15T').count()

In [261]:
d2h.head(12)

pickup_datetime
2019-07-01 00:00:00    2674
2019-07-01 02:00:00    1522
2019-07-01 04:00:00    1524
2019-07-01 06:00:00    2861
2019-07-01 08:00:00    3840
2019-07-01 10:00:00    2911
2019-07-01 12:00:00    2898
2019-07-01 14:00:00    2947
2019-07-01 16:00:00    3044
2019-07-01 18:00:00    3073
2019-07-01 20:00:00    3206
2019-07-01 22:00:00    3049
Freq: 2H, Name: n, dtype: int64

In [323]:
def unnegate(num):
    if num < 0:
        num = 0
    return num

# reshape from [samples, timesteps] into [samples, subsequences, timesteps, features]
def reshapeData(dat):
    dat = array(dat)
    dat = dat.reshape((dat.shape[0], 1, dat.shape[1], 1))
    print(dat.shape)
    return dat

def training_builder(dset, window, step, train_ratio): # e.g. window = 5, step = 2: [1 2 3 4 5] -> 6 7
    d = []
    idx = -window + 1
    for i in range(len(dset)):
        dj = []
        dk = []
        for j in range(window):
            dj.append(dset[unnegate(idx + j)])
        for k in range(step):
            try:
                dk.append(dset[unnegate(idx + window + k)])
            except:
                break
        d.append([dj,dk])
        idx += 1
        result = pd.DataFrame(d[window//2:-step], columns=['inp','outp'])
        inp = result['inp'].tolist()
        outp = result['outp'].tolist()
    inp_train = reshapeData(inp[:int(len(inp)*train_ratio)])
    out_train = reshapeData(outp[:int(len(outp)*train_ratio)])
    inp_test = reshapeData(inp[int(len(inp)*train_ratio):])
    out_test = reshapeData(outp[int(len(outp)*train_ratio):])
    return inp_train, out_train, inp_test, out_test
    
l2h = d2h.tolist()
l1h = d1h.tolist()
l30m = d30m.tolist()
l15m = d15m.tolist()

In [324]:
window = 12
timeframe = 2
train_ratio = 0.8
inp_train, out_train, inp_test, out_test = training_builder(l2h, window, timeframe, train_ratio)

(281, 1, 12, 1)
(281, 1, 2, 1)
(71, 1, 12, 1)
(71, 1, 2, 1)


In [325]:
# ML part
from numpy import array
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D

In [326]:
model = Sequential()
model.add(TimeDistributed(Conv1D(filters=128, kernel_size=5, activation='relu'), input_shape=(None, window, 1)))
model.add(TimeDistributed(MaxPooling1D(pool_size=4)))
model.add(TimeDistributed(Flatten()))
model.add(LSTM(200, activation='relu'))
model.add(Dense(2))
model.compile(optimizer='adam', loss='mse')

model.summary()

Model: "sequential_38"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
time_distributed_106 (TimeDi (None, None, 8, 128)      768       
_________________________________________________________________
time_distributed_107 (TimeDi (None, None, 2, 128)      0         
_________________________________________________________________
time_distributed_108 (TimeDi (None, None, 256)         0         
_________________________________________________________________
lstm_33 (LSTM)               (None, 200)               365600    
_________________________________________________________________
dense_33 (Dense)             (None, 2)                 402       
Total params: 366,770
Trainable params: 366,770
Non-trainable params: 0
_________________________________________________________________


In [330]:
# Train model
model.fit(inp_train, out_train, epochs=750, verbose=0)

[[[[2674]
   [2674]
   [2674]
   ...
   [3840]
   [2911]
   [2898]]]


 [[[2674]
   [2674]
   [2674]
   ...
   [2911]
   [2898]
   [2947]]]


 [[[2674]
   [2674]
   [2674]
   ...
   [2898]
   [2947]
   [3044]]]


 ...


 [[[3259]
   [3409]
   [3082]
   ...
   [2951]
   [2905]
   [3069]]]


 [[[3409]
   [3082]
   [1943]
   ...
   [2905]
   [3069]
   [3309]]]


 [[[3082]
   [1943]
   [ 969]
   ...
   [3069]
   [3309]
   [3450]]]] [[[[2947]
   [3044]]]


 [[[3044]
   [3073]]]


 [[[3073]
   [3206]]]


 [[[3206]
   [3049]]]


 [[[3049]
   [1911]]]


 [[[1911]
   [ 998]]]


 [[[ 998]
   [1280]]]


 [[[1280]
   [2510]]]


 [[[2510]
   [3859]]]


 [[[3859]
   [3027]]]


 [[[3027]
   [2880]]]


 [[[2880]
   [2977]]]


 [[[2977]
   [2991]]]


 [[[2991]
   [3303]]]


 [[[3303]
   [3372]]]


 [[[3372]
   [3359]]]


 [[[3359]
   [2076]]]


 [[[2076]
   [1061]]]


 [[[1061]
   [1363]]]


 [[[1363]
   [2630]]]


 [[[2630]
   [3912]]]


 [[[3912]
   [3179]]]


 [[[3179]
   [3108]]]


 [[[3108]
   [32

InvalidArgumentError:  Incompatible shapes: [32,1,2,1] vs. [32,2]
	 [[node mean_squared_error/SquaredDifference (defined at tmp/ipykernel_3252/3569905992.py:2) ]] [Op:__inference_train_function_599360]

Function call stack:
train_function


In [312]:
from sklearn.metrics import mean_squared_error

def getPred(inp):
    pred = []
    for to_test_idx in range(len(inp)):
        x_input = inp[to_test_idx]
        x_input = x_input.reshape((1, 1, window, 1))
        yhat = model.predict(x_input, verbose=0)
        pred.append(yhat.tolist()[0])
    return pred

In [315]:
rmse = (mean_squared_error(train,getPred(train)) / timeframe) ** 0.5
sum_train = 0
len_train = 0
for i in train:
    for j in i:
        sum_train += j
        len_train += 1

average_bin_value_train = sum_train/len_train

print("RMSE ")
print("RMSE = " + str(int(rmse)))
print("Average demand = " + str(int(average_bin_value_train)))
print("Deviation = " + str(rmse/average_bin_value_train*10000//1/100) + "%")

704
RMSE 
RMSE = 251
Average demand = 3201
Deviation = 7.86%


In [314]:
import matplotlib.pyplot as plt
