In [379]:
import pandas as pd
# Convert the data to (0) per-two-hour (1) per-hour, (2) per-30-minute, (3) per-15-minute bin
# ONLY on pickup
d = pd.read_csv('trips_simpler.csv')
d['pickup_datetime'] = pd.to_datetime(d['pickup_datetime'])
d = d.set_index(['pickup_datetime'])
d = d['n']
# 2-hour bin
d2h = d.resample('2H').count()

# 1-hour bin
d1h = d.resample('1H').count()

# 30-min bin
d30m = d.resample('30T').count()

# 15-min bin
d15m = d.resample('15T').count()

In [380]:
d2h.head(12)

pickup_datetime
2019-07-01 00:00:00    2674
2019-07-01 02:00:00    1522
2019-07-01 04:00:00    1524
2019-07-01 06:00:00    2861
2019-07-01 08:00:00    3840
2019-07-01 10:00:00    2911
2019-07-01 12:00:00    2898
2019-07-01 14:00:00    2947
2019-07-01 16:00:00    3044
2019-07-01 18:00:00    3073
2019-07-01 20:00:00    3206
2019-07-01 22:00:00    3049
Freq: 2H, Name: n, dtype: int64

In [381]:
def unnegate(num):
    if num < 0:
        num = 0
    return num

# reshape from [samples, timesteps] into [samples, subsequences, timesteps, features]
def reshapeData(dat):
    dat = array(dat)
    dat = dat.reshape((dat.shape[0], 1, dat.shape[1], 1))
    print(dat.shape)
    return dat

def training_builder(dset, window, step, train_ratio): # e.g. window = 5, step = 2: [1 2 3 4 5] -> 6 7
    d = []
    idx = -window + 1
    for i in range(len(dset)):
        dj = []
        dk = []
        for j in range(window):
            dj.append(dset[unnegate(idx + j)])
        for k in range(step):
            try:
                dk.append(dset[unnegate(idx + window + k)])
            except:
                break
        d.append([dj,dk])
        idx += 1
        result = pd.DataFrame(d[window//2:-step], columns=['inp','outp'])
        inp = result['inp'].tolist()
        outp = result['outp'].tolist()
    inp_train = reshapeData(inp[:int(len(inp)*train_ratio)])
    out_train = outp[:int(len(outp)*train_ratio)]
    inp_test = reshapeData(inp[int(len(inp)*train_ratio):])
    out_test = outp[int(len(outp)*train_ratio):]
    return inp_train, out_train, inp_test, out_test
    
l2h = d2h.tolist(), 12
l1h = d1h.tolist(), 24
l30m = d30m.tolist(), 48
l15m = d15m.tolist(), 96

In [396]:
data_in_foucus = l15m
window = data_in_foucus[1]
timeframe = 2
train_ratio = 0.8
inp_train, out_train, inp_test, out_test = training_builder(data_in_foucus[0], window, timeframe, train_ratio)

(2264, 1, 96, 1)
(566, 1, 96, 1)


In [397]:
# ML part
from numpy import array
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D

In [398]:
model = Sequential()
model.add(TimeDistributed(Conv1D(filters=128, kernel_size=5, activation='relu'), input_shape=(None, window, 1)))
model.add(TimeDistributed(MaxPooling1D(pool_size=4)))
model.add(TimeDistributed(Flatten()))
model.add(LSTM(200, activation='relu'))
model.add(Dense(2))
model.compile(optimizer='adam', loss='mse')

model.summary()

Model: "sequential_46"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
time_distributed_130 (TimeDi (None, None, 92, 128)     768       
_________________________________________________________________
time_distributed_131 (TimeDi (None, None, 23, 128)     0         
_________________________________________________________________
time_distributed_132 (TimeDi (None, None, 2944)        0         
_________________________________________________________________
lstm_41 (LSTM)               (None, 200)               2516000   
_________________________________________________________________
dense_41 (Dense)             (None, 2)                 402       
Total params: 2,517,170
Trainable params: 2,517,170
Non-trainable params: 0
_________________________________________________________________


In [399]:
# Train model
model.fit(inp_train, array(out_train), epochs=750, verbose=0)

<tensorflow.python.keras.callbacks.History at 0x7f9e096604c0>

In [400]:
from sklearn.metrics import mean_squared_error

def getPred(inp):
    pred = []
    for to_test_idx in range(len(inp)):
        x_input = inp[to_test_idx]
        x_input = x_input.reshape((1, 1, window, 1))
        yhat = model.predict(x_input, verbose=0)
        pred.append(yhat.tolist()[0])
    return pred

In [401]:
def predict(inp, outp, label=""):
    rmse = (mean_squared_error(getPred(inp), outp) / timeframe) ** 0.5
    sum_dat = 0
    len_dat = 0
    for i in inp:
        for j in i:
            sum_dat += j[0]
            len_dat += 1
    average_bin_value = sum_dat/len_dat
    in_percent = rmse/average_bin_value*10000//1/100

    print("RMSE " + label + " = " + str(int(rmse)))
    print("Average demand " + label + " = " + str(int(average_bin_value)))
    print("Deviation " + label + " = " + str(in_percent) + "%")

predict(inp_train, out_train, "train")
predict(inp_test, out_test, "test")

RMSE train = 4
Average demand train = 396
Deviation train = [1.03]%
RMSE test = 33
Average demand test = 419
Deviation test = [7.95]%


In [402]:
import matplotlib.pyplot as plt
