In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import scipy.stats as st
from matplotlib.ticker import FuncFormatter
import scipy.optimize as op
from preprocessing import *
plt.style.use("dark_background") # Config plots for dark mode, delete if on light mode
plt.rcParams['figure.dpi'] = 150 # Hi-res plots

In [9]:
# Station metadata table. get_station_name() reads its "Station.Id" and
# "StationName" columns; the path is relative to the notebook's working directory.
station_data = pd.read_csv("../data/santander_locations.csv")


class OptimizationError(RuntimeError):
    """Raised when the numerical optimizer fails to converge."""

class StationIdError(IndexError):
    """Raised when a lookup is attempted for a station id that does not exist."""


def get_station_name(in_id):
    """Return the name of the station whose id equals ``in_id``.

    The lookup is done in the module-level ``station_data`` table: the first
    row whose "Station.Id" equals ``in_id`` supplies the ``StationName``.

    Parameters
    ----------
    in_id
        Station id to look up.

    Returns
    -------
    The ``StationName`` value of the first matching row.

    Raises
    ------
    StationIdError
        If no row of ``station_data`` has that id.
    """
    matching_rows = station_data[station_data["Station.Id"] == in_id]
    try:
        return matching_rows.StationName.iloc[0]
    except IndexError as err:
        # Previously the exception object was built but never raised, so an
        # unknown id silently returned None.
        raise StationIdError("No station matching input ID") from err


# Load the pre-processed trips table (first CSV column is the index).
bike_data = pd.read_csv("../data/processed_df.csv", index_col=0)

# Earliest start time. Only this column is reduced; reducing the whole frame
# (bike_data.min()) does needless work and can fail on non-comparable columns.
x = bike_data["start_time"].min()
# Floor to a multiple of 86400 so the time origin sits on a day boundary
# (assumes the raw times are in seconds -- TODO confirm against preprocessing).
t_min = (x // 86400) * 86400

# Shift to the new origin and divide by 60 (seconds -> minutes under the
# assumption above).
bike_data["start_time"] = (bike_data["start_time"] - t_min) / 60
bike_data["end_time"] = (bike_data["end_time"] - t_min) / 60

# Add independent U[0, 1) jitter to every start and end time (presumably to
# break ties between events recorded at the same time -- TODO confirm).
# A seeded generator makes the jitter, and everything fitted on it,
# reproducible under Restart & Run All.
jitter_rng = np.random.default_rng(0)
bike_data["start_time"] = bike_data["start_time"] \
    + jitter_rng.random(len(bike_data))
bike_data["end_time"] = bike_data["end_time"] \
    + jitter_rng.random(len(bike_data))

# NOTE(review): the two jitters are independent, so a trip shorter than one
# time unit can end up with a negative duration.
bike_data["duration"] = bike_data.end_time - bike_data.start_time
bike_data = bike_data.sort_values(by=["start_time"])

# Length of the training window: 12 * 7 * 24 * 60, i.e. 12 weeks if
# start_time is measured in minutes (it is divided by 60 when loaded).
train_time = 12 * 7 * 24 * 60

# Trips starting inside the window are training data, the rest are test data.
train_bike_data = bike_data[bike_data.start_time <= train_time]
test_bike_data = bike_data[bike_data.start_time > train_time]

# One sub-frame per start station, ordered by ascending station id.
train_sorted_stations_start = [
    train_bike_data[train_bike_data.start_id == st_id]
    for st_id in train_bike_data.start_id.sort_values().unique()
]
test_sorted_stations = [
    test_bike_data[test_bike_data.start_id == st_id]
    for st_id in test_bike_data.start_id.sort_values().unique()
]
# Empirical event rate (events per unit time) for every start station in the
# test period, keyed by station id.
rates_dict = {}
for station in test_sorted_stations:
    # Each sub-frame inherits the start_time ordering of bike_data, so the
    # first/last entries bound the observation span for this station.
    start_times = station.start_time.to_numpy()
    time_elapsed = start_times[-1] - start_times[0]

    # Number of trips for THIS station. The previous expression,
    # test_sorted_stations[0].size, always looked at the first station and
    # counted rows * columns rather than rows.
    n_events = len(station)

    # A station with a single event (or identical times) has no span to
    # estimate a rate from; record NaN instead of dividing by zero.
    rate = n_events / time_elapsed if time_elapsed > 0 else np.nan

    rates_dict[station.start_id.unique()[0]] = rate
station_array = list(rates_dict.keys())


def ecdf(data):
    """Compute the empirical CDF of ``data``.

    Returns a tuple ``(values, fractions)`` where ``values`` is ``data``
    sorted ascending and ``fractions[i] = (i + 1) / n`` for ``n`` samples.
    """
    # Adapted from
    # https://cmdlinetips.com/2019/05/empirical-cumulative-distribution-function-ecdf-in-python/
    ordered = np.sort(data)
    count = ordered.size
    fractions = np.arange(1, count + 1) / count
    return ordered, fractions


# Sorted end times of the trips ending at each station, keyed by end_id.
# Only the end_time column is extracted and sorted (instead of sorting the
# whole sub-frame), and the loop variable no longer shadows the builtin `id`.
tprime_per_station = {}
for station_id in bike_data.end_id.unique():
    arrivals = bike_data.loc[bike_data.end_id == station_id, "end_time"]
    tprime_per_station[station_id] = np.sort(arrivals.to_numpy())

# Sorted start times of the trips starting at each station, keyed by start_id.
t_per_station = {}
for station_id in bike_data.start_id.unique():
    departures = bike_data.loc[bike_data.start_id == station_id, "start_time"]
    t_per_station[station_id] = np.sort(departures.to_numpy())

# Ascending list of every station id that appears as a trip start.
sorted_start_ids = np.sort(bike_data.start_id.unique())

MemoryError: Unable to allocate 30.2 MiB for an array with shape (3961103,) and data type int64

In [4]:
def N(t_scalar, t):
    """
    Counting function: how many entries of the sorted array t are <= t_scalar.

    Passing the start times as t gives N(t_{i,k}); passing the end times
    (t_prime) gives N'(t_{i,k}).
    """
    # side="right" places the insertion point after any entries equal to
    # t_scalar, so ties are included in the count.
    count = np.searchsorted(t, t_scalar, side="right")
    return count

def getTimeDifferences(t, t_prime):
    """
    Differences between each event time in t and the t_prime events that fall
    in the interval ending at it.

    Input: sorted times t and sorted times t_prime for a particular station i.
    Output: list with one array per event h (0-based position h-1). Entry h
        has shape (1, m_h) and holds t_h - t'_k for every k with
        t_{h-1} < t'_k <= t_h. For the first event there is no previous
        event, so the interval is everything up to and including t_1.
        An empty t gives an empty list.
    """
    t = np.asarray(t)
    t_prime = np.asarray(t_prime)
    if t.size == 0:
        return []

    # upper[h-1] = N'(t_h): number of t_prime entries <= t_h, for all h at once.
    upper = np.searchsorted(t_prime, t, side="right")
    # Lower slice bound is the previous event's count, and 0 for the first
    # event. (Indexing t[h-2] at h = 1 used to wrap around to t[-1], which
    # always produced an empty first entry.)
    lower = np.concatenate(([0], upper[:-1]))

    # The leading axis of length 1 keeps the (1, m_h) shape callers receive.
    return [
        (t[pos] - t_prime[lo:hi])[np.newaxis, :]
        for pos, (lo, hi) in enumerate(zip(lower, upper))
    ]

getTimeDifferences(np.array([1,2,3]),np.array([1.1,2.2,3.3]))

[array([], shape=(1, 0), dtype=float64), array([[0.9]]), array([[0.8]])]

In [5]:
def compensator_m4(t_scalar, t, t_prime, lambda_i, alpha_i, beta_i, alpha_i_prime, beta_i_prime):
    """
    Evaluate the compensator Lambda_i at the scalar time t_scalar:

        lambda_i * t_scalar
        + (alpha_i_prime / beta_i_prime) * sum_{t'_k <= t_scalar} (1 - exp(-beta_i_prime * (t_scalar - t'_k)))
        + (alpha_i / beta_i)             * sum_{t_k  <= t_scalar} (1 - exp(-beta_i * (t_scalar - t_k)))

    t_scalar: scalar value where Lambda_i(t) is to be evaluated
    t: event times paired with (alpha_i, beta_i)
    t_prime: arrival times at station i, paired with (alpha_i_prime, beta_i_prime)

    NOTE: t and t_prime must be sorted, because N() is used to cut them off
    at t_scalar.
    """
    # Keep only the events that have happened by t_scalar.
    prime_events = t_prime[:N(t_scalar, t_prime)]
    own_events = t[:N(t_scalar, t)]

    baseline = lambda_i * t_scalar
    prime_part = -(alpha_i_prime / beta_i_prime) * np.sum(
        np.exp(-beta_i_prime * (t_scalar - prime_events)) - 1)
    own_part = -(alpha_i / beta_i) * np.sum(
        np.exp(-beta_i * (t_scalar - own_events)) - 1)
    return baseline + prime_part + own_part


In [6]:
def new_B(h, t, t_prime, beta, time_differences):
    """
    Returns the array [B_i(1), ..., B_i(h)] built with the recursion

        B_i(1) = sum_{t'_k <= t_1} exp(-beta * (t_1 - t'_k))
        B_i(l) = exp(-beta * (t_l - t_{l-1})) * B_i(l-1)
                 + sum(exp(-beta * time_differences[l-1]))

    NOTE: t_prime NEEDS to be sorted here (N() is used on it).

    time_differences: per-event difference arrays for station i; entry l-1
        is used for event l, and entry 0 is not read.

    The formulas above use 1-based event numbers like the mathematical
    notation; the code below uses 0-based positions.
    """
    first = np.sum(np.exp(-beta*(t[0] - t_prime[:N(t[0], t_prime)])))
    values = [first]

    for pos in range(1, h):
        # Decayed contribution carried over from the previous event ...
        carried = np.exp(-beta*(t[pos] - t[pos-1])) * values[pos-1]
        # ... plus the t_prime events that landed since the previous event.
        fresh = np.sum(np.exp(-beta*(time_differences[pos])))
        values.append(carried + fresh)
    return np.array(values)


def new_A(h, t, beta):
    """
    Returns the array [A(1), ..., A(h)] defined by the recursion

        A(1) = 0
        A(i) = exp(-beta * (t_i - t_{i-1})) * (1 + A(i-1))

    h: number of terms to compute
    t: event times (indexable); only the first h entries are read
    beta: decay rate used in the exponential
    """
    values = []
    for pos in range(h):
        if pos == 0:
            values.append(0)
            continue
        decay = np.exp(-1*beta*(t[pos] - t[pos-1]))
        values.append(decay*(1+values[pos-1]))
    return np.array(values)


def m4_log_likelihood(t, t_prime, alpha_i, beta_i, alpha_i_prime, beta_i_prime, lambda_i, time_differences):
    """
    Log likelihood of the five parameters (alpha_i, beta_i, alpha_i_prime,
    beta_i_prime, lambda_i): the sum over events of the log intensity
    lambda_i + alpha_i_prime * B + alpha_i * A, minus the compensator
    evaluated at the last event time.

    t: start times from station i
    t_prime: end times at station i
    time_differences: per-event difference arrays, as consumed by new_B

    NOTE: t_prime NEEDS TO BE SORTED HERE
    """
    n_events = len(t)
    horizon = t[-1]  # TODO: Is this how we get big T?

    # Recursive sums: A is driven by t itself, B by t_prime.
    a_terms = new_A(n_events, t, beta_i)
    b_terms = new_B(n_events, t, t_prime, beta_i_prime, time_differences)

    intensities = lambda_i + alpha_i_prime*b_terms + alpha_i*a_terms
    log_intensity_sum = np.sum(np.log(intensities))

    compensator_at_horizon = compensator_m4(
        horizon, t, t_prime, lambda_i, alpha_i, beta_i, alpha_i_prime, beta_i_prime)

    return log_intensity_sum - compensator_at_horizon


In [7]:
# Smoke test: evaluate the m4 log likelihood for station 1 with hand-picked
# parameter values.
beta = 0.01  # NOTE(review): not used by the call below

t = t_per_station[1]
t_prime = tprime_per_station[1]
time_differences = getTimeDifferences(t, t_prime)

m4_log_likelihood(
    t,
    t_prime,
    alpha_i=0.01,
    beta_i=0.1,
    alpha_i_prime=0.01,
    beta_i_prime=0.1,
    lambda_i=0.1,
    time_differences=time_differences,
)

NameError: name 't_per_station' is not defined

## Finding the parameters using likelihood optimisation

In [74]:
# Pre-compute the per-event time differences once per start station so the
# optimiser does not have to rebuild them on every likelihood evaluation.
time_diffs = {}
for st_id in sorted_start_ids:
    # Both arrays were sorted when the per-station dictionaries were built.
    t, t_prime = t_per_station[st_id], tprime_per_station[st_id]
    time_diffs[st_id] = getTimeDifferences(t, t_prime)

In [78]:
# Fit (alpha, beta, lambda) per start station by minimising the negative
# log likelihood with Nelder-Mead. The search runs in an unconstrained space:
#   alpha  = exp(x[0])
#   beta   = exp(x[0]) + exp(x[1])   (so beta > alpha by construction)
#   lambda = exp(x[2])
# NOTE(review): new_m3_log_likelihood is not defined in the cells shown here;
# presumably it comes from `from preprocessing import *` or an earlier
# cell -- confirm before Restart & Run All.
optimal_parameters = {}
for st_id in sorted_start_ids:
    print(st_id)
    x0 = [np.log(0.01), np.log(0.1), np.log(0.01)] # np.log(rates_dict[station.start_id.unique()[0]])]

    # TODO: What bounds should we use here?
    # Both arrays are already sorted (see the per-station dictionaries).
    t = t_per_station[st_id]
    t_prime = tprime_per_station[st_id]
    time_differences = time_diffs[st_id]

    op_m3_likelihood = lambda x: -new_m3_log_likelihood(
        t, t_prime,
        np.exp(x[0]), np.exp(x[0]) + np.exp(x[1]), np.exp(x[2]),
        time_differences)
    sol = op.minimize(op_m3_likelihood, x0, method="Nelder-Mead")

    #sol = op.minimize(op_m3_likelihood, x0, method="SLSQP")
    if sol.success:
        # Map the optimiser's solution back to the model parameters.
        transformed_alpha = np.exp(sol.x[0])
        transformed_beta = np.exp(sol.x[1]) + np.exp(sol.x[0])
        transformed_lambda = np.exp(sol.x[2])
        max_params = [transformed_alpha, transformed_beta, transformed_lambda]
        optimal_parameters[st_id] = max_params

    else:
        # Report the station being fitted. The message used to interpolate the
        # stale global `station` (a DataFrame left over from the rates loop).
        raise OptimizationError(f"Failed to converge for station {st_id}.")
optimal_parameters

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
22
23
24
25
26
27
28
29
30
31
32
33
34
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
60
61
62
63
64
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
138
139
140
141
142
143
144
145
146
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
199
200
201
202
203
204
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
286
287


{1: [0.019122731860246187, 0.03614626653732669, 0.008170079483272117],
 2: [0.015538489022068145, 0.0165694584293426, 0.0016844615174440344],
 3: [0.004572120211174977, 0.0047259007526767325, 5.7267746575482675e-05],
 4: [0.009156399940533801, 0.021401839879086983, 0.00832705116682503],
 5: [0.01227954775932842, 0.013475410850815, 0.0014276568626566523],
 6: [0.011139101787962293, 0.011519333462536158, 0.0003793172886436994],
 7: [0.016509964101323046, 0.028467954068496067, 0.010451962706451316],
 8: [0.021685862099769516, 0.045135035724945996, 0.02841624128380775],
 9: [0.015441620195876684, 0.017397000114081657, 0.002774592924765038],
 10: [0.009545558455112109, 0.010595598566457283, 0.0021191054677752014],
 11: [0.014021180958530447, 0.014412828730080537, 0.0011880428531101765],
 12: [0.009763595910069336, 0.009763595910080261, 0.0010841186959541687],
 13: [0.00804283998518393, 0.009205924264708589, 0.0021676588881792453],
 14: [0.011211174260123333, 0.05324121076277802, 0.059155882

In [80]:
# param_df = pd.DataFrame(optimal_parameters.values(), index=optimal_parameters.keys(), columns = ["alpha", "beta", "lambda"])
# param_df.head()
# param_df.to_csv("../data/M3_params.csv")

In [276]:
# Fit (x[0], x[1], x[2]) per start station on the training window, searching
# directly in parameter space with box bounds instead of a log transform.
# NOTE(review): new_m3_log_likelihood is not defined in the cells shown here;
# confirm where it comes from before Restart & Run All.
optimal_parameters = {}
for station in train_sorted_stations_start:
    station_id = station.start_id.to_numpy()[0]
    print(station_id)
    x0 = [0.1, 1, 0.1] # np.log(rates_dict[station.start_id.unique()[0]])]

    # Start times of training trips leaving this station (already ordered,
    # because bike_data is sorted by start_time).
    t = station.start_time.to_numpy()
    # End times of training trips that END at this station. The previous
    # code used station.end_time, i.e. the end times of trips that START
    # here, which are arrivals at other stations.
    t_prime = np.sort(
        train_bike_data.loc[
            train_bike_data.end_id == station_id, "end_time"].to_numpy())
    time_differences = getTimeDifferences(t, t_prime)

    op_m3_likelihood = lambda x: -new_m3_log_likelihood(t, t_prime, x[0], x[1], x[2], time_differences)
    # Keep every parameter strictly positive and below 10.
    bounds = ((0.0000001, 10), (0.0000001, 10), (0.0000001, 10))
    sol = op.minimize(op_m3_likelihood, x0, method="Nelder-Mead", bounds=bounds)
    if sol.success:
        max_params = sol.x
        optimal_parameters[station_id] = max_params

    else:
        # Name the failing station, as the log-space fitting cell does.
        raise OptimizationError(f"Failed to converge for station {station_id}.")
optimal_parameters

1
2
3


KeyboardInterrupt: 

## Assessing fit for model 3