In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import scipy.stats as st
from matplotlib.ticker import FuncFormatter
import scipy.optimize as op
from preprocessing import *
plt.style.use("dark_background") # Config plots for dark mode, delete if on light mode
plt.rcParams['figure.dpi'] = 150 # Hi-res plots

In [16]:
def N(t_scalar, t):
    """
    Returns the number of times in t less than or equal to t_scalar.
    Is used to compute N(t_{i,k}) and N'(t_{i,k}) depending on whether t above is t or t_prime
    """
    return np.searchsorted(t, t_scalar, side="right")

def get_time_differences(t, t_prime):
    """
    Input: (sorted) times for a particular station i
    Output: List of differences indexed by [h][k] for this station i
    """

    T = end_T
    differences = []
    for h in range(N(T, t)):
        # Construct list of t_ih - t'_ik for k = 1 to N'(T)
        differences_list = t[h] - t_prime[N(t[h-1], t_prime):N(t[h], t_prime)]

        differences.append(differences_list)

    return differences

def get_duration_1_values(t, t_prime, durations):
    """
    This function returns the pre-computed (durations-1) lists for B_i and C_i
    """

    T = end_T
    values = []
    for h in range(N(T, t)):
        values_list = []
        # Construct list of d_ik - 1 for k = 1 to N'(T)
        values_list.append(durations[N(t[h-1], t_prime):N(t[h], t_prime)] - 1)

        values.append(np.array(values_list))

    return values

def get_duration_30_values():
    """
    This function returns the pre-computed |durations-30| lists for B_i and C_i
    """

    T = end_T
    values = []
    for h in range(N(T, t)):
        values_list = []
        # Construct list of |d_ik - 30| for k = 1 to N'(T)
        values_list.append(durations[N(t[h-1], t_prime):N(t[h], t_prime)] - 30)

        values.append(np.absolute(np.array(values_list)))

    return values


get_time_differences(np.array([1,2,3]),np.array([1.1,2.2,3.3]))

[array([], dtype=float64), array([0.9]), array([0.8])]

# TODO: Make sure the durations d_ik correspond exactly to the end times t'_ik. Do we need to get durations from train times sorted by end_time?

In [17]:
durations_dict = {}
for st_id in train_sorted_start_ids:
    station_train_data = train_bike_data[train_bike_data.end_id == st_id]
    durations_dict[st_id] = station_train_data.duration.to_numpy()

In [18]:
def m5_compensator_helper(t_scalar, t_prime, durations):
    """
    Compute the lists that are required for the compensator function but don't depend on the parameters
    """
    t_prime_difference = t_scalar - t_prime[:N(t_scalar, t_prime)]
    d_prime_1 = durations[:N(t_scalar, t_prime)] - 1
    d_prime_30 = np.absolute(durations[:N(t_scalar, t_prime)] - 30)

    return t_prime_difference, d_prime_1, d_prime_30

def m5_recursion_helper(t, t_prime, durations):
    """
    Compute the lists required for the recursive definitions of B_i and C_i
    """
    T = end_T
    time_differences = [] # Holds time differences t_ih - t'_ik
    d1_values = [] # Holds values d'_ik - 1
    d30_values = [] # Holds values |d'_ik - 30|

    for h in range(N(T, t)):
        # Construct list of t_ih - t'_ik for k = 1 to N'(T)
        differences_list = t[h] - t_prime[N(t[h-1], t_prime):N(t[h], t_prime)]

        # Construct list of d_ik - 1 for k = 1 to N'(T)
        d1_values_list = durations[N(t[h-1], t_prime):N(t[h], t_prime)] - 1

        # Construct list of |d_ik - 30| for k = 1 to N'(T)
        d30_values_list = np.absolute(durations[N(t[h-1], t_prime):N(t[h], t_prime)] - 30)

        time_differences.append(differences_list)
        d1_values.append(d1_values_list)
        d30_values.append(d30_values_list)


    return time_differences, d1_values, d30_values

In [19]:
def compensator_m5(t_scalar, t_prime_precomputed, d_prime_1, d_prime_30, lambda_i, alpha_i,
 beta_i, beta_i_prime, gamma_i, delta_i, delta_i_prime):
  """
  Evaluate compensator function for model 5.

  t_scalar: Scalar value at which we evaluate the compensator
  t_prime_precomputed: time differences (t_scalar - t_prime[:N(t_scalar, t_prime)]) array computed outside the function for efficiency
  durations: array of journey durations d'_{i,k} for each arrival time t'_{i,k} at station i
  The rest are parameters in model 5.   
  """

  term1 = lambda_i * t_scalar
  
  term2 = -(alpha_i / beta_i_prime) * np.sum(np.exp(-beta_i_prime * (t_prime_precomputed) - beta_i * d_prime_1)-1)

  term3 = -(gamma_i / delta_i_prime) * np.sum(np.exp(-delta_i_prime * (t_prime_precomputed) - delta_i * d_prime_30)-1)

  return term1 + term2 + term3

In [20]:
def B_i(h, t, t_prime, time_differences, durations, beta_i, beta_i_prime):

    """
    Returns a list of [B_i(1), ..., B_i(h)] for model 5

    Note all index variables such as h, k, etc start at 1, like the mathematical notation.
    """

    # TODO: Compute durations[N(t[h-2], t_prime):N(t[h-1], t_prime)] ahead of time like time_differences
    # Note: we need durations at the exact same indeces as t_prime here

    B = [np.sum(np.exp(-beta_i * (durations[:N(t[0], t_prime)] - 1)) * np.exp(-beta_i_prime*(t[0] - t_prime[:N(t[0], t_prime)])))]

    # Append the rest
    for l in range(2, h+1):
        # First term in recursive formula for B_i(h)
        term1 = np.exp(-beta_i_prime*(t[l-1] - t[l-2])) * B[l-2]
        term2 = np.sum(np.exp(-beta_i * (durations[N(t[l-2], t_prime):N(t[l-1], t_prime)] - 1)) * np.exp(-beta_i_prime*(time_differences[l-1])))
        B.append(term1 + term2)
    return np.array(B)
    

def C_i(h, t, t_prime, time_differences,durations, delta_i, delta_i_prime):

    """
    Returns a list of [C_i(1), ..., C_i(h)] for model 5

    Note all index variables such as h, k, etc start at 1, like the mathematical notation.
    """

    # TODO: Compute durations[N(t[h-2], t_prime):N(t[h-1], t_prime)] ahead of time like time_differences
    # Note: we need durations at the exact same indeces as t_prime here

    C = [np.sum(np.exp(-delta_i * np.absolute(durations[:N(t[0], t_prime)] - 30)) * np.exp(-delta_i_prime*(t[0] - t_prime[:N(t[0], t_prime)])))]

    # Append the rest
    for l in range(2, h+1):
        # First term in recursive formula for C_i(h)
        term1 = np.exp(-delta_i_prime*(t[l-1] - t[l-2])) * C[l-2]
        term2 = np.sum(np.exp(-delta_i * np.absolute(durations[N(t[l-2], t_prime):N(t[l-1], t_prime)] - 30)) * np.exp(-delta_i_prime*(time_differences[l-1])))
        C.append(term1 + term2)
    return np.array(C)

In [25]:
def m5_log_likelihood(station_params, lambda_i, alpha_i, beta_i, beta_i_prime, gamma_i, delta_i, delta_i_prime):
   t, t_prime, durations, t_prime_precomputed, time_differences, d_prime_1, d_prime_30 = station_params
   T = end_T

   B_ = B_i(len(t), t, t_prime, time_differences, durations, beta_i, beta_i_prime)

   C_ = C_i(len(t), t, t_prime, time_differences, durations, delta_i, delta_i_prime)

   term1 = np.sum(np.log(lambda_i + alpha_i*B_ + delta_i*C_))

   term2 = -compensator_m5(T, t_prime_precomputed, d_prime_1, d_prime_30, lambda_i, alpha_i,
      beta_i, beta_i_prime, gamma_i, delta_i, delta_i_prime)

   return term1 + term2

In [27]:
# Test m5 likelihood function

t = t_per_station[1]
t_prime = t_prime_per_station[1]
durations = durations_dict[1]

time_differences = get_time_differences(t, t_prime)
t_prime_precomputed, d_prime_1, d_prime_30 = m5_compensator_helper(end_T, t_prime, durations)

print(m5_log_likelihood([t, t_prime, durations,
    t_prime_precomputed,time_differences, d_prime_1, d_prime_30], 0.1, 0.01, 0.1, 0.1, 0.1, 0.01, 0.01))

#print(compensator_m4(end_T, t, t_prime, 0.2, 0.001, 0.1, 0.01, 0.1))

-29522.895110066085


-28288.312867114553

In [28]:
time_diffs = {}
for st_id in tqdm(train_sorted_start_ids):
    time_diffs[st_id] = get_time_differences(t_per_station[st_id], t_prime_per_station[st_id])

100%|██████████| 789/789 [00:13<00:00, 58.39it/s]


In [29]:
t_prime_precomputed_dict = {}
d_prime_1_dict = {}
d_prime_30_dict = {}
for st_id in tqdm(train_sorted_start_ids):
    t = t_per_station[st_id]
    t_prime = t_prime_per_station[st_id]
    t_prime_precomputed_dict[st_id] = m5_compensator_helper(end_T, t, t_prime)[0]
    d_prime_1_dict[st_id] = m5_compensator_helper(end_T, t, t_prime)[1]
    d_prime_30_dict[st_id] = m5_compensator_helper(end_T, t, t_prime)[2]

100%|██████████| 789/789 [00:00<00:00, 15603.90it/s]


In [30]:
# combine to new dict called station parameters: ordered as t, trunc_t, tprime, trunc_tprime, time_diffs
#m5_log_likelihood(t, t_prime, durations, t_prime_precomputed, time_differences, d_prime_1, d_prime_30
station_time_and_distance_parameters = {}
for st_id in tqdm(train_sorted_start_ids):
    station_time_and_distance_parameters[st_id] = [t_per_station[st_id], t_prime_per_station[st_id],
     t_prime_precomputed_dict[st_id], time_diffs[st_id], d_prime_1_dict[st_id], d_prime_30_dict[st_id]]

100%|██████████| 789/789 [00:00<00:00, 65423.28it/s]


## Likelihood optimisation

In [31]:
model_2_params = pd.read_csv("../data/N_M_params.csv", index_col=0)
model_3_params = pd.read_csv("../data/M3_train_params.csv", index_col=0)
model_3_params

Unnamed: 0,alpha,beta,lambda
1,0.020179,0.035553,7.705058e-03
2,0.015271,0.016036,1.213006e-03
3,0.004644,0.004644,3.779973e-17
4,0.009820,0.023543,9.676464e-03
5,0.011876,0.012869,1.258996e-03
...,...,...,...
836,0.014502,0.039605,1.707352e-02
838,0.006228,0.006470,4.663218e-04
839,0.023140,0.028994,8.233945e-03
840,0.018468,0.019006,2.030979e-04


In [None]:
N_stat = len(train_sorted_start_ids)+1
optimal_parameters = {}
for st_id in tqdm(train_sorted_start_ids[:N_stat]):
    # m2_station_params = model_2_params.loc[st_id].to_numpy()
    # m3_station_params = model_3_params.loc[st_id].to_numpy()
    #x0 = [np.log(m2_station_params[0]), np.log(m2_station_params[1] -m2_station_params[0] ), np.log(m3_station_params[0]), np.log(m3_station_params[1] - m3_station_params[0]), np.log(m3_station_params[-1])] # np.log(rates_dict[station.start_id.unique()[0]])]
    x0 = -np.ones(5) * 3
    op_m5_log_likelihood = lambda x: -m5_log_likelihood(station_time_and_distance_parameters[st_id], np.exp(x[0]), np.exp(x[1]), np.exp(x[2]), np.exp(x[1]) + np.exp(x[3]), np.exp(x[2]) + np.exp(x[4]))
    sol = op.minimize(op_m5_log_likelihood, x0, method="Nelder-Mead")
    if sol.success:
        transformed_lambda = np.exp(sol.x[0])
        transformed_alpha = np.exp(sol.x[1])
        transformed_alpha_prime = np.exp(sol.x[2])
        transformed_beta = np.exp(sol.x[1]) + np.exp(sol.x[3])
        transformed_beta_prime = np.exp(sol.x[2]) + np.exp(sol.x[4])
        max_params = [transformed_lambda, transformed_alpha, transformed_alpha_prime, transformed_beta, transformed_beta_prime]
        optimal_parameters[st_id] = max_params

    else:
        raise OptimizationError(f"Failed to converge for station {st_id}.")