In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import scipy.stats as st
from matplotlib.ticker import FuncFormatter
import scipy.optimize as op
from preprocessing import *
plt.style.use("dark_background") # Config plots for dark mode, delete if on light mode
plt.rcParams['figure.dpi'] = 150 # Hi-res plots

In [2]:
def N(t_scalar, t):
    """
    Returns the number of times in t less than or equal to t_scalar.
    Is used to compute N(t_{i,k}) and N'(t_{i,k}) depending on whether t above is t or t_prime
    """
    return np.searchsorted(t, t_scalar, side="right")

In [3]:
def m5_compensator_helper(t_scalar, t_prime, durations):
    """
    Compute the lists that are required for the compensator function but don't depend on the parameters
    """
    t_prime_difference = t_scalar - t_prime[:N(t_scalar, t_prime)]
    d_prime_1 = durations[:N(t_scalar, t_prime)] - 1
    d_prime_30 = np.absolute(durations[:N(t_scalar, t_prime)] - 30)

    return t_prime_difference, d_prime_1, d_prime_30

def m5_recursion_helper(t, t_prime, durations):
    """
    Compute the lists required for the recursive definitions of B_i and C_i
    """
    T = end_T
    time_differences = [] # Holds time differences t_ih - t'_ik
    d1_values = [] # Holds values d'_ik - 1
    d30_values = [] # Holds values |d'_ik - 30|

    for h in range(N(T, t)):
        # Construct list of t_ih - t'_ik for k = 1 to N'(T)
        differences_list = t[h] - t_prime[N(t[h-1], t_prime):N(t[h], t_prime)]

        # Construct list of d_ik - 1 for k = 1 to N'(T)
        d1_values_list = durations[N(t[h-1], t_prime):N(t[h], t_prime)] - 1

        # Construct list of |d_ik - 30| for k = 1 to N'(T)
        d30_values_list = np.absolute(durations[N(t[h-1], t_prime):N(t[h], t_prime)] - 30)

        time_differences.append(differences_list)
        d1_values.append(d1_values_list)
        d30_values.append(d30_values_list)


    return time_differences, d1_values, d30_values

In [4]:
def compensator_m5(t_scalar, compensator_helpers, 
      lambda_i, alpha_i, beta_i, beta_i_prime, gamma_i, delta_i, delta_i_prime):
    """
    Evaluate compensator function for model 5.

    t_scalar: Scalar value at which we evaluate the compensator
    t_prime_precomputed: time differences (t_scalar - t_prime[:N(t_scalar, t_prime)]) array computed outside the function for efficiency
    durations: array of journey durations d'_{i,k} for each arrival time t'_{i,k} at station i
    The rest are parameters in model 5.   
    """
        
    t_prime_precomputed = compensator_helpers[0]
    d_prime_1 = compensator_helpers[1]
    d_prime_30 = compensator_helpers[2]


    term1 = lambda_i * t_scalar
    
    term2 = -(alpha_i / beta_i_prime) * np.sum(np.exp(-beta_i_prime * (t_prime_precomputed) - beta_i * d_prime_1)-1)

    term3 = -(gamma_i / delta_i_prime) * np.sum(np.exp(-delta_i_prime * (t_prime_precomputed) - delta_i * d_prime_30)-1)

    return term1 + term2 + term3

In [5]:
def B_i(h, t, t_prime, durations, beta_i, beta_i_prime, recursion_helpers):

    """
    Returns a list of [B_i(1), ..., B_i(h)] for model 5

    Note all index variables such as h, k, etc start at 1, like the mathematical notation.
    """

    B = [np.sum(np.exp(-beta_i * (durations[:N(t[0], t_prime)] - 1)) * np.exp(-beta_i_prime*(t[0] - t_prime[:N(t[0], t_prime)])))]

    time_differences = recursion_helpers[0]
    d1_values = recursion_helpers[1]
    #d30_values = recursion_helpers[2]

    # Append the rest
    for l in range(2, h+1):
        # First term in recursive formula for B_i(h)
        term1 = np.exp(-beta_i_prime*(t[l-1] - t[l-2])) * B[l-2]

        term2 = np.sum(np.exp(-beta_i * (d1_values[l-1])) * np.exp(-beta_i_prime*(time_differences[l-1])))
        B.append(term1 + term2)
    return np.array(B)
    

def C_i(h, t, t_prime, durations, delta_i, delta_i_prime, recursion_helpers):

    """
    Returns a list of [C_i(1), ..., C_i(h)] for model 5

    Note all index variables such as h, k, etc start at 1, like the mathematical notation.
    """

    C = [np.sum(np.exp(-delta_i * np.absolute(durations[:N(t[0], t_prime)] - 30)) * np.exp(-delta_i_prime*(t[0] - t_prime[:N(t[0], t_prime)])))]

    time_differences = recursion_helpers[0]
    #d1_values = recursion_helpers[1]
    d30_values = recursion_helpers[2]

    # Append the rest
    for l in range(2, h+1):
        # First term in recursive formula for C_i(h)
        term1 = np.exp(-delta_i_prime*(t[l-1] - t[l-2])) * C[l-2]
        term2 = np.sum(np.exp(-delta_i * (d30_values[l-1])) * np.exp(-delta_i_prime*(time_differences[l-1])))
        C.append(term1 + term2)
    return np.array(C)

In [6]:
def m5_log_likelihood(t, t_prime, durations, compensator_helpers, recursion_helpers, lambda_i,
 alpha_i, beta_i, beta_i_prime, gamma_i, delta_i, delta_i_prime):
   
   T = end_T

   B_ = B_i(len(t), t, t_prime, durations, beta_i, beta_i_prime, recursion_helpers)

   C_ = C_i(len(t), t, t_prime, durations, delta_i, delta_i_prime, recursion_helpers)

   term1 = np.sum(np.log(lambda_i + alpha_i*B_ + gamma_i*C_))

   term2 = -compensator_m5(T, compensator_helpers, lambda_i, alpha_i,
      beta_i, beta_i_prime, gamma_i, delta_i, delta_i_prime)

   return term1 + term2

### TODO: Make sure the durations d_ik correspond exactly to the end times t'_ik. Do we need to get durations from train times sorted by end_time?

In [7]:
durations_per_end_time[1]

array([38.0811817 , 27.80359035, 11.58253296, ...,  6.93912279,
       20.85816307,  6.99643917])

In [8]:
# Test m5 likelihood function

t = t_per_station[2]
t_prime = t_prime_per_station[2]
durations = durations_per_end_time[2]
compensator_helpers = m5_compensator_helper(end_T, t_prime, durations)
recursion_helpers = m5_recursion_helper(t, t_prime, durations)

print(m5_log_likelihood(t, t_prime, durations,
    compensator_helpers, recursion_helpers, 0.1, 0.01, 0.1, 0.1, 0.1, 0.01, 0.011))

-68798.12740072113


In [9]:
condensed_log_l_m5 = lambda time_params, model_params: m5_log_likelihood(time_params[0],time_params[1], time_params[2], time_params[3], time_params[4], model_params[0], model_params[1], model_params[2], model_params[3], model_params[4], model_params[5],  model_params[6])

In [10]:
t_prime_per_station[1][1]

576.533889934963

In [11]:
compensator_helper_dict = {}
recursion_helper_dict = {}
for st_id in tqdm(train_sorted_start_ids):
    compensator_helper_dict[st_id] = m5_compensator_helper(end_T, t_prime_per_station[st_id], durations_per_end_time[st_id])
    recursion_helper_dict[st_id] = m5_recursion_helper(t_per_station[st_id], t_prime_per_station[st_id], durations_per_end_time[st_id])

 83%|████████▎ | 655/789 [03:29<00:42,  3.13it/s]


In [None]:
time_params_dict = {}
for st_id in tqdm(train_sorted_start_ids):
    time_params_dict[st_id] = [t_per_station[st_id], t_prime_per_station[st_id],
    durations_per_end_time[st_id], compensator_helper_dict[st_id], recursion_helper_dict[st_id]]


100%|██████████| 789/789 [00:00<00:00, 4892.94it/s]


In [49]:
time_params_dict[1] == time_params_dict[2]

  time_params_dict[1] == time_params_dict[2]


False

In [31]:
def transform_params(params):
    if len(params) != 7:
        raise ValueError("Not the right size.")
    lambda_i, alpha_i, beta_i,\
         beta_i_prime, gamma_i, \
            delta_i, delta_i_prime = params
    return [np.exp(lambda_i), np.exp(alpha_i), np.exp(beta_i),
     np.exp(alpha_i) + np.exp(beta_i_prime), np.exp(gamma_i),
     np.exp(delta_i), np.exp(gamma_i) + np.exp(delta_i_prime)]

In [55]:
unconverged_it1 = []
N_stat = 5
optimal_parameters_it1 = {}
for st_id in tqdm(train_sorted_start_ids[:N_stat]):
    # m2_station_params = model_2_params.loc[st_id].to_numpy()
    # m3_station_params = model_3_params.loc[st_id].to_numpy()
    #x0 = [np.log(m2_station_params[0]), np.log(m2_station_params[1] -m2_station_params[0] ), np.log(m3_station_params[0]), np.log(m3_station_params[1] - m3_station_params[0]), np.log(m3_station_params[-1])] # np.log(rates_dict[station.start_id.unique()[0]])]
    #x0 = np.random.random(7)/10
    x0 = np.ones(7) * -3
    op_m5_log_likelihood = lambda x: -condensed_log_l_m5(time_params_dict[st_id], transform_params(x))
    sol = op.minimize(op_m5_log_likelihood, x0, method="Nelder-Mead")
    if sol.success:
        max_params = transform_params(sol.x)
        optimal_parameters_it1[st_id] = max_params
    else:
        unconverged_it1.append(st_id)
        print(f"{st_id} failed to converge.")

100%|██████████| 5/5 [00:11<00:00,  2.28s/it]


In [56]:
optimal_parameters_it1

{1: [1.0269163979862075,
  1.0562799353686771,
  1.0835429989768606,
  2.083247666634845,
  1.0100739024378216,
  1.102897288580086,
  2.0996888273173586],
 2: [1.0588489388722486,
  1.049501037173833,
  1.044553855309569,
  2.093647158839925,
  1.0239268710327096,
  1.0652065505526351,
  2.044006708564674],
 3: [1.013036669370214,
  1.051750260916667,
  1.0354996100826204,
  2.0835361081552968,
  1.0825873921577969,
  1.0597812660114292,
  2.0840766231240493],
 4: [1.0743135067015799,
  1.076402258163898,
  1.104850597413892,
  2.128028792863896,
  1.0812959283278782,
  1.0557590477962926,
  2.157664866945902],
 5: [1.004820024120488,
  1.0714430070111545,
  1.03468316552485,
  2.1724245957579367,
  1.075731933792216,
  1.0211110013769884,
  2.0950222356931167]}

In [37]:
m5_param_df = pd.DataFrame(optimal_parameters_it1.values(), index=optimal_parameters_it1.keys(), columns = ["lambda", "alpha", "beta", "beta_i_prime", "gamma_i", "delta_i", "delta_i_prime"])
m5_param_df.head() 

Unnamed: 0,lambda_i,alpha_i,beta_i,beta_i_prime,gamma_i,delta_i,delta_i_prime
1,0.049787,0.049787,0.049787,0.099574,0.049787,0.049787,0.099574
2,0.049787,0.049787,0.049787,0.099574,0.049787,0.049787,0.099574
3,0.049787,0.049787,0.049787,0.099574,0.049787,0.049787,0.099574
4,0.049787,0.049787,0.049787,0.099574,0.049787,0.049787,0.099574
5,0.049787,0.049787,0.049787,0.099574,0.049787,0.049787,0.099574
