In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import scipy.stats as st
from matplotlib.ticker import FuncFormatter
import scipy.optimize as op
from preprocessing import *
plt.style.use("dark_background") # Config plots for dark mode, delete if on light mode
plt.rcParams['figure.dpi'] = 150 # Hi-res plots

In [14]:
def N(t_scalar, t):
    """
    Returns the number of times in t less than or equal to t_scalar.
    Is used to compute N(t_{i,k}) and N'(t_{i,k}) depending on whether t above is t or t_prime
    """
    return np.searchsorted(t, t_scalar, side="right")

def get_time_differences(t, t_prime):
    """
    Input: (sorted) times for a particular station i
    Output: List of differences indexed by [h][k] for this station i
    """

    T = end_T
    differences = []
    for h in range(N(T, t)):
        differences_list = []
        # Construct list of t_ih - t'_ik for k = 1 to N'(T)
        differences_list.append(t[h] - t_prime[N(t[h-1], t_prime):N(t[h], t_prime)])

        differences.append(np.array(differences_list))

    return differences

def get_duration_1_values():
    """
    This function returns the pre-computed (durations-1) lists for B_i and C_i
    """

    raise NotImplementedError

def get_duration_30_values():
    """
    This function returns the pre-computed |durations-30| lists for B_i and C_i
    """

    raise NotImplementedError

get_time_differences(np.array([1,2,3]),np.array([1.1,2.2,3.3]))

[array([], shape=(1, 0), dtype=float64), array([[0.9]]), array([[0.8]])]

# TODO: Make sure the durations d_ik correspond exactly to the end times t'_ik. Do we need to get durations from train times sorted by end_time?

In [3]:
durations_dict = {}
for st_id in train_sorted_start_ids:
    station_train_data = train_bike_data[train_bike_data.end_id == st_id]
    durations_dict[st_id] = station_train_data.duration.to_numpy()

In [16]:
def m5_compensator_helper(t_scalar, t_prime, durations):
    """
    Compute the list that is required for the compensator function but don't depend on the parameters

    Returns t_prime_precomputed
    """
    t_prime_difference = t_scalar - t_prime[:N(t_scalar, t_prime)]
    d_prime_1 = durations[:N(t_scalar, t_prime)] - 1
    d_prime_30 = np.absolute(durations[:N(t_scalar, t_prime)] - 30)

    return t_prime_difference, d_prime_1, d_prime_30

In [10]:
def compensator_m5(t_scalar, t_prime_precomputed, d_prime_1, d_prime_30, lambda_i, alpha_i,
 beta_i, beta_i_prime, gamma_i, delta_i, delta_i_prime):
  """
  Evaluate compensator function for model 5.

  t_scalar: Scalar value at which we evaluate the compensator
  t_prime_precomputed: time differences (t_scalar - t_prime[:N(t_scalar, t_prime)]) array computed outside the function for efficiency
  durations: array of journey durations d'_{i,k} for each arrival time t'_{i,k} at station i
  The rest are parameters in model 5.   
  """

  term1 = lambda_i * t_scalar
  
  term2 = -(alpha_i / beta_i_prime) * np.sum(np.exp(-beta_i_prime * (t_prime_precomputed) - beta_i * d_prime_1)-1)

  term3 = -(gamma_i / delta_i_prime) * np.sum(np.exp(-delta_i_prime * (t_prime_precomputed) - delta_i * d_prime_30)-1)

  return term1 + term2 + term3

In [11]:
def B_i(h, t, t_prime, durations, beta_i, beta_i_prime, time_differences):

    """
    Returns a list of [B_i(1), ..., B_i(h)] for model 5

    Note all index variables such as h, k, etc start at 1, like the mathematical notation.
    """

    # TODO: Compute durations[N(t[h-2], t_prime):N(t[h-1], t_prime)] ahead of time like time_differences
    # Note: we need durations at the exact same indeces as t_prime here

    B = [np.sum(np.exp(-beta_i * (durations[:N(t[0], t_prime)] - 1)) * np.exp(-beta_i_prime*(t[0] - t_prime[:N(t[0], t_prime)])))]

    # Append the rest
    for l in range(2, h+1):
        # First term in recursive formula for B_i(h)
        term1 = np.exp(-beta_i_prime*(t[l-1] - t[l-2])) * B[l-2]
        term2 = np.sum(np.exp(-beta_i * (durations[N(t[h-2], t_prime):N(t[h-1], t_prime)] - 1)) * np.exp(-beta_i_prime*(time_differences[l-1])))
        B.append(term1 + term2)
    return np.array(B)
    

def C_i(h, t, t_prime, durations, delta_i, delta_i_prime, time_differences):

    """
    Returns a list of [C_i(1), ..., C_i(h)] for model 5

    Note all index variables such as h, k, etc start at 1, like the mathematical notation.
    """

    # TODO: Compute durations[N(t[h-2], t_prime):N(t[h-1], t_prime)] ahead of time like time_differences
    # Note: we need durations at the exact same indeces as t_prime here

    C = [np.sum(np.exp(-delta_i * (durations[:N(t[0], t_prime)] - 1)) * np.exp(-delta_i_prime*(t[0] - t_prime[:N(t[0], t_prime)])))]

    # Append the rest
    for l in range(2, h+1):
        # First term in recursive formula for C_i(h)
        term1 = np.exp(-delta_i_prime*(t[l-1] - t[l-2])) * C[l-2]
        term2 = np.sum(np.exp(-delta_i * (durations[N(t[h-2], t_prime):N(t[h-1], t_prime)] - 1)) * np.exp(-delta_i_prime*(time_differences[l-1])))
        C.append(term1 + term2)
    return np.array(C)

In [22]:
def m5_log_likelihood(t, t_prime, durations, t_prime_precomputed, d_prime_1, d_prime_30, lambda_i,
 alpha_i, beta_i, beta_i_prime, gamma_i, delta_i, delta_i_prime, time_differences):
   
   T = end_T

   B_ = B_i(len(t), t, t_prime, durations, beta_i, beta_i_prime, time_differences)

   C_ = C_i(len(t), t, t_prime, durations, delta_i, delta_i_prime, time_differences)

   term1 = np.sum(np.log(lambda_i + alpha_i*B_[:len(t)+1] + delta_i*C_[:len(t)+1]))

   term2 = -compensator_m5(T, t_prime_precomputed, d_prime_1, d_prime_30, lambda_i, alpha_i,
      beta_i, beta_i_prime, gamma_i, delta_i, delta_i_prime)

   return term1 + term2

In [23]:
# Test m5 likelihood function

t = t_per_station[1]
t_prime = t_prime_per_station[1]
durations = durations_dict[1]

time_differences = get_time_differences(t, t_prime)
t_prime_precomputed, d_prime_1, d_prime_30 = m5_compensator_helper(end_T, t_prime, durations)

print(m5_log_likelihood(t, t_prime, durations,
    t_prime_precomputed, d_prime_1, d_prime_30, 0.1, 0.01, 0.1, 0.1, 0.1, 0.01, 0.01, time_differences))

#print(compensator_m4(end_T, t, t_prime, 0.2, 0.001, 0.1, 0.01, 0.1))

-29521.46168932957
