In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import scipy.stats as st
from matplotlib.ticker import FuncFormatter
import scipy.optimize as op
from preprocessing import *
plt.style.use("dark_background") # Config plots for dark mode, delete if on light mode
plt.rcParams['figure.dpi'] = 150 # Hi-res plots

In [2]:
def N(t_scalar, t):
    """
    Returns the number of times in t less than or equal to t_scalar.
    Is used to compute N(t_{i,k}) and N'(t_{i,k}) depending on whether t above is t or t_prime
    """

    return np.searchsorted(t, t_scalar, side="right")

def getTimeDifferences(t, t_prime):
    """
    Input: (sorted) times for a particular station i
    Output: List of differences indexed by [h][k] for this station i
    """

    # h goes until N(t[-1], t) assuming T = t[-1]
    T = t[-1]
    D_result = []
    for h in range(1, N(T, t)+1):
        differences_list = []
        # Construct list of t_ih - t'_ik for k = 1 to N'(T)
        differences_list.append(t[h-1] - t_prime[N(t[h-2], t_prime):N(t[h-1], t_prime)])

        D_result.append(np.array(differences_list))

    return D_result

getTimeDifferences(np.array([1,2,3]),np.array([1.1,2.2,3.3]))

[array([], shape=(1, 0), dtype=float64), array([[0.9]]), array([[0.8]])]

In [3]:
def compensator_m4(t_scalar, t, t_prime, lambda_i, alpha_i, beta_i, alpha_i_prime, beta_i_prime):
    """
    t_scalar: scalar value where Lambda_i(t) is to be evaluated
    t_prime: list of arrival times at station i

    NOTE: t_prime NEEDS TO BE SORTED HERE.
    """

    term1 = lambda_i * t_scalar
    term2 = -(alpha_i_prime / beta_i_prime) * np.sum(np.exp(-beta_i_prime * (t_scalar - t_prime[:N(t_scalar, t_prime)]))-1)
    term3 = -(alpha_i / beta_i) * np.sum(np.exp(-beta_i * (t_scalar - t[:N(t_scalar, t)]))-1)
    return term1 + term2 + term3


In [4]:
def new_B(h, t, t_prime, beta, time_differences):

    """
    Returns a list of [B_i(1), ..., B_i(h)]

    NOTE: t_prime NEEDS to be sorted here

    time_differences: time differences double list for station i

    Note all index variables such as h, k, etc start at 1, like the mathematical notation.
    """
    B = [np.sum(np.exp(-beta*(t[0] - t_prime[:N(t[0], t_prime)])))]

    # Append the rest
    for l in range(2, h+1):
        # First term in recursive formula for B_i(h)
        term1 = np.exp(-beta*(t[l-1] - t[l-2])) * B[l-2]
        term2 = np.sum(np.exp(-beta*(time_differences[l-1])))
        B.append(term1 + term2)
    return np.array(B)


def new_A(h, t, beta):

    A = []
    for i in range(1, h+1):
        if i==1:
            A.append(0)
        else:
            A.append(np.exp(-1*beta*(t[i-1] - t[i-2]))*(1+A[i-2]))
    return np.array(A)


def m4_log_likelihood(t, t_prime, time_differences, alpha_i, beta_i, alpha_i_prime, beta_i_prime, lambda_i):
    """
    Gives log likelihood of our five parameters. 
    t: start times from station i
    t_prime: end times at station i

    NOTE: t_prime NEEDS TO BE SORTED HERE
    """
    
    T = end_T # TODO: Is this how we get big T?

    # Get A list
    A_ = new_A(len(t), t, beta_i)

    # Get B list 
    B_ = new_B(len(t), t, t_prime, beta_i_prime, time_differences)

    term1 = np.sum(np.log(lambda_i + alpha_i_prime*B_[:len(t)+1] + alpha_i*A_[:len(t)+1]))

    term2 = -compensator_m4(T, t, t_prime, lambda_i, alpha_i, beta_i, alpha_i_prime, beta_i_prime)

    return term1 + term2


In [6]:
# Test m4 likelihood function
beta = 0.01

t = t_per_station[1]
t_prime = t_prime_per_station[1]

time_differences = getTimeDifferences(t, t_prime)

m4_log_likelihood(t, t_prime,time_differences, 0.01, 0.1, 0.01, 0.1, 0.1)

-5699173.13371635

## Finding the parameters using likelihood optimisation

In [7]:
time_diffs = {}
for st_id in tqdm(train_sorted_start_ids):
    t = t_per_station[st_id]
    t_prime = t_prime_per_station[st_id] # Need to sort t_prime for likelihood function
    time_diffs[st_id] = getTimeDifferences(t, t_prime)

100%|██████████| 789/789 [00:13<00:00, 58.18it/s]


In [16]:
optimal_parameters = {}
for st_id in train_sorted_start_ids:
    print(st_id)
    x0 = [np.log(0.1), np.log(1), np.log(0.2), np.log(2), np.log(0.1)] # np.log(rates_dict[station.start_id.unique()[0]])]

    # TODO: What bounds should we use here?
    t = t_per_station[st_id]
    t_prime = t_prime_per_station[st_id] # Need to sort t_prime for likelihood function
    time_differences = time_diffs[st_id]
    op_m4_log_likelihood = lambda x: m4_log_likelihood(t, t_prime, time_differences, np.exp(x[0]), np.exp(x[0]) + np.exp(x[1]) , np.exp(x[2]), np.exp(x[2] + x[3]), np.exp(x[4]))
    sol = op.minimize(op_m4_log_likelihood, x0, method="Nelder-Mead")
    # m4_log_likelihood(t, t_prime, time_differences, alpha_i, beta_i, alpha_i_prime, beta_i_prime, lambda_i):
    #sol = op.minimize(op_m3_likelihood, x0, method="SLSQP")
    if sol.success:
        transformed_alpha = np.exp(sol.x[0])
        transformed_beta = np.exp(sol.x[0]) + np.exp(sol.x[1])
        transformed_alpha_prime = np.exp(sol.x[2])
        transformed_beta_prime = np.exp(sol.x[2]) + np.exp(sol.x[3])
        transformed_lambda = np.exp(sol.x[4])
        max_params = [transformed_alpha, transformed_beta, transformed_alpha_prime, transformed_beta_prime, transformed_lambda]
        optimal_parameters[st_id] = max_params

    else:
        raise OptimizationError(f"Failed to converge for station {station}.")
optimal_parameters

1


  op_m4_log_likelihood = lambda x: m4_log_likelihood(t, t_prime, time_differences, np.exp(x[0]), np.exp(x[0]) + np.exp(x[1]) , np.exp(x[2]), np.exp(x[2] + x[3]), np.exp(x[4]))
  return term1 + term2
  term1 = lambda_i * t_scalar
  np.max(np.abs(fsim[0] - fsim[1:])) <= fatol):


KeyboardInterrupt: 

## Assessing fit for model 4