In [22]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import scipy.stats as st
from matplotlib.ticker import FuncFormatter
import scipy.optimize as op
from tqdm import tqdm
#from preprocessing import *
plt.style.use("dark_background") # Config plots for dark mode, delete if on light mode
plt.rcParams['figure.dpi'] = 150 # Hi-res plots

In [2]:
station_data = pd.read_csv("../data/santander_locations.csv")


class OptimizationError(RuntimeError):
    """Called when optimizer does not converge."""
    pass

class StationIdError(IndexError):
    """Called when we try and read a non-existing station id."""
    pass


def get_station_name(in_id):
    """Get station name from bike_data for a given id."""
    try:
        return station_data[
            station_data["Station.Id"] == in_id].StationName.iloc[0]
    except IndexError:
        StationIdError("No station matching input ID")


bike_data = pd.read_csv("../data/processed_df.csv", index_col=0)
x = bike_data.min()["start_time"]
t_min = (x // 86400) * 86400
bike_data["start_time"] = (bike_data["start_time"] - t_min) / 60
bike_data["end_time"] = (bike_data["end_time"] - t_min) / 60
bike_data["start_time"] = bike_data["start_time"] \
    + np.random.rand(*bike_data["start_time"].shape)
bike_data["end_time"] = bike_data["end_time"] \
    + np.random.rand(*bike_data["end_time"].shape)
bike_data["duration"] = bike_data.end_time - bike_data.start_time
bike_data = bike_data.sort_values(by=["start_time"])

train_time = 12*7*24*60
train_bike_data = bike_data[bike_data.start_time <= train_time]
test_bike_data = bike_data[bike_data.start_time > train_time]
train_sorted_stations_start = []
for st_id in train_bike_data.start_id.sort_values().unique():
    train_sorted_stations_start.append(
        train_bike_data[train_bike_data.start_id == st_id]
        )
test_sorted_stations = []
for st_id in test_bike_data.start_id.sort_values().unique():
    test_sorted_stations.append(
        test_bike_data[test_bike_data.start_id == st_id]
        )
rates_dict = {}
for station in test_sorted_stations:
    time_elapsed = station.start_time.to_numpy()[-1] \
        - station.start_time.to_numpy()[0]
    n_events = test_sorted_stations[0].size
    rate = n_events / time_elapsed

    rates_dict[station.start_id.unique()[0]] = rate
station_array = list(rates_dict.keys())


def ecdf(data):
    # https://cmdlinetips.com/2019/05/empirical-cumulative-distribution-function-ecdf-in-python/
    """ Compute ECDF """
    x = np.sort(data)
    n = x.size
    y = np.arange(1, n+1) / n
    return(x, y)


tprime_per_station = {}
for id in bike_data.end_id.unique():
    unsorted_station_end_time = bike_data[bike_data.end_id == id]
    sorted_station_end_time = unsorted_station_end_time.sort_values(
        by=["end_time"])
    tprime_per_station[id] = sorted_station_end_time.\
        end_time.to_numpy()
tprime_per_station

t_per_station = {}
for id in bike_data.start_id.unique():
    unsorted_station_start_time = bike_data[bike_data.start_id == id]
    sorted_station_start_time = unsorted_station_start_time.sort_values(
        by=["start_time"])
    t_per_station[id] = sorted_station_start_time.\
        start_time.to_numpy()

sorted_start_ids = np.sort(bike_data.start_id.unique())

In [3]:
def N(t_scalar, t):
    """
    Returns the number of times in t less than or equal to t_scalar.
    Is used to compute N(t_{i,k}) and N'(t_{i,k}) depending on whether t above is t or t_prime
    """

    return np.searchsorted(t, t_scalar, side="right")

def getTimeDifferences(t, t_prime):
    """
    Input: (sorted) times for a particular station i
    Output: List of differences indexed by [h][k] for this station i
    """

    # h goes until N(t[-1], t) assuming T = t[-1]
    T = t[-1]
    D_result = []
    for h in range(1, N(T, t)+1):
        differences_list = []
        # Construct list of t_ih - t'_ik for k = 1 to N'(T)
        differences_list.append(t[h-1] - t_prime[N(t[h-2], t_prime):N(t[h-1], t_prime)])

        D_result.append(np.array(differences_list))

    return D_result

getTimeDifferences(np.array([1,2,3]),np.array([1.1,2.2,3.3]))

[array([], shape=(1, 0), dtype=float64), array([[0.9]]), array([[0.8]])]

In [4]:
def compensator_m4_t_values(t_scalar, t, t_prime):
    """
    Computes the lists that are required for the compensator function outside of it (without the parameters)
    """
    return t_scalar - t_prime[:N(t_scalar, t_prime)], t_scalar - t[:N(t_scalar, t)]

In [5]:
def compensator_m4(t_scalar, t, t_prime, lambda_i, alpha_i, beta_i, alpha_i_prime, beta_i_prime):
    """
    t_scalar: scalar value where Lambda_i(t) is to be evaluated
    t_prime: list of arrival times at station i

    NOTE: t_prime NEEDS TO BE SORTED HERE.
    """

    term1 = lambda_i * t_scalar
    term2 = -(alpha_i_prime / beta_i_prime) * np.sum(np.exp(-beta_i_prime * (t_scalar - t_prime[:N(t_scalar, t_prime)]))-1)
    term3 = -(alpha_i / beta_i) * np.sum(np.exp(-beta_i * (t_scalar - t[:N(t_scalar, t)]))-1)
    return term1 + term2 + term3


In [9]:
def new_compensator_m4(t_scalar, t_precomputed, t_prime_precomputed, lambda_i, alpha_i, beta_i, alpha_i_prime, beta_i_prime):
    """
    Evaluates compensator using already pre-computed list terms so we don't compute it every time.
    """

    term1 = lambda_i * t_scalar
    term2 = -(alpha_i_prime / beta_i_prime) * np.sum(np.exp(-beta_i_prime * (t_prime_precomputed))-1)
    term3 = -(alpha_i / beta_i) * np.sum(np.exp(-beta_i * (t_precomputed))-1)
    return term1 + term2 + term3

def new_m4_log_likelihood(t, t_prime, t_precomputed, t_prime_precomputed,
 alpha_i, beta_i, alpha_i_prime, beta_i_prime, lambda_i, time_differences):
    """
    Gives log likelihood of our five parameters. 
    t: start times from station i
    t_prime: end times at station i

    NOTE: t_prime NEEDS TO BE SORTED HERE
    """
    
    T = t[-1] # TODO: Is this how we get big T?

    # Get A list
    A_ = new_A(len(t), t, beta_i)

    # Get B list 
    B_ = new_B(len(t), t, t_prime, beta_i_prime, time_differences)

    term1 = np.sum(np.log(lambda_i + alpha_i_prime*B_[:len(t)+1] + alpha_i*A_[:len(t)+1]))

    term2 = -new_compensator_m4(T, t_precomputed, t_prime_precomputed, lambda_i, alpha_i, beta_i, alpha_i_prime, beta_i_prime)

    return term1 + term2

In [7]:
def new_B(h, t, t_prime, beta, time_differences):

    """
    Returns a list of [B_i(1), ..., B_i(h)]

    NOTE: t_prime NEEDS to be sorted here

    time_differences: time differences double list for station i

    Note all index variables such as h, k, etc start at 1, like the mathematical notation.
    """
    B = [np.sum(np.exp(-beta*(t[0] - t_prime[:N(t[0], t_prime)])))]

    # Append the rest
    for l in range(2, h+1):
        # First term in recursive formula for B_i(h)
        term1 = np.exp(-beta*(t[l-1] - t[l-2])) * B[l-2]
        term2 = np.sum(np.exp(-beta*(time_differences[l-1])))
        B.append(term1 + term2)
    return np.array(B)


def new_A(h, t, beta):

    A = []
    for i in range(1, h+1):
        if i==1:
            A.append(0)
        else:
            A.append(np.exp(-1*beta*(t[i-1] - t[i-2]))*(1+A[i-2]))
    return np.array(A)


def m4_log_likelihood(t, t_prime, alpha_i, beta_i, alpha_i_prime, beta_i_prime, lambda_i, time_differences):
    """
    Gives log likelihood of our five parameters. 
    t: start times from station i
    t_prime: end times at station i

    NOTE: t_prime NEEDS TO BE SORTED HERE
    """
    
    T = t[-1] # TODO: Is this how we get big T?

    # Get A list
    A_ = new_A(len(t), t, beta_i)

    # Get B list 
    B_ = new_B(len(t), t, t_prime, beta_i_prime, time_differences)

    term1 = np.sum(np.log(lambda_i + alpha_i_prime*B_[:len(t)+1] + alpha_i*A_[:len(t)+1]))

    term2 = -compensator_m4(T, t, t_prime, lambda_i, alpha_i, beta_i, alpha_i_prime, beta_i_prime)

    return term1 + term2


## CARLOS: To use the pre-computed times, do the following to evaluate the log-likelihood inside the for loop for each station

In [None]:
t = t_per_station[st_id]
t_prime = tprime_per_station[st_id] # Need to sort t_prime for likelihood function
time_differences = time_diffs[st_id]

# Compute precomputed lists
t_precomputed, t_prime_precomputed = compensator_m4_t_values(end_T, t, t_prime)

# Use likelihood with pre computed lists
op_m3_likelihood = lambda x: -new_m4_log_likelihood(t, t_prime, t_precomputed, t_prime_precomputed, np.exp(x[0]), np.exp(x[0]) + np.exp(x[1]), np.exp(x[2]), time_differences)

In [13]:
# Test m4 likelihood function
beta = 0.01

t = t_per_station[1]
t_prime = tprime_per_station[1]

time_differences = getTimeDifferences(t, t_prime)
t_precomputed, t_prime_precomputed = compensator_m4_t_values(t[-1], t, t_prime)

print(m4_log_likelihood(t, t_prime, 0.01, 0.1, 0.01, 0.1, 0.1, time_differences))
new_m4_log_likelihood(t, t_prime, t_precomputed, t_prime_precomputed, 0.01, 0.1, 0.01, 0.1, 0.1, time_differences)


-24355.750391336445


-24355.750391336445

## Finding the parameters using likelihood optimisation

In [14]:
time_diffs = {}
for st_id in sorted_start_ids:
    t = t_per_station[st_id]
    t_prime = tprime_per_station[st_id] # Need to sort t_prime for likelihood function
    time_diffs[st_id] = getTimeDifferences(t, t_prime)

In [16]:
model_2_params = pd.read_csv("../data/N_M_params.csv", index_col=0)
model_3_params = pd.read_csv("../data/M3_train_params.csv", index_col=0)

In [29]:
optimal_parameters = {}
for st_id in sorted_start_ids[:2]:
    
    print(st_id)
    x0 = -np.ones(5) * 3

    # TODO: What bounds should we use here?
    t = t_per_station[st_id]
    t_prime = tprime_per_station[st_id] # Need to sort t_prime for likelihood function
    time_differences = time_diffs[st_id]
    t_precomputed, t_prime_precomputed = compensator_m4_t_values(t[-1], t, t_prime)

    op_m4_log_likelihood = lambda x: -new_m4_log_likelihood(t, t_prime, t_precomputed, t_prime_precomputed,
        np.exp(x[0]), np.exp(x[0]) + np.exp(x[1]), np.exp(x[2]), np.exp(x[2] + x[3]), np.exp(x[4]), time_differences)
    
    #op_m4_log_likelihood = lambda x: -m4_log_likelihood(t, t_prime, np.exp(x[0]),
    #     np.exp(x[0]) + np.exp(x[1]), np.exp(x[2]), np.exp(x[2] + x[3]), np.exp(x[4]), time_differences)

    sol = op.minimize(op_m4_log_likelihood, x0, method="Nelder-Mead")

    if sol.success:
        transformed_alpha = np.exp(sol.x[0])
        transformed_beta = np.exp(sol.x[0]) + np.exp(sol.x[1])
        transformed_alpha_prime = np.exp(sol.x[2])
        transformed_beta_prime = np.exp(sol.x[2]) + np.exp(sol.x[3])
        transformed_lambda = np.exp(sol.x[4])
        max_params = [transformed_alpha, transformed_beta, transformed_alpha_prime, transformed_beta_prime, transformed_lambda]
        optimal_parameters[st_id] = max_params

    else:
        raise OptimizationError(f"Failed to converge for station {st_id}.")
optimal_parameters

1
2


{1: [0.015159431533715118,
  0.016239850865439456,
  0.023225018835280133,
  15.686985160762877,
  0.002914192741909464],
 2: [0.16727100089829353,
  1.0848695552106118,
  0.013800627906146298,
  1.2454693176791114,
  0.0013461239097086856]}

In [30]:
optimal_parameters = {}
for st_id in sorted_start_ids[:2]:
    
    print(st_id)
    x0 = np.exp(-np.ones(5) * 3)

    # TODO: What bounds should we use here?
    t = t_per_station[st_id]
    t_prime = tprime_per_station[st_id] # Need to sort t_prime for likelihood function
    time_differences = time_diffs[st_id]
    t_precomputed, t_prime_precomputed = compensator_m4_t_values(t[-1], t, t_prime)

    op_m4_log_likelihood = lambda x: -new_m4_log_likelihood(t, t_prime, 
        t_precomputed, t_prime_precomputed, x[0], x[1], x[2], x[3], x[4], time_differences)
    
    #op_m4_log_likelihood = lambda x: -m4_log_likelihood(t, t_prime, np.exp(x[0]),
    #     np.exp(x[0]) + np.exp(x[1]), np.exp(x[2]), np.exp(x[2] + x[3]), np.exp(x[4]), time_differences)

    sol = op.minimize(op_m4_log_likelihood, x0, method="Nelder-Mead")

    if sol.success:
        optimal_parameters[st_id] = x0

    else:
        raise OptimizationError(f"Failed to converge for station {st_id}.")
optimal_parameters

1


  A.append(np.exp(-1*beta*(t[i-1] - t[i-2]))*(1+A[i-2]))
  A.append(np.exp(-1*beta*(t[i-1] - t[i-2]))*(1+A[i-2]))
  term1 = np.exp(-beta*(t[l-1] - t[l-2])) * B[l-2]
  term1 = np.exp(-beta*(t[l-1] - t[l-2])) * B[l-2]
  term2 = np.sum(np.exp(-beta*(time_differences[l-1])))
  term1 = np.sum(np.log(lambda_i + alpha_i_prime*B_[:len(t)+1] + alpha_i*A_[:len(t)+1]))
  term2 = -(alpha_i_prime / beta_i_prime) * np.sum(np.exp(-beta_i_prime * (t_prime_precomputed))-1)
  term3 = -(alpha_i / beta_i) * np.sum(np.exp(-beta_i * (t_precomputed))-1)


KeyboardInterrupt: 

## Assessing fit for model 3