# Hawkes fit to start times
Load the imports:

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import scipy.stats as st
from matplotlib.ticker import FuncFormatter
import scipy.optimize as op
# Use Matplotlib's dark-background style; delete this line when working in a light theme
plt.style.use("dark_background")
# Render figures at 150 dpi for higher-resolution plots
plt.rcParams['figure.dpi'] = 150

Load the station data:

In [2]:
# Load the station data from CSV into a DataFrame
station_data = pd.read_csv("../data/santander_locations.csv")
# Inspect the first 5 rows
station_data.head()
class StationIdError(IndexError):
    """Raised when a lookup is attempted for a station id that does not exist."""

def get_station_name(in_id):
    """Return the name of the station whose ``Station.Id`` equals ``in_id``.

    The id is looked up in the module-level ``station_data`` frame and the
    ``StationName`` of the first matching row is returned.

    Args:
        in_id: Station identifier to look up.

    Returns:
        The ``StationName`` value of the first row matching ``in_id``.

    Raises:
        StationIdError: If no row of ``station_data`` has this id.
    """
    matches = station_data.loc[station_data["Station.Id"] == in_id, "StationName"]
    try:
        return matches.iloc[0]
    except IndexError:
        # Actually raise (not merely construct) the error, so that an unknown
        # id does not silently return None. StationIdError subclasses
        # IndexError, so callers catching IndexError keep working.
        raise StationIdError("No station matching input ID") from None

Preprocess the start times to make them pseudo-continuous:

In [20]:
# Load the processed trips; the first CSV column becomes the index
bike_data = pd.read_csv("../data/processed_df.csv", index_col=0)
bike_data.head()

# Largest multiple of 86400 that does not exceed the earliest start time
x = bike_data.min()["start_time"]
t_min = 86400 * (x // 86400)

# Subtract t_min from start_time and end_time, then divide by 60
# (presumably seconds -> minutes, given the 86400 above; confirm against the data source)
bike_data["start_time"] = bike_data["start_time"].sub(t_min).div(60)
bike_data["end_time"] = bike_data["end_time"].sub(t_min).div(60)

# Add independent uniform [0, 1) jitter to each timestamp to make them pseudo-continuous
bike_data["start_time"] = bike_data["start_time"] + np.random.rand(len(bike_data))
bike_data["end_time"] = bike_data["end_time"] + np.random.rand(len(bike_data))

# Recompute the duration from the jittered times and order trips by start time
bike_data["duration"] = bike_data["end_time"] - bike_data["start_time"]
bike_data = bike_data.sort_values("start_time")

# Training window: 12 * 7 * 24 * 60 is the number of minutes in 12 weeks
train_time = 12 * 7 * 24 * 60
train_bike_data = bike_data.loc[bike_data["start_time"] <= train_time]
train_bike_data.head()

# One DataFrame of training trips per start station, in increasing station-id order
train_sorted_stations = [
    train_bike_data[train_bike_data["start_id"] == st_id]
    for st_id in train_bike_data["start_id"].sort_values().unique()
]

# Start times of each station as a NumPy array (rows are already sorted by start_time)
sorted_start_times = [
    station["start_time"].to_numpy() for station in train_sorted_stations
]

train_sorted_stations[0].head()

Unnamed: 0,start_id,end_id,start_time,duration,end_time,dist
1512,1,71,425.422115,7.901661,433.323776,1.718483
3498,1,3,498.174873,20.456061,518.630934,1.964612
5961,1,330,584.8165,28.111503,612.928003,4.997948
5977,1,433,585.173663,3.185362,588.359025,0.673108
6447,1,803,607.917764,14.576395,622.494159,2.768124


We will use the following exponential excitation kernel for each station $i$:
$$
\mu_i(t) = \alpha_i e^{-\beta_i t},\, 0 \leq \alpha_i \leq \beta_i
$$

In [32]:
def A(i, beta, t):
    """Return the decay recursion term A(i) used in the Hawkes log-likelihood.

    Defined by A(0) = 0 and
    A(i) = exp(-beta * (t[i] - t[i-1])) * (1 + A(i-1)),
    which unrolls to sum_{j < i} exp(-beta * (t[i] - t[j])).

    Args:
        i: Index of the event whose term is wanted.
        beta: Decay rate of the exponential kernel.
        t: Indexable sequence of event times.

    Returns:
        The value of A(i); 0 when ``i`` is 0 (no earlier events).
    """
    # Build the term up iteratively: a recursive formulation needs one stack
    # frame per event and hits RecursionError for long event sequences.
    decay_sum = 0.0
    for k in range(1, i + 1):
        decay_sum = np.exp(-beta * (t[k] - t[k - 1])) * (1.0 + decay_sum)
    return decay_sum

def hawkes_likelihood(t, alpha, beta, lambda_p):
    """Log-likelihood of a univariate Hawkes process with an exponential kernel.

    The conditional intensity is
    lambda_p + sum_{t_j < s} alpha * exp(-beta * (s - t_j)),
    and the observation window ends at the last event, ``t[-1]``.
    The value returned is

        sum_i log(lambda_p + alpha * A(i))
        + (alpha / beta) * sum_i (exp(-beta * (t[-1] - t[i])) - 1)
        - lambda_p * t[-1]

    with A(0) = 0 and A(i) = exp(-beta * (t[i] - t[i-1])) * (1 + A(i-1)).

    Args:
        t: Event times in increasing order (sequence or 1-D array).
        alpha: Jump size of the excitation kernel.
        beta: Decay rate of the excitation kernel.
        lambda_p: Constant background rate.

    Returns:
        The log-likelihood; 0.0 if ``t`` is empty.
    """
    n_events = len(t)
    if n_events == 0:
        return 0.0

    t_end = t[-1]
    log_lik = 0.0
    # Running value of A(i). Updating it in place keeps the whole pass O(n)
    # instead of recomputing the recursion from scratch for every event.
    decay_sum = 0.0

    for i in range(n_events):
        if i > 0:
            decay_sum = np.exp(-beta * (t[i] - t[i - 1])) * (1.0 + decay_sum)

        intensity = lambda_p + alpha * decay_sum
        if intensity < 0:
            # Diagnostic only: the log below is undefined for this parameter set.
            print("Stop")

        # The "- 1" belongs inside the (alpha / beta) factor: it comes from
        # integrating the kernel over the window, giving
        # (alpha / beta) * (exp(-beta * (T - t_i)) - 1) per event.
        compensator = (alpha / beta) * (np.exp(-beta * (t_end - t[i])) - 1)
        log_lik += np.log(intensity) + compensator

    log_lik -= lambda_p * t_end
    return log_lik

# Sanity check: evaluate the log-likelihood on the first station's start times
# with alpha = beta = lambda_p = 1
hawkes_likelihood(sorted_start_times[0],1,1,1)


-122438.30499668307

In [None]:
# log(alpha) = log(beta + alpha)

In [33]:
# Fit (alpha, beta, lambda_p) for the first station by minimising the negative log-likelihood
zeroth_station = sorted_start_times[0]


def op_hawkes_likelihood(x):
    """Objective for the optimiser: negative log-likelihood at x = (alpha, beta, lambda_p)."""
    alpha, beta, lambda_p = x[0], x[1], x[2]
    return -hawkes_likelihood(zeroth_station, alpha, beta, lambda_p)


# NOTE(review): the search below is unconstrained, so the optimiser may try
# non-positive parameters; consider passing bounds - confirm intended behaviour.
x0 = [2, 2, 2]  # initial guess for (alpha, beta, lambda_p)
zeroth_sol = op.minimize(op_hawkes_likelihood, x0)
min_params_zeroth = zeroth_sol.x
min_params_zeroth