In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import scipy.stats as st
from matplotlib.ticker import FuncFormatter
import scipy.optimize as op
# Plot styling: use matplotlib's dark theme so figures match a dark-mode notebook UI.
# Delete the next line when working in a light theme.
plt.style.use("dark_background")
# Render figures at 150 dpi for higher-resolution plots.
plt.rcParams['figure.dpi'] = 150

In [3]:
# Station lookup table, read from a path relative to the notebook's working directory.
# get_station_name() reads its "Station.Id" and "StationName" columns.
station_data = pd.read_csv("../data/santander_locations.csv")
station_data.head() # Load the station data and inspect the first 5 rows
class StationIdError(IndexError):
    """Raised when a station is looked up by an Id that does not exist.

    Subclasses IndexError, so handlers written for IndexError also catch it.
    """

def get_station_name(in_id):
    """Return the name of the station whose ``Station.Id`` equals ``in_id``.

    The lookup is done against the module-level ``station_data`` frame; the
    ``StationName`` of the first matching row is returned.

    Parameters
    ----------
    in_id
        Station identifier, compared with ``==`` against the ``Station.Id``
        column of ``station_data``.

    Returns
    -------
    The ``StationName`` value of the first row matching ``in_id``.

    Raises
    ------
    StationIdError
        If no row of ``station_data`` has the requested id.
    """
    matching_names = station_data[station_data["Station.Id"] == in_id].StationName
    try:
        return matching_names.iloc[0]
    except IndexError as err:
        # An empty selection means the id is unknown. The exception must be
        # *raised*: merely constructing it would make the function fall
        # through and silently return None.
        raise StationIdError("No station matching input ID") from err

In [5]:
bike_data = pd.read_csv("../data/processed_df.csv", index_col=0)
bike_data.head() # Load the processed bike data and inspect the first 5 rows

# Find minimum start time.
# Reduce the start_time column only: bike_data.min() would compute the minimum
# of every column just to read one value.
x = bike_data["start_time"].min()
# Round down to a multiple of 86400 (= 24*60*60), i.e. to the start of the
# first day if the raw timestamps are in seconds -- TODO confirm the unit.
t_min = (x // 86400) * 86400

# Subtract t_min from start_time and end_time, then divide by 60
# (seconds -> minutes under the assumption above).
bike_data["start_time"] = (bike_data["start_time"] - t_min) / 60
bike_data["end_time"] = (bike_data["end_time"] - t_min) / 60

# Introduce random perturbations in [0, 1) to make the times pseudo-continuous.
# A dedicated, seeded generator makes "Restart Kernel -> Run All" reproduce the
# same jitter (and therefore the same downstream numbers) on every run.
JITTER_SEED = 0
jitter_rng = np.random.default_rng(JITTER_SEED)
bike_data["start_time"] = bike_data["start_time"] + jitter_rng.random(len(bike_data))
bike_data["end_time"] = bike_data["end_time"] + jitter_rng.random(len(bike_data))

# NOTE(review): start and end are jittered independently, so the recomputed
# duration can differ from the unperturbed one by up to +/-1 and could become
# negative for very short trips -- confirm this is acceptable.
bike_data["duration"] = bike_data.end_time - bike_data.start_time
bike_data = bike_data.sort_values(by=["start_time"])

# Training window: 12 weeks * 7 days * 24 hours * 60 minutes, in the same
# units as the re-based times above.
train_time = 12*7*24*60
train_bike_data = bike_data[bike_data.end_time <= train_time]

# One DataFrame per destination station, ordered by ascending end_id.
# groupby yields the groups in sorted key order and keeps the row order
# (sorted by start_time) inside each group, in a single pass over the data
# instead of one full boolean scan per station.
train_sorted_stations = []
for st_id, station_trips in train_bike_data.groupby("end_id"):
    train_sorted_stations.append(station_trips)

train_sorted_stations[0].head()

Unnamed: 0,start_id,end_id,start_time,duration,end_time,dist
3657,667,1,503.179481,38.753114,541.932595,8.365682
5041,330,1,548.914156,27.368573,576.282729,4.997948
6023,254,1,586.823051,11.721097,598.544148,0.286054
6570,6,1,613.184308,21.030599,634.214907,2.675239
7252,803,1,646.771823,17.843453,664.615276,2.768124


In [8]:
def genA(i, beta, t):
    """Return the recursive term A(i) of an exponential-kernel Hawkes process.

    A is defined by A(0) = 0 and
    A(i) = exp(-beta * (t[i] - t[i-1])) * (1 + A(i-1)),
    which equals the sum over j < i of exp(-beta * (t[i] - t[j])).

    The recurrence is evaluated bottom-up with a loop. A recursive
    implementation needs one stack frame per event and hits Python's
    recursion limit for sequences of roughly a thousand events or more.

    Parameters
    ----------
    i : int
        Non-negative index of the event for which A is wanted.
    beta : float
        Decay rate of the exponential kernel.
    t : sequence of float
        Event times, indexable by integer position; entries 0..i are read.

    Returns
    -------
    0 when ``i == 0``, otherwise the value of A(i).

    Raises
    ------
    ValueError
        If ``i`` is negative.
    """
    if i < 0:
        raise ValueError("i must be a non-negative index into t")

    acc = 0
    # Same operations, in the same order, as unwinding the recursion from A(1) up to A(i).
    for k in range(1, i + 1):
        acc = np.exp(-1*beta*(t[k] - t[k-1]))*(1+acc)
    return acc

def hawkes_log_likelihood(t, alpha, beta, lambda_p):
    """Log-likelihood of a Hawkes process with an exponential kernel.

    The conditional intensity is
    lambda(s) = lambda_p + alpha * sum_{t_j < s} exp(-beta * (s - t_j)).
    The last event time ``t[-1]`` is used as the end of the observation
    window, which is taken to start at 0. The value returned is

        sum_i log(lambda_p + alpha * A[i])
        + (alpha / beta) * sum_i (exp(-beta * (t[-1] - t[i])) - 1)
        - lambda_p * t[-1]

    with A[0] = 0 and A[i] = exp(-beta * (t[i] - t[i-1])) * (1 + A[i-1]).

    Parameters
    ----------
    t : array-like of float
        Event times. Any one-dimensional array-like is accepted (list, NumPy
        array, pandas Series regardless of its index). The recursion uses
        consecutive differences, so the times must be in ascending order.
    alpha : float
        Jump size of the excitation kernel.
    beta : float
        Decay rate of the excitation kernel (must be non-zero).
    lambda_p : float
        Constant background intensity.

    Returns
    -------
    numpy.float64
        The log-likelihood of the event times under the given parameters.

    Raises
    ------
    ValueError
        If ``t`` is empty or not one-dimensional.
    """
    # Work on a positional float array so Series with arbitrary index labels
    # and plain lists behave the same way.
    t = np.asarray(t, dtype=float)
    if t.ndim != 1 or t.size == 0:
        raise ValueError("t must be a non-empty one-dimensional sequence of event times")

    n_events = t.size
    t_end = t[-1]

    # A[i] depends on A[i-1], so this recursion stays sequential; the
    # exponential decay factors are computed once, vectorised.
    decay = np.exp(-beta * np.diff(t))
    A = np.zeros(n_events)
    for i in range(1, n_events):
        A[i] = decay[i - 1] * (1 + A[i - 1])

    # Sum of the log-intensities evaluated at the event times.
    log_intensity_sum = np.sum(np.log(lambda_p + alpha * A))
    # Integral of the excitation part of the intensity over [0, t_end];
    # expm1 evaluates exp(x) - 1 without cancellation error for small x.
    excitation_term = (alpha / beta) * np.sum(np.expm1(-beta * (t_end - t)))
    # Integral of the constant background intensity over [0, t_end].
    background_term = lambda_p * t_end

    return log_intensity_sum + excitation_term - background_term

# Sanity check: evaluate the log-likelihood on the first entry of
# train_sorted_stations (lowest end_id) with alpha = beta = lambda_p = 1.
# NOTE(review): the per-station frames are grouped by end_id, yet start_time is
# passed as the event times here -- confirm this pairing is intended.
hawkes_log_likelihood(train_sorted_stations[0].start_time.to_numpy(),1,1,1)


-122143.1915210818