In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import scipy.stats as st
from matplotlib.ticker import FuncFormatter
import scipy.optimize as op
plt.style.use("dark_background") # Config plots for dark mode, delete if on light mode
plt.rcParams['figure.dpi'] = 150 # Hi-res plots

In [2]:
station_data = pd.read_csv("../data/santander_locations.csv")
station_data.head() # Load the station data and inspect the first 5 rows
class StationIdError(IndexError):
    """Called when we try and read a non-existing station Id"""
    pass

def get_station_name(in_id):
    """Get station name from bike_data for a given id, catching any exceptions"""
    try:
        return station_data[station_data["Station.Id"] == in_id].StationName.iloc[0]
    except IndexError:
        StationIdError("No station matching input ID")

In [3]:
bike_data = pd.read_csv("../data/processed_df.csv", index_col=0)
bike_data.head() # Load the processed bike data and inspect the first 5 rows

# Find minimum start time
x = bike_data.min()["start_time"]
t_min = (x // 86400) * 86400

# Substract t_min from start_time and end_time
bike_data["start_time"] = (bike_data["start_time"] - t_min) / 60
bike_data["end_time"] = (bike_data["end_time"] - t_min) / 60

# Introduce random perturbations to make pseudo-continuous
bike_data["start_time"] = bike_data["start_time"] + np.random.rand(*bike_data["start_time"].shape)
bike_data["end_time"] = bike_data["end_time"] + np.random.rand(*bike_data["end_time"].shape)

bike_data["duration"] = bike_data.end_time - bike_data.start_time
bike_data = bike_data.sort_values(by=["start_time"])

train_time = 12*7*24*60
train_bike_data = bike_data[bike_data.end_time <= train_time]

train_sorted_stations = []
for st_id in train_bike_data.end_id.sort_values().unique():
    train_sorted_stations.append(train_bike_data[train_bike_data.end_id==st_id])

train_sorted_stations[0].head()

Unnamed: 0,start_id,end_id,start_time,duration,end_time,dist
3657,667,1,503.193057,38.051515,541.244572,8.365682
5041,330,1,548.755107,28.190526,576.945633,4.997948
6023,254,1,586.654092,12.293693,598.947785,0.286054
6570,6,1,613.857809,21.086535,634.944344,2.675239
7252,803,1,646.740128,18.066621,664.806748,2.768124


In [10]:
def trueB(beta, h, t, t_prime):
    # Computes B_i(h) via slow method, is used only for case h = 1

    counter = 0
    for item in t_prime:
        if item > t[h-1]:
            break
        counter += 1
    upper = counter
    return np.sum([np.exp(-1*beta*(t[h-1] - t_prime[k-1])) for k in range(1, upper+1)])

def B(beta, h, t, t_prime):
    # Computes B_i(h) with recursion
    # TODOOOOOOOOOOOOOOOOOOOOOO: WE ARE NOT USING RECURSION, make it more efficient

    if h == 1:
        return trueB(beta, 1, t, t_prime)

    term1 = np.exp(-1*beta*(t[h-1] - t[h-2])) * B(beta, h-1, t, t_prime)
    # Now want number of elements in t_prime smaller than t[h-1] and than t[h-2]

    counter = 0
    # counter represents N'(t[h-1]), the number of arrival times at station i less than or equal to t[h-1]
    for item in t_prime:
        if item > t[h-1]:
            break
        counter += 1

    upper = counter

    counter2 = 0
    for item in t_prime:
        if item > t[h-2]:
            break
        counter2 += 1

    lower = counter2 + 1

    # TODOOOOOOOOOOOOOOOOOOOOO OPTIMISE THE BELOW LINE, NO FOR LOOP
    term2 = np.sum([np.exp(-1*beta*(t[h-1] - t_prime[k-1])) for k in range(lower, upper+1)])


    return term1 + term2


def compensator_better_than_the_carlos_one(t_scalar, t_prime, lambda_i, alpha_i, beta_i):
    """
    t_scalar: scalar value where Lambda_i(t) is to be evaluated
    t_prime: list of arrival times at station i
    """
    counter = 0
    for item in t_prime:
        if item > t_scalar:
            break
        counter += 1

    term1 = lambda_i * t_scalar
    term2 = -(alpha_i / beta_i) * np.sum([np.exp(-beta_i * (t_scalar - t_prime[k-1])) for k in range(1, counter + 1)])

    return term1 + term2


In [18]:
def m3_log_likelihood(t, t_prime, lambda_i, alpha_i, beta_i):
    T = t[-1] # TODO: Is this how we get big T?

    counter = 0 # This holds N_i(T)
    for item in t:
        if item > T:
            break
        counter += 1

    term1 = np.sum([np.log(lambda_i + alpha_i*B(beta_i, j, t, t_prime)) for j in range(1, counter+1)])

    term2 = -1 * compensator_better_than_the_carlos_one(T, t_prime, lambda_i, alpha_i, beta_i)

    return term1 + term2



In [19]:
t = train_sorted_stations[0].start_time.to_numpy()
t_prime = train_sorted_stations[0].end_time.to_numpy()

m3_log_likelihood(t, t_prime, 0.1, 0.2, 0.3)

HERE
ASFDF


-12084.006174572245