In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import scipy.stats as st
from matplotlib.ticker import FuncFormatter
import scipy.optimize as op
plt.style.use("dark_background") # Config plots for dark mode, delete if on light mode
plt.rcParams['figure.dpi'] = 150 # Hi-res plots

In [2]:
station_data = pd.read_csv("../data/santander_locations.csv")
station_data.head() # Load the station data and inspect the first 5 rows
class StationIdError(IndexError):
    """Called when we try and read a non-existing station Id"""
    pass

def get_station_name(in_id):
    """Get station name from bike_data for a given id, catching any exceptions"""
    try:
        return station_data[station_data["Station.Id"] == in_id].StationName.iloc[0]
    except IndexError:
        StationIdError("No station matching input ID")

In [3]:
bike_data = pd.read_csv("../data/processed_df.csv", index_col=0)
bike_data.head() # Load the processed bike data and inspect the first 5 rows

# Find minimum start time
x = bike_data.min()["start_time"]
t_min = (x // 86400) * 86400

# Substract t_min from start_time and end_time
bike_data["start_time"] = (bike_data["start_time"] - t_min) / 60
bike_data["end_time"] = (bike_data["end_time"] - t_min) / 60

# Introduce random perturbations to make pseudo-continuous
bike_data["start_time"] = bike_data["start_time"] + np.random.rand(*bike_data["start_time"].shape)
bike_data["end_time"] = bike_data["end_time"] + np.random.rand(*bike_data["end_time"].shape)

bike_data["duration"] = bike_data.end_time - bike_data.start_time
bike_data = bike_data.sort_values(by=["start_time"])

train_time = 12*7*24*60
train_bike_data = bike_data[bike_data.end_time <= train_time]

train_sorted_stations = []
for st_id in train_bike_data.end_id.sort_values().unique():
    train_sorted_stations.append(train_bike_data[train_bike_data.end_id==st_id])

train_sorted_stations[0].head()

Unnamed: 0,start_id,end_id,start_time,duration,end_time,dist
3657,667,1,503.199289,38.031391,541.23068,8.365682
5041,330,1,548.286623,27.911857,576.19848,4.997948
6023,254,1,586.809052,12.018805,598.827857,0.286054
6570,6,1,613.882481,20.85848,634.740961,2.675239
7252,803,1,646.677042,17.482981,664.160023,2.768124


In [52]:
def N(t_scalar, t):
    """
    Returns the number of times in t less than or equal to t_scalar.
    Is used to compute N(t_i,k) and N'(t_i,k) etc
    """
    counter = 0
    
    for item in t:
        if item > t_scalar:
            break
        counter += 1

    return counter


def B(h, t, t_prime, beta):

    """
    Returns a list of [B_i(1), ..., B_i(h)]

    Note all index variables such as h, k, etc start at 1, like the mathematical notation.
    """
    B = []

    # Append base case B_i(1)
    B.append(np.sum([np.exp(-1*beta*(t[0] - t_prime[k-1])) for k in range(1, N(t[0], t_prime) + 1)]))

    # Append the rest
    for l in range(2, h+1):

        # First term in recursive formula for B_i(h)
        term1 = np.exp(-1*beta*(t[l-1] - t[l-2])) * B[l-2]

        # Second term
        term2 = np.sum([np.exp(-1*beta*(t[l-1] - t_prime[k-1])) for k in range(N(t[l-2], t_prime) + 1, N(t[l-1], t_prime) + 1)])

        B.append(term1 + term2)

    return B


def compensator_better_than_the_carlos_one(t_scalar, t_prime, lambda_i, alpha_i, beta_i):
    """
    t_scalar: scalar value where Lambda_i(t) is to be evaluated
    t_prime: list of arrival times at station i
    """

    term1 = lambda_i * t_scalar
    term2 = -(alpha_i / beta_i) * np.sum([np.exp(-beta_i * (t_scalar - t_prime[k-1])) - 1 for k in range(1, N(t_scalar, t_prime) + 1)])

    return term1 + term2


In [59]:
def m3_log_likelihood(t, t_prime, lambda_i, alpha_i, beta_i):

    """
    Gives log likelihood of our three parameters. 
    t: start times from station i
    t_prime: end times at station i
    """
    
    T = t[-1] # TODO: Is this how we get big T?

    # Get B list 
    B_ = B(len(t), t, t_prime, beta_i)

    term1 = np.sum([np.log(lambda_i + alpha_i*B_[j-1]) for j in range(1, len(t) + 1)])

    term2 = -1 * compensator_better_than_the_carlos_one(T, t_prime, lambda_i, alpha_i, beta_i)

    return term1 + term2



In [61]:
t = train_sorted_stations[0].start_time.to_numpy()
t_prime = train_sorted_stations[0].end_time.to_numpy()

m3_log_likelihood(t, t_prime, 1, 2, 3)

-120848.51942761737