In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Number of intraday observations per trading day; the per-day matrices below are sized from it.
T = 13

# Read the normalized intraday bars and derive the log of the normalized turnover.
df = pd.read_csv("data/TAQ_30Min_AAPL_2023_normalized.csv")
df["Log_Turnover"] = np.log(df["Normalized_TURNOVER"])
# Index the rows by their timestamp column (the column itself is kept in the frame).
df.index = df["datetime"]
df.info()

In [None]:
class Params:
    """Parameter bundle for the linear-Gaussian state-space model used in this notebook.

    Attributes
    ----------
    pi, Sigma
        Mean and covariance of the state prior, x ~ N(pi, Sigma).
    a_eta, a_mu
        Coefficients on the diagonal of the state-transition matrix returned by ``A()``.
    sigma_eta_sq, sigma_mu_sq
        Variances on the diagonal of the state-noise covariance returned by ``Q()``,
        i.e. w ~ N(0, Q).
    r
        Variance of the observation noise, v ~ N(0, r).
    phi
        Seasonality vector in R^T, where T is the number of intraday observations in a day.
    """

    def __init__(self, pi, Sigma, a_eta, a_mu, sigma_eta_sq, sigma_mu_sq, r, phi):
        # prior on the state
        self.pi, self.Sigma = pi, Sigma
        # state-transition coefficients
        self.a_eta, self.a_mu = a_eta, a_mu
        # state-noise variances
        self.sigma_eta_sq, self.sigma_mu_sq = sigma_eta_sq, sigma_mu_sq
        # observation-noise variance
        self.r = r
        # intraday seasonality
        self.phi = phi

    @staticmethod
    def _stacked_diagonal(upper, lower):
        """Return the 2T x 2T matrix [[upper * I_T, 0], [0, lower * I_T]] (T is the module constant)."""
        out = np.zeros((2 * T, 2 * T))
        out[:T, :T] = np.eye(T) * upper
        out[T:, T:] = np.eye(T) * lower
        return out

    def A(self):
        """State-transition matrix: a_eta on the first T diagonal entries, a_mu on the last T."""
        return self._stacked_diagonal(self.a_eta, self.a_mu)

    def Q(self):
        """State-noise covariance: sigma_eta_sq on the first T diagonal entries, sigma_mu_sq on the last T."""
        return self._stacked_diagonal(self.sigma_eta_sq, self.sigma_mu_sq)

In [None]:
# Observation matrix: two T x T identity blocks side by side (shape T x 2T), so that
# C @ x adds the first T entries of the state to the last T entries.
C = np.hstack([np.eye(T)] * 2)
C.shape

In [None]:
# Hand-picked parameter values used to smoke-test the filter and smoother below.
theta = Params(
    pi=np.zeros(2),
    Sigma=0.5 * np.identity(2),
    a_eta=1.0,
    a_mu=1.0,
    sigma_eta_sq=0.0025,
    sigma_mu_sq=0.0025,
    r=0.0005,
    phi=np.array([0.6, 0.25, 0.0, -0.15, -0.3, -0.45, -0.5,
                  -0.6, -0.5, -0.25, -0.3, -0.1, 0.4]),
)

## Expectation maximization
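In this step we want to estimate the hidden state vector $x_\tau = [\eta_\tau\ \mu_\tau]^\top \in \mathbb{R}^2$. The variables $\eta_\tau$ and $\mu_\tau$ are, respectively, the daily-average part and the intraday dynamic part of the log volume. In the code below, the states of the $T$ intraday bins of one day are stacked into a single vector of length $2T$ (all $\eta$ entries first, then all $\mu$ entries).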

In [None]:
def kalman_filtering(x_hat_tau, y_tau_plus, Sigma_tau_tau, params):
    """Perform one predict/correct step of the Kalman filter.

    Parameters
    ----------
    x_hat_tau : array
        Current filtered state mean.
    y_tau_plus : array-like
        Next observation vector (one entry per row of the module-level matrix ``C``).
    Sigma_tau_tau : array
        Current filtered state covariance.
    params : object
        Must provide ``A()`` and ``Q()`` (transition matrix and state-noise covariance),
        ``r`` (observation-noise variance) and ``phi`` (seasonality subtracted from the
        observation).

    Returns
    -------
    (x_hat_next, Sigma_next)
        Filtered state mean and covariance after assimilating ``y_tau_plus``.

    Notes
    -----
    ``params.r`` may be a scalar (independent noise with equal variance on every
    observation), a 1-D array of per-observation variances, or a full covariance matrix.
    """
    A = params.A()
    x_hat_tau_plus = A @ x_hat_tau  # predicted mean
    Sigma_tau_plus = A @ Sigma_tau_tau @ A.T + params.Q()  # predicted covariance

    # Observation-noise covariance. A bare scalar must be placed on the diagonal only:
    # adding it directly to the matrix would broadcast it onto every entry, i.e. model
    # perfectly correlated noise across the observations.
    R = np.asarray(params.r, dtype=float)
    if R.ndim == 0:
        R = R * np.eye(C.shape[0])
    elif R.ndim == 1:
        R = np.diag(R)

    # Kalman gain K = Sigma C^T S^{-1}, obtained from a linear solve rather than an
    # explicit inverse (K^T = S^{-T} C Sigma^T).
    S = C @ Sigma_tau_plus @ C.T + R
    K_tau_plus = np.linalg.solve(S.T, C @ Sigma_tau_plus.T).T

    # correct the conditional mean and covariance with the de-seasonalized innovation
    innovation = np.asarray(y_tau_plus, dtype=float) - params.phi - C @ x_hat_tau_plus
    x_hat_next = x_hat_tau_plus + K_tau_plus @ innovation
    Sigma_next = Sigma_tau_plus - K_tau_plus @ C @ Sigma_tau_plus
    return x_hat_next, Sigma_next

In [None]:
# Dimensional smoke test: run a single filter step on the first T rows and print the shapes.
y_1 = df["Log_Turnover"].head(T)
# starting state: half of each observation in the first T entries, half in the last T
x_1 = np.array([y_1 / 2, y_1 / 2]).reshape(2 * T)
Sigma_1 = np.eye(2 * T)
x_plus, Sigma_plus = kalman_filtering(x_1, y_1, Sigma_1, theta)
print("Shape should be {}: x.shape = {}".format(2*T, x_plus.shape))
print("Shape should be {} x {}: Sigma.shape = {}".format(2*T, 2*T, Sigma_plus.shape))

In [None]:
def kalman_smoothing(x_t, ys, Sigma_t, params):
    """Filter every row of ``ys`` forward, then smooth backward to the first filtered step.

    Parameters
    ----------
    x_t, Sigma_t : array
        State mean and covariance handed to the first ``kalman_filtering`` call.
    ys : 2-D array
        Observations; row ``t`` is passed to ``kalman_filtering`` as one observation vector.
    params : object
        Must provide ``A()`` and ``Q()``; it is also passed through to ``kalman_filtering``.

    Returns
    -------
    (x, Sigma)
        Smoothed mean and covariance of the earliest filtered step, conditioned on all
        rows of ``ys``. Smoothed estimates of the later steps are computed along the way
        but not returned.

    Raises
    ------
    IndexError
        If ``ys`` has no rows.
    """
    N = ys.shape[0]

    # Forward pass: entry t of each list holds the filtered estimate after row t of ys.
    x_ts, Sigma_ts = [], []
    for t in range(N):
        x_t, Sigma_t = kalman_filtering(x_t, ys[t, :], Sigma_t, params)
        x_ts.append(x_t)
        Sigma_ts.append(Sigma_t)

    # The last filtered estimate is already conditioned on every observation,
    # so it seeds the backward recursion.
    x_tau_n, Sigma_tau_n = x_ts[-1], Sigma_ts[-1]

    # A and Q do not change between steps, so build them once instead of per iteration.
    A = params.A()
    Q = params.Q()

    # Backward pass over the filtered estimates, from the second-to-last down to the first.
    for t in range(N - 2, -1, -1):
        x_filt, Sigma_filt = x_ts[t], Sigma_ts[t]
        x_hat_tau_plus = A @ x_filt  # one-step-ahead predicted mean
        Sigma_tau_plus = A @ Sigma_filt @ A.T + Q  # one-step-ahead predicted covariance

        # Smoother gain L = Sigma_filt A^T Sigma_pred^{-1}, via a linear solve
        # (L^T = Sigma_pred^{-T} A Sigma_filt^T) instead of an explicit inverse.
        Lt = np.linalg.solve(Sigma_tau_plus.T, A @ Sigma_filt.T).T
        x_tau_n = x_filt + Lt @ (x_tau_n - x_hat_tau_plus)
        Sigma_tau_n = Sigma_filt + Lt @ (Sigma_tau_n - Sigma_tau_plus) @ Lt.T
    return x_tau_n, Sigma_tau_n

In [None]:
# Arrange the log-turnover series as one row per day with T intraday observations per row.
# reshape(-1, T) infers the row count and raises ValueError if the length is not a
# multiple of T.
ys = df["Log_Turnover"].to_numpy().reshape(-1, T)

N_train = 110 # about half the data
x_tau_n, Sigma_tau_n = kalman_smoothing(x_1, ys[:N_train, :], Sigma_1, theta)
# dimensional check again
print(x_tau_n.shape)
print(Sigma_tau_n.shape)

In [None]:
# Unfinished sketch of `em`; this cell does not run as written.
# FIXME(review): the signature is not valid Python -- `;` cannot separate parameters
#   (use `,`) and the closing `:` is missing.
def em(params_0; maxsteps=10, tol=1e-3)
    # FIXME(review): `np.Inf` was removed in NumPy 2.0; `np.inf` is the supported spelling.
    i = 0; err = np.Inf
    # FIXME(review): neither `i` nor `err` is updated in the visible body, and with `or`
    #   the loop keeps running once `err < tol`; presumably `i < maxsteps and err > tol`
    #   was intended -- confirm.
    while i < maxsteps or err < tol:
        # Iteratively do it
        # NOTE(review): `N` is not assigned in this function or at module level in the
        #   visible cells -- confirm where it should come from.
        for tau in range(N-1, 0, -1):
            # TO DO