In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal
from statsmodels.graphics.gofplots import qqplot
from kalman_predictor import *

# Read the normalized 30-minute AAPL file, index the rows by the `datetime`
# column (kept as a regular column as well), and add the log of the
# normalized turnover as `Log_Turnover`.
df = (
    pd.read_csv("../data/TAQ_30Min_AAPL_2023_normalized.csv")
    .set_index("datetime", drop=False)
    .assign(Log_Turnover=lambda frame: np.log(frame["Normalized_TURNOVER"]))
)
df.info()

## Preliminary: QQ plot
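The Kalman filter paper includes a QQ plot, so we produce the same diagnostic here for the raw and the log-transformed normalized turnover.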

In [None]:
# QQ plot of the raw normalized turnover against a normal distribution
# (line="s" draws the standardized reference line).
qq_nolog = qqplot(df["Normalized_TURNOVER"], line="s")
qq_nolog.tight_layout()
# Save through the figure handle *before* plt.show(): once the figure has been
# shown it is closed, and a later plt.savefig() would write an empty canvas.
qq_nolog.savefig("nolog_qqplot.pdf")
plt.show()

In [None]:
# QQ plot of the log of the normalized turnover against a normal distribution
# (line="s" draws the standardized reference line).
qq_log = qqplot(np.log(df["Normalized_TURNOVER"]), line="s")
qq_log.tight_layout()
# Save through the figure handle *before* plt.show(): once the figure has been
# shown it is closed, and a later plt.savefig() would write an empty canvas.
qq_log.savefig("log_qqplot.pdf")
plt.show()

In [None]:
# 1x2 row of ones; later cells left-multiply the 2-component state by it
# (C @ x), i.e. they sum the two state components.
C = np.array([[1.0, 1.0]])
C.shape

## Kalman filtering

In [None]:
# Set up the initial state and parameters, then run a one-step dimensional test.

# First observation (a one-row Series), split evenly between the two state components.
y_1 = df.head(1)["Log_Turnover"]
x_1 = np.reshape(np.array([y_1/2, y_1/2]), 2)
Sigma_1 = np.eye(2)

# Initial parameter guess; the argument order follows the Params constructor
# in kalman_predictor. The last argument has 13 entries.
theta = Params(
    x_1,
    np.identity(2)*0.5,
    1.0,
    1.0,
    0.0025,
    0.0025,
    0.0005,
    np.array([0.6, 0.25, 0.0, -0.15, -0.3, -0.45, -0.5, -0.6, -0.5, -0.25, -0.3, -0.1, 0.4]),
)
predictor = KalmanPredictor(theta)

y_plus = predictor.predict_alike(y_1, x_t=x_1, start_time=0)
# The original printed `x_plus.shape`, but no `x_plus` exists at this point
# (only `y_plus` was assigned), which raised NameError on a fresh kernel.
# np.shape also works if predict_alike returns a plain list.
print("Shape should be {}: y_plus.shape = {}".format(1, np.shape(y_plus)))

## Kalman smoothing

In [None]:
# Smoke-test the smoother on the first N_train observations.
N_train = 50
ys = df["Log_Turnover"].to_numpy()

# Only the first two of the five returned values are inspected here.
x_tau_n, Sigma_tau_n, _, _, _ = kalman_smoothing(x_1, ys[:N_train], Sigma_1, theta)

# Dimensional check: print the shape of both results.
for smoothed in (x_tau_n, Sigma_tau_n):
    print(smoothed.shape)

## Expectation maximization
In this step we want to predict the hidden state vector $x_\tau = [\eta_\tau\ \mu_\tau]^\top \in \mathbb{R}^2$. Its components $\eta_\tau$ and $\mu_\tau$ are the daily average part and the intraday dynamic part of the log volume, respectively.

In [None]:
# Run EM starting from the initial guess `theta`, for at most 25 steps with tolerance 0.05.
# NOTE(review): this slice skips the first observation and scales N_train by 13,
# whereas the smoothing check above uses the first N_train observations — confirm
# both bounds are intended.
em_observations = ys[1:N_train*13]
test_params = em(x_1, em_observations, theta, maxsteps=25, tol=0.05)

In [None]:
# Rebuild the predictor from the EM-fitted parameters (`test_params`),
# replacing the one built earlier from the initial guess `theta`.
predictor = KalmanPredictor(test_params) # the parameters were already trained in the previous cell; this step is a holdover from earlier code

## Test Kalman filter with given params

In [None]:
# Total number of log-turnover observations.
N = len(df["Log_Turnover"])

In [None]:
# Initial observation (one-row Series), state and covariance. `predict_alike`
# below does not receive them; they are kept so the names stay defined.
y_t = df.iloc[0:1]["Log_Turnover"]
x_t = np.reshape(np.array([y_t/2, y_t/2]), 2)
xs = [x_t,]

Sigma_t = np.identity(2)
sigmas = [Sigma_t,]

# Predictions for the whole series from the fitted predictor. The previous
# `y_pred = [y_t]` seed was a dead store (overwritten immediately) and is removed.
y_pred = predictor.predict_alike(df["Log_Turnover"])

In [None]:
# Squared error of each predict_alike prediction against the observation in
# the same row position.
errs = []
for row in range(N):
    observed = df["Log_Turnover"].iloc[row:row+1]
    errs.append(np.mean(np.square(y_pred[row] - observed)))

In [None]:
# Step-by-step Kalman filtering with the EM-fitted parameters.

# Initial state: first observation split evenly between the two components,
# with an identity covariance.
y_t = df.iloc[0:1]["Log_Turnover"]
x_t = np.reshape(np.array([y_t/2, y_t/2]), 2)

Sigma_t = np.identity(2)

# y_pred_new[0] is the first observation itself, and the value appended at
# step i lands at index i+1, so the list ends up with N+1 entries.
y_pred_new = [y_t]
I = 13  # matches the 13 entries of the phi vector (presumably the number of 30-minute bins per day)
for i in range(N):
    y_t = df.iloc[i:i+1]["Log_Turnover"]
    x_plus, Sigma_plus = kalman_filtering(i, x_t, y_t, Sigma_t, test_params)
    y_pred_new.append((C@x_plus)[0] + test_params.phi[i%I])
    # Carry the filtered state and covariance into the next step. Previously
    # x_t / Sigma_t were never updated, so every step restarted from the
    # initial state and the filter output was discarded.
    x_t, Sigma_t = x_plus, Sigma_plus

In [None]:
# Squared error of each manual-filter prediction against the observation in
# the same row position (only the first N entries of y_pred_new are used).
errs_new = []
for row in range(N):
    observed = df["Log_Turnover"].iloc[row:row+1]
    errs_new.append(np.mean(np.square(y_pred_new[row] - observed)))

### Comparison: hourly average over the year

In [None]:
# Baseline: squared error of the log of the "Hourly averages" column against
# the observed log turnover, row by row.
log_avgs = np.log(df["Hourly averages"])
# Positional, vectorized difference. The previous comprehension subtracted a
# one-row Series from the full Series at every step, which is O(N^2) and gave
# the per-row value only through index alignment plus NaN-skipping in the mean.
errs_avg = np.square(log_avgs.to_numpy() - df["Log_Turnover"].to_numpy()).tolist()

In [None]:
# Per-day mean squared error over the year, Kalman filter vs. the average
# baseline. The predict_alike errors (`errs`) are not plotted here.
error_curves = (
    (errs_new, {"label": "Daily prediction error (Kalman)"}),
    (errs_avg, {"linestyle": "--", "label": "Daily prediction error (average)"}),
)

plt.xlabel("Day of year")
plt.ylabel("MSE error")
for per_bin_errors, line_kwargs in error_curves:
    # Reshape to (days, I) and average each row to get one value per day.
    per_day = np.reshape(per_bin_errors, (len(per_bin_errors)//I, I)).mean(axis=1)
    plt.semilogy(per_day, **line_kwargs)

plt.legend()
plt.savefig("kalman_errors_year.pdf")
print("Average error: {}".format(np.mean(errs_avg)))
print("Kalman error: {}".format(np.mean(errs_new)))

In [None]:
# Last five days of predictions versus the truth, one panel per day
# (panel 0 is the most recent day).
fig, axs = plt.subplots(5, sharex=True, figsize=(5,7))

# y_pred_new holds one more entry than df has rows, and errs_new pairs
# y_pred_new[i] with row i. Slicing both from the end (y_pred_new[-I:] vs
# df.iloc[-I:]) therefore shifted the prediction curve by one bin; explicit
# row positions keep the two aligned. Assumes at least 5*I rows.
n_obs = len(df)
for day, ax in enumerate(axs):
    stop = n_obs - day*I
    start = stop - I
    rows = df.iloc[start:stop]
    first = day == 0  # only the first panel contributes legend entries
    ax.semilogy(np.exp(y_pred_new[start:stop]),
                label="Predicted turnover (Kalman)" if first else None,
                linestyle="-.", color="cornflowerblue")
    ax.plot(np.exp(rows["Log_Turnover"].to_numpy()),
            label="True normalized turnover" if first else None,
            color="black")
    ax.plot(rows["Hourly averages"].to_numpy(),
            label="Predicted turnover (mean)" if first else None,
            color="orange", linestyle="--")

# One tick every second bin, labelled with the listed clock times.
daily_labels = ["9:30", "10:30", "11:30", "12:30", "13:30", "14:30", "15:30",]
plt.xticks(ticks=range(0, I, 2), labels=daily_labels)
fig.legend(loc="upper center"); plt.ylabel("Normalized turnover")
fig.tight_layout()
fig.savefig("kalman_prediction.pdf")