# QBM: Log Returns

In [1]:
%load_ext autoreload
%autoreload 2
%load_ext autotime

from datetime import datetime
from joblib import Parallel, delayed

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dwave.system import DWaveSampler, FixedEmbeddingComposite
from minorminer import find_embedding
from numba import njit
from scipy.constants import k as k_B, h as h_P

k_B /= h_P * 1e9

from qbm.models import BQRBM
from qbm.plotting import plot_qq
from qbm.utils import (
    binarize_df,
    convert_bin_list_to_str,
    get_binarization_params,
    get_project_dir,
    get_rng,
    kl_divergence,
    load_artifact,
    load_log_returns,
    lr_exp_decay,
    prepare_training_data,
    save_artifact,
    unbinarize_df,
    compute_stats_over_dfs
)

# load anneal schedule data
df_anneal = pd.read_csv(
    project_dir
    / "data/anneal_schedules/csv/09-1265A-A_Advantage_system5_1_annealing_schedule.csv",
    index_col="s",
)
if 0.5 not in df_anneal.index:
    df_anneal.loc[0.5] = (df_anneal.loc[0.499] + df_anneal.loc[0.501]) / 2
    

NameError: name 'project_dir' is not defined

time: 1.97 s (started: 2022-02-08 16:33:50 +01:00)


## Data Loading & Preprocessing

In [None]:
# configuration
model_name = "baseline"
model_id = "01"

project_dir = get_project_dir()
data_dir = project_dir / "data"
models_dir = project_dir / "artifacts/qbm/log_returns/models"
if not models_dir.exists():
    models_dir.mkdir(parents=True)
plots_dir = project_dir / "results/plots/qbm/log_returns"
if not plots_dir.exists():
    plots_dir.mkdir(parents=True)
config_path = models_dir / f"{model_name}/config.json"
config = load_artifact(config_path)

model_params = config["model"]
data_params = config["data"]
if model_id is None:
    model_params["id"] = f"{datetime.now().strftime('%Y%m%d_%H%M%S')}"
else:
    model_params["id"] = model_id
artifacts_dir = models_dir / model_name / model_params["id"]
if not artifacts_dir.exists():
    artifacts_dir.mkdir(parents=True)
config["model"]["id"] = model_params["id"]

rng = get_rng(model_params["seed"])

# data loading
date_format = "%Y-%m-%d"
start_date = datetime.strptime(data_params["start_date"], date_format)
end_date = datetime.strptime(data_params["end_date"], date_format)
if model_params["volatility_indicators"]:
    start_date -= timedelta(days=90)

log_returns = load_log_returns(
    data_params["data_source"],
    start_date=start_date,
    end_date=end_date,
    outlier_threshold=data_params["outlier_threshold"],
)
log_returns_raw = log_returns.copy()

# volatility indicators
volatility_binarized = None
if model_params["volatility_indicators"]:
    volatility_binarized = binarize_volatility(
        compute_rolling_volatility(log_returns, timedelta(days=90))
    )

# data transformation
transformer = None
if model_params["transform"].get("type") is not None:
    if model_params["transform"]["type"] == "quantile":
        transformer = QuantileTransformer(**model_params["transform"]["params"])
        log_returns = pd.DataFrame(
            transformer.fit_transform(log_returns),
            columns=log_returns.columns,
            index=log_returns.index,
        )
    elif model_params["transform"]["type"] == "power":
        transformer = PowerTransformer(
            log_returns, **model_params["transform"]["params"]
        )
        log_returns = transformer.transform(log_returns)

# binarization
binarization_params = get_binarization_params(
    log_returns, n_bits=model_params["n_bits"]
)
log_returns_binarized = binarize_df(log_returns, binarization_params)
model_params["binarization_params"] = binarization_params

# create the training set
training_data = prepare_training_data(log_returns_binarized, volatility_binarized)
X_train = training_data["X_train"]
rng.shuffle(X_train)
model_params["X_train_shape"] = X_train.shape
model_params["columns"] = training_data["columns"]
model_params["split_indices"] = training_data["split_indices"]

# save the config
model_params["n_visible"] = X_train.shape[-1]
model_params["n_qubits"] = model_params["n_visible"] + model_params["n_hidden"]
save_artifact(config, config_path)

## Embedding Generation

In [None]:
generate_embeddings = False
max_chain_length = 7
max_qubits = 400
embedding_ids = range(1, 11)
embeddings_dir = (
    project_dir
    / f"artifacts/qbm/log_returns/embeddings/{model_params['n_visible']}x{model_params['n_hidden']}"
)
embeddings = {}
if generate_embeddings:
    # generate the underlying graphical structure to use for determining the embedding
    qpu = DWaveSampler(**model_params["qpu"])
    source_edgelist = []
    for i in range(model_params["n_visible"]):
        for j in range(model_params["n_visible"], model_params["n_qubits"]):
            source_edgelist.append((i, j))
    _, target_edgelist, target_adjacency = qpu.structure

    # generate embeddings which satisfy the max chain length
    for embedding_id in embedding_ids:
        max_chain_length_satisfied = False
        while not max_chain_length_satisfied:
            # generate embedding
            embedding = find_embedding(source_edgelist, target_edgelist)

            # check max chain length
            for logical_qubit, physical_qubits in embedding.items():
                if len(physical_qubits) > max_chain_length:
                    break
            else:
                if np.sum([len(x) for x in embedding.values()]) <= max_qubits:
                    max_chain_length_satisfied = True

        embeddings[embedding_id] = embedding
        
    # save embeddings
    for embedding_id, embedding in embeddings.items():
        save_artifact(embeddings_dir / f"{embedding_id:02}.json")
else:
    for embedding_path in sorted([x for x in embeddings_dir.iterdir()]):
        embedding_id = int(embedding_path.stem)
        embeddings[embedding_id] = {
            int(k): v for k, v in load_artifact(embedding_path).items()
        }

chain_lengths = {}
for embedding_id, embedding in embeddings.items():
    chain_lengths[embedding_id] = {i: 0 for i in range(1, max_chain_length + 1)}
    for logical_qubit, physical_qubits in embedding.items():
        chain_length = len(physical_qubits)
        chain_lengths[embedding_id][chain_length] += 1
chain_lengths = pd.DataFrame.from_dict(chain_lengths, orient="index")
chain_lengths["n_qubits"] = (chain_lengths * np.arange(1, max_chain_length + 1)).sum(axis=1)
chain_lengths

## Embedding 1 Analysis

[(0, 0), (11.0, 0.55), (11.225, 1)]

time: 5.94 ms (started: 2022-02-08 16:28:15 +01:00)


In [5]:
# set the model params
s_freeze = 1.0
train_model = True
embedding_ids = [1]
n_epochs = 50

# set anneal schedule
t_a = 20
s_pause = 0.55
α_quench = 2
t_pause = round(s_pause * t_a, 3)
Δ_quench = round((1 - s_pause) / α_quench, 3)
if s_pause == 1:
    anneal_schedule = [(0, 0), (t_pause, s_pause)]
else:
    anneal_schedule = [
        (0, 0),
        (t_pause, s_pause),
        (round(t_pause + Δ_quench, 3), 1),
    ]

# train the models
models_annealer = {}
for relative_chain_strength in np.arange(1, 11) / 10:
    # set model name and path
    model_name = f"model_annealer-s_pause={s_pause:.2f}-s={s:.2f}-embedding_{embedding_id:02}"
    model_path = artifact_dir / f"models/embedding_01_rcs/{model_name}.pkl"
    if train_model:
        # model_annealer params
        beta_initial = 0.45
        exact_params = None

        # training params
        n_epochs = 100
        n_samples = 10_000
        learning_rate = 0.1
        mini_batch_size = 10
        epochs = np.arange(1, n_epochs + 1)
        learning_rates = learning_rate * lr_exp_decay(
            epochs, decay_epoch=50, period=10
        )
        learning_rates_beta = learning_rate * lr_exp_decay(
            epochs, decay_epoch=50, period=20
        )

        # set the anneal params
        anneal_params = {
            "s": s_freeze,
            "A": df_anneal.loc[s_freeze, "A(s) (GHz)"],
            "B": df_anneal.loc[s_freeze, "B(s) (GHz)"],
            "schedule": anneal_schedule,
        }

        # skip if model already exists
        if model_path.exists():
            print("Model already exists")
            continue

        # model init
        model_annealer = BQRBM(
            X_train=X_train,
            n_hidden=n_hidden,
            embedding=embedding,
            anneal_params=anneal_params,
            beta_initial=beta_initial,
            exact_params=exact_params,
        )

        # model train and save
        model_annealer.train(
            n_epochs=n_epochs,
            n_samples=n_samples,
            learning_rate=learning_rates,
            learning_rate_beta=learning_rates_beta,
            mini_batch_size=mini_batch_size,
            callback=callback,
        )
        models_annealer[embedding_id, s_pause] = model_annealer
        model_annealer.save(model_path)

        # save attributes as dict in case of error loading old pickled object
        model_annealer_attributes = {
            "A": model_annealer.A,
            "B": model_annealer.B,
            "a": model_annealer.a,
            "b": model_annealer.b,
            "W": model_annealer.W,
            "beta": model_annealer.beta,
            "embedding": model_annealer.embedding,
            "qpu_params": model_annealer.qpu_params,
            "anneal_params": model_annealer.anneal_params,
            "exact_params": model_annealer.exact_params,
            "beta_history": model_annealer.beta_history,
            "callback_outputs": [x for x in model_annealer.callback_outputs],
        }
        save_artifact(
            model_annealer_attributes,
            artifact_dir / f"models/{model_name}-attributes.pkl",
        )

0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
1.0
time: 2.59 ms (started: 2022-02-08 16:29:17 +01:00)
