In [None]:
# =========================
# Imports and basic setup
# =========================

import os
import pickle

import iisignature
import matplotlib.pylab as pylab
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sigkernel
import torch
from sklearn.metrics import mean_absolute_error, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit
from sklearn.svm import SVR
from tqdm.notebook import tqdm

# Notebook-wide matplotlib defaults: every text-size option is set to 'large'
# and the default figure is a wide 16x5 canvas.
params = dict.fromkeys(
    (
        'legend.fontsize',
        'axes.labelsize',
        'axes.titlesize',
        'xtick.labelsize',
        'ytick.labelsize',
    ),
    'large',
)
params['figure.figsize'] = (16, 5)
pylab.rcParams.update(params)


In [None]:
# =========================
# Error function
# =========================
# In the Bitcoin notebook this was a true MAPE, but that breaks for 0 labels.
# Here we just use plain MAE on 0/1 labels, but keep the function name
# so the rest of the code changes minimally.

def mean_absolute_percentage_error(y_true, y_pred):
    """
    Mean absolute error between targets and predictions.

    The name is kept so existing callers need no change; no percentage is
    computed. Both inputs are converted to arrays and flattened to 1-D before
    the element-wise absolute difference is averaged.
    """
    truth = np.ravel(np.asarray(y_true))
    guess = np.ravel(np.asarray(y_pred))
    return np.abs(truth - guess).mean()


In [None]:
# =========================
# Helper: windowing for multivariate account data
# =========================
# X_account: numpy array of shape (T, M) with binary 0/1 interest per CUSIP.
# target_cusip_idx: integer index of the CUSIP we want to predict.
# h_window: length of the historical window (in time steps).

def build_account_windows(X_account, target_cusip_idx, h_window=36):
    """
    Build sliding windows for a single account.

    Sample i pairs the h_window consecutive rows i .. i + h_window - 1 with the
    target CUSIP's value on the row immediately after the window
    (row i + h_window). T rows therefore yield N = max(T - h_window, 0)
    samples, so h_window + 1 rows are enough for exactly one sample.

    Input:
        X_account: array-like of shape (T, M)
        target_cusip_idx: int, column index of target CUSIP
        h_window: int >= 1, number of past steps in each input window

    Returns:
        X_windows: float array of shape (N, h_window, M)
        y: float array of shape (N,) with the target column's value at the
           step right after each window

    Raises:
        ValueError: if X_account is not 2-D or h_window < 1.
    """
    X_account = np.asarray(X_account)
    if X_account.ndim != 2:
        raise ValueError(
            f"X_account must be 2-D (T, M); got an array with shape {X_account.shape}."
        )
    if h_window < 1:
        raise ValueError(f"h_window must be >= 1; got {h_window}.")

    T, M = X_account.shape
    n_windows = max(T - h_window, 0)

    # Pre-allocating keeps the (N, h_window, M) shape even when N == 0.
    X_windows = np.empty((n_windows, h_window, M), dtype=float)
    y = np.empty(n_windows, dtype=float)

    for start in range(n_windows):
        stop = start + h_window
        # window of past h_window steps: rows start .. stop - 1
        X_windows[start] = X_account[start:stop, :]
        # label: row `stop` is the first step the window has not seen
        y[start] = X_account[stop, target_cusip_idx]

    return X_windows, y


In [None]:
# =========================
# (Optional) Plot helper
# =========================
# If you ever want to visualise predicted probabilities vs true 0/1 labels.

def PlotResult(y_train, y_test, y_train_predict, y_test_predict, name):
    """
    Plot predictions against the actual labels on one time axis.

    Train predictions are drawn from index 0, test predictions continue at
    index len(y_train) (dashed), the concatenated true labels are overlaid in
    green, and the test span is shaded grey. `name` is inserted in the title.
    The figure is shown; nothing is returned.
    """
    n_train = len(y_train)
    n_test = len(y_test)
    test_stop = n_train + n_test

    fig, ax = plt.subplots(1, figsize=(12, 5))

    # Predicted series: solid for the train part, dashed for the test part.
    ax.plot(y_train_predict, color='red', label='Train predicted p(interest)')
    ax.plot(
        range(n_train, test_stop),
        y_test_predict,
        color='red',
        linestyle='--',
        label='Test predicted p(interest)',
    )

    # Ground truth over the whole (train + test) range.
    actual = np.concatenate([y_train, y_test])
    ax.plot(actual, color='green', label='Actual interest (0/1)')

    # Shade the region that belongs to the test set.
    ax.axvspan(n_train, test_stop, alpha=0.3, color='lightgrey')

    ax.grid(True)
    ax.axis('tight')
    ax.legend(loc="best")
    ax.set_xlabel('Time index (windowed)')
    ax.set_ylabel('Interest in target CUSIP (probability / 0-1)')
    ax.set_title(f'Account propensity prediction ({name})')
    plt.show()


In [None]:
# =========================
# Load / prepare account data
# =========================
# ---- YOU need to provide `client_cusip_binary` and choose account + target CUSIP ----
#
# We assume you already created a dictionary elsewhere like:
#   client_cusip_binary[account_id] = DataFrame
#       index  = timestamps (sorted)
#       columns = CUSIP IDs (strings or ints)
#       values = 0/1 binary interaction indicators
#
# Here we just plug that in.

# Example placeholders (replace these with actual objects in your environment):

# client_cusip_binary = pickle.load(open('client_cusip_binary.pkl', 'rb'))
# ACCOUNT_ID = 'some_account_id'
# TARGET_CUSIP_ID = 'some_cusip'

# ---- START of the user-specific block ----
ACCOUNT_ID = 'REPLACE_ME_ACCOUNT'
TARGET_CUSIP_ID = 'REPLACE_ME_CUSIP'

# This must exist in your environment:
# client_cusip_binary = {account_id: df_account, ...}
# Each df_account is a DataFrame index=dates, columns=cusips, values={0,1}
#
# Nothing in this notebook creates `client_cusip_binary`, so on a fresh kernel
# the lookup below would die with a bare NameError. Fail with instructions instead.
if 'client_cusip_binary' not in globals():
    raise NameError(
        "`client_cusip_binary` is not defined. Load it before running this cell, e.g. "
        "with open('client_cusip_binary.pkl', 'rb') as f: client_cusip_binary = pickle.load(f)"
    )

# The placeholder ids above are not real accounts; say so explicitly.
if ACCOUNT_ID not in client_cusip_binary:
    raise KeyError(
        f"Account {ACCOUNT_ID!r} not found in client_cusip_binary; "
        "set ACCOUNT_ID to an existing account id."
    )

df_account = client_cusip_binary[ACCOUNT_ID].copy()

# Sort by time just to be safe
df_account = df_account.sort_index()

# Make sure target CUSIP is present (explicit raise: unlike `assert`, it is
# not stripped when Python runs with -O)
if TARGET_CUSIP_ID not in df_account.columns:
    raise KeyError("Target CUSIP not in account's CUSIP list.")

cusip_list = df_account.columns.tolist()
target_cusip_idx = cusip_list.index(TARGET_CUSIP_ID)

# Convert to numpy
X_account = df_account.values.astype(float)  # shape (T, M)
T, M = X_account.shape
print(f"Account {ACCOUNT_ID}: T={T}, M={M} (CUSIPs)")


In [None]:
# =========================
# Create windows and labels for this account + target CUSIP
# =========================

h_window = 36  # window length in time steps; same order as in Bitcoin example; tune later if needed

X_window, y = build_account_windows(X_account, target_cusip_idx, h_window=h_window)

# An account history that is too short produces zero samples. Stop here with a
# readable message rather than letting empty arrays reach the later
# transform / split / fit cells.
if len(y) == 0:
    raise ValueError(
        f"No windows could be built: the account has {X_account.shape[0]} time steps "
        f"but h_window={h_window}. Use a longer history or a smaller h_window."
    )

print(f"Shapes: X_window = {X_window.shape}, y = {y.shape}")
# X_window: (N, h_window, M)
# y: (N,)


In [None]:
# =========================
# Transform paths for signature kernels
# =========================
# We follow the same pattern as the BTC notebook:
#    - convert to torch tensor
#    - use sigkernel.transform to add time and lead-lag if desired
#      (this function can handle multivariate paths of shape (N, L, dim)).

# convert to torch
# Convert the windows to a float64 tensor of shape (N, L, M) and augment them
# in one step; `at` / `ll` are the time and lead-lag switches described above
# (same arguments as the original notebook).
X_window_torch = sigkernel.transform(
    torch.tensor(X_window, dtype=torch.float64),
    at=True,
    ll=True,
    scale=1e-5,
)

# Chronological 80/20 split: shuffle=False keeps the time order intact.
split_parts = train_test_split(
    X_window_torch.numpy(), y, test_size=0.2, shuffle=False
)

# Paths go back to float64 CPU tensors, labels to flat float arrays.
x_train, x_test = (
    torch.tensor(part, dtype=torch.float64, device='cpu') for part in split_parts[:2]
)
y_train, y_test = (
    np.array(part, dtype=float).reshape(-1) for part in split_parts[2:]
)

print("After split:")
for tag, arr in (
    ("  x_train:", x_train),
    ("  x_test :", x_test),
    ("  y_train:", y_train),
    ("  y_test :", y_test),
):
    print(tag, arr.shape)


In [None]:
# =========================
# Training phase: hyperparameters for SVR
# =========================

# Hyperparameter grid shared by the SVR searches below: one value per decade,
# C from 10^0 to 10^4 and gamma from 10^-4 to 10^4.
svr_parameters = dict(
    C=np.logspace(start=0, stop=4, num=5),
    gamma=np.logspace(start=-4, stop=4, num=9),
)


In [None]:
# =========================
# Training: truncated signature features + SVR
# =========================

# Model selection keeps the configuration with the highest mean CV R^2
# (GridSearchCV's default score for a regressor). R^2 never exceeds 1, so
# "closest to 1" and "largest" are the same criterion.
best_error_sig = -np.inf
best_sig_model = None

# The samples are ordered in time, so every validation fold must come after
# the data it is trained on. Plain cv=5 (unshuffled K-fold) would also train
# on windows that lie in the future of the validation fold.
sig_cv = TimeSeriesSplit(n_splits=5)

x_train_np = x_train.numpy()   # x_train: shape (N_train, L_aug, dim_aug)
path_dim = x_train.shape[-1]   # dim of path

# Depth, scale, kernel, normalization search as before
for depth in tqdm([2, 3, 4, 5, 6], desc="Signature depth"):
    for scale in tqdm([1., 2., 3., 4., 5.], leave=False, desc="Scale"):

        # Truncated signatures on the transformed paths. They depend only on
        # (depth, scale), so compute them once instead of once per
        # kernel/normalize combination.
        sig_raw = iisignature.sig(scale * x_train_np, depth)
        sig_by_normalize = {
            # Normalize a copy in case the helper works in place
            # (not verified here), so the raw features stay untouched.
            True: sigkernel.normalize(sig_raw.copy(), path_dim, depth),
            False: sig_raw,
        }

        for ker in tqdm(['linear', 'rbf'], leave=False, desc="SVR kernel"):

            # gamma has no effect on a linear SVR; searching it would refit
            # the same model once per gamma value.
            if ker == 'rbf':
                param_grid = svr_parameters
            else:
                param_grid = {'C': svr_parameters['C']}

            for normalize in tqdm([True, False], leave=False, desc="Normalize?"):

                sig_train = sig_by_normalize[normalize]

                # fit the model
                svr = SVR(kernel=ker)
                svr_sig = GridSearchCV(
                    estimator=svr,
                    param_grid=param_grid,
                    cv=sig_cv,
                    n_jobs=-1
                )
                svr_sig.fit(sig_train, y_train)

                # select best model (criterion: highest CV R^2)
                if svr_sig.best_score_ > best_error_sig:
                    best_sig_model = svr_sig
                    best_error_sig = svr_sig.best_score_
                    best_depth_sig = depth
                    best_scale_sig = scale
                    best_ker_sig = ker
                    normalize_sig = normalize

# A NaN score never wins the comparison above; report that case clearly
# instead of failing on an undefined name in the prints below.
if best_sig_model is None:
    raise RuntimeError(
        "No truncated-signature model produced a valid CV score; check the training data."
    )

print("Best truncated-signature model:")
print(f"  depth={best_depth_sig}, scale={best_scale_sig}, kernel={best_ker_sig}, normalize={normalize_sig}")
print(f"  CV R^2 = {best_error_sig:.4f}")


In [None]:
# =========================
# Training: signature PDE kernel + SVR (precomputed kernel)
# =========================

# Keep the sigma whose best SVR reaches the highest mean CV R^2
# (R^2 <= 1, so "closest to 1" and "largest" coincide).
best_error_pde = -np.inf
best_pde_model = None

# Time-ordered samples: validate only on folds that follow the training data
# (plain cv=5 would also train on the future of the validation fold).
pde_cv = TimeSeriesSplit(n_splits=5)

# With kernel='precomputed' the SVR never evaluates its own kernel, so gamma
# is unused; searching it would only repeat identical fits. The kernel width
# is controlled by `sigma` below.
pde_param_grid = {'C': svr_parameters['C']}

for sigma in tqdm([5e-2, 1e-1, 2.5e-1, 5e-1, 7.5e-1, 1.], desc="Sigma (RBF static kernel)"):

    # Specify the static kernel
    static_kernel = sigkernel.RBFKernel(sigma=sigma)

    # Initialize the corresponding signature kernel
    signature_kernel = sigkernel.SigKernel(static_kernel, dyadic_order=0)

    # Gram matrix train
    G_train = signature_kernel.compute_Gram(x_train, x_train, sym=True).numpy()

    # fit the model
    svr = SVR(kernel='precomputed')
    svr_pde = GridSearchCV(
        estimator=svr,
        param_grid=pde_param_grid,
        cv=pde_cv,
        n_jobs=-1
    )
    svr_pde.fit(G_train, y_train)

    if svr_pde.best_score_ > best_error_pde:
        best_pde_model = svr_pde
        best_error_pde = svr_pde.best_score_
        best_sigma = sigma

# A NaN score never wins the comparison above; report that case clearly
# instead of failing on an undefined name in the prints below.
if best_pde_model is None:
    raise RuntimeError(
        "No PDE-signature model produced a valid CV score; check the training data."
    )

print("Best PDE-signature model:")
print(f"  sigma={best_sigma}, CV R^2 = {best_error_pde:.4f}")


In [None]:
# =========================
# Testing phase
# =========================

# `kernels`: models compared in the testing phase ('gak' is left out because
# it is never defined in this notebook). `final`: container for the metrics.
final, kernels = {}, ['rbf', 'sig', 'sig_PDE']


In [None]:
# =========================
# Evaluation on test set
# =========================

def _score_test_predictions(y_true, y_pred):
    """
    Return (MAE, AUC) for raw regression outputs against 0/1 labels.

    MAE is computed on predictions clipped to [0, 1] so they can be read as
    probabilities. AUC is computed on the raw scores: it only depends on their
    ranking, and clipping would collapse every value below 0 (or above 1)
    into a tie. AUC is NaN when roc_auc_score raises ValueError
    (e.g. only one class appears in y_true).
    """
    mae = mean_absolute_percentage_error(y_true, np.clip(y_pred, 0.0, 1.0))
    try:
        auc = roc_auc_score(y_true, y_pred)
    except ValueError:
        auc = np.nan
    return mae, auc


for ker in tqdm(kernels):

    if ker == 'sig_PDE':

        # Rebuild signature kernel with the best sigma
        static_kernel = sigkernel.RBFKernel(sigma=best_sigma)
        signature_kernel = sigkernel.SigKernel(static_kernel, dyadic_order=0)

        # Gram matrices
        G_train = signature_kernel.compute_Gram(x_train, x_train, sym=True).numpy()
        G_test = signature_kernel.compute_Gram(x_test, x_train, sym=False).numpy()

        # predict
        y_train_predict = best_pde_model.predict(G_train)
        y_test_predict = best_pde_model.predict(G_test)

        result_key = (ker,)

    elif ker == 'sig':

        # truncated signatures for train and test
        sig_train = iisignature.sig(best_scale_sig * x_train.numpy(), best_depth_sig)
        sig_test = iisignature.sig(best_scale_sig * x_test.numpy(), best_depth_sig)

        # normalization
        if normalize_sig:
            sig_train = sigkernel.normalize(sig_train, x_train.shape[-1], best_depth_sig)
            sig_test = sigkernel.normalize(sig_test, x_test.shape[-1], best_depth_sig)

        # predict
        y_train_predict = best_sig_model.predict(sig_train)
        y_test_predict = best_sig_model.predict(sig_test)

        result_key = (ker, f'depth_{best_depth_sig}', f'ker_{best_ker_sig}')

    elif ker == 'rbf':
        # Baseline: flatten the augmented path and fit a vanilla RBF SVR
        X_flat_train = x_train.numpy().reshape(x_train.shape[0], -1)
        X_flat_test = x_test.numpy().reshape(x_test.shape[0], -1)

        svr_rbf = GridSearchCV(
            estimator=SVR(kernel='rbf'),
            param_grid=svr_parameters,
            # time-ordered samples: validation folds must follow their training data
            cv=TimeSeriesSplit(n_splits=5),
            n_jobs=-1
        )
        svr_rbf.fit(X_flat_train, y_train)

        y_train_predict = svr_rbf.predict(X_flat_train)
        y_test_predict = svr_rbf.predict(X_flat_test)

        result_key = (ker,)

    else:
        # Make an unsupported entry in `kernels` visible instead of silently
        # producing no result for it.
        print(f"Skipping unknown kernel {ker!r}: no evaluation branch is defined for it.")
        continue

    # Clip to [0,1] to interpret as probabilities if you wish
    y_train_predict_clipped = np.clip(y_train_predict, 0.0, 1.0)
    y_test_predict_clipped = np.clip(y_test_predict, 0.0, 1.0)

    # calculate errors (MAE-style) and AUC
    p_error_test, auc_test = _score_test_predictions(y_test, y_test_predict)

    final[result_key + ('MAE',)] = p_error_test
    final[result_key + ('AUC',)] = auc_test

# Save results (create the target directory first so a long run is not lost
# to a FileNotFoundError at the very end)
results_path = '../results/account_propensity_results.pkl'
os.makedirs(os.path.dirname(results_path), exist_ok=True)
with open(results_path, 'wb') as file:
    pickle.dump(final, file)


In [None]:
# =========================
# Print results
# =========================

# Read the metrics back from the pickle written by the evaluation cell and
# list them one entry per line.
with open('../results/account_propensity_results.pkl', 'rb') as file:
    final_loaded = pickle.load(file)

print("Final test errors and AUCs (per kernel):")
for k in final_loaded:
    v = final_loaded[k]
    print(k, ":", v)
