In [None]:
import os
import sys

import numpy as np
import pandas as pd
import torch
from joblib import Parallel, delayed

#sys.path.append("../")
from helpers.trainer import train_HSIC_IV
from helpers.utils import to_torch, med_sigma
from models.hsicx import LinearHSICX
from models.kernel import RBFKernel, CategoryKernel


In [2]:
# Training settings: the whole dict is passed to train_HSIC_IV, and 'lr' is
# additionally handed to the LinearHSICX constructor.
config = dict(
    batch_size=256,
    lr=1e-2,
    max_epoch=700,
    num_restart=4,
)

In [None]:
# Train/test CSV locations, keyed by the dataset-size label that is later used
# in the output file name.
datasets = {
    "1000": dict(
        train='../../../data_sim/results/train_Zbin_g_mult_g_log_f_lin1000.csv',
        test='../../../data_sim/results/test_Zbin_g_mult_g_log_f_lin1000.csv',
    ),
    "10000": dict(
        train='../../../data_sim/results/train_Zbin_g_mult_g_log_f_lin10000.csv',
        test='../../../data_sim/results/test_Zbin_g_mult_g_log_f_lin10000.csv',
    ),
}

# Instrument type: 'Binary' or 'Continuous'.
# run_single_rep uses CategoryKernel for 'Binary' and an RBF kernel for any
# other value.
instrument = 'Binary'

def run_single_rep(i, X, Y, Z, X_test, X_test_grid, config):
    """Train one linear HSIC IV model and predict on the two test inputs.

    Parameters
    ----------
    i : int
        Repetition index. Used as the random seed so that each repetition is
        reproducible and distinct from the others.
    X, Y, Z
        Training data, forwarded unchanged to ``train_HSIC_IV``. ``X`` is also
        converted with ``to_torch`` for the intercept correction and ``Y`` must
        provide ``.mean()``. The model is built with ``input_dim=2``, so ``X``,
        ``X_test`` and ``X_test_grid`` are presumably two-column arrays
        (TODO confirm against the data files).
    X_test, X_test_grid
        Inputs on which predictions are returned.
    config : dict
        Must contain ``'lr'``; the whole dict is also passed to
        ``train_HSIC_IV``.

    Returns
    -------
    tuple
        ``(y_hat_test, y_hat_grid, weights)`` where the first two entries are
        numpy arrays of intercept-adjusted predictions for ``X_test`` and
        ``X_test_grid`` and ``weights`` is the flattened weight of
        ``hsic_net.layers[0]`` as a Python list.
    """
    # Seed per repetition. Previously `i` was unused and nothing was seeded, so
    # runs were not reproducible and were not guaranteed to differ from one
    # joblib worker process to the next.
    np.random.seed(i)
    torch.manual_seed(i)

    # HSIC IV model setup. The module-level `instrument` flag selects the
    # kernel on Z; the median-heuristic bandwidth is only computed when the
    # RBF kernel actually needs it.
    kernel_e = RBFKernel(sigma=1)
    if instrument == 'Binary':
        kernel_z = CategoryKernel()
    else:
        kernel_z = RBFKernel(sigma=med_sigma(Z))

    # Train HSIC IV model
    hsic_net = LinearHSICX(input_dim=2, lr=config['lr'], lmd=0.0, kernel_e=kernel_e, kernel_z=kernel_z, bias=False)
    hsic_net = train_HSIC_IV(hsic_net, config, X, Y, Z, verbose=True)

    # The model is built with bias=False, so shift the predictions by the gap
    # between the mean of Y and the mean fitted value on the training inputs.
    intercept_adjust = Y.mean() - hsic_net(to_torch(X)).mean()
    y_hat_hsic = intercept_adjust + hsic_net(to_torch(X_test))
    y_hat_hsic_grid = intercept_adjust + hsic_net(to_torch(X_test_grid))

    return (
        y_hat_hsic.detach().numpy().copy(),
        y_hat_hsic_grid.detach().numpy().copy(),
        hsic_net.layers[0].weight.data.flatten().tolist()
    )

# Number of independent repetitions per dataset size; also used as the joblib
# worker count so that all repetitions run concurrently.
n_reps = 10

for size, paths in datasets.items():
    # Create the output directory before the expensive training step so that a
    # missing or unwritable location is reported immediately instead of after
    # all repetitions have finished.
    beta_path = f'../output_data/hsic_lin_result_beta_UI_{size}.csv'
    os.makedirs(os.path.dirname(beta_path), exist_ok=True)

    # Load train and test data (the first CSV row is skipped as a header)
    train_data = np.genfromtxt(paths['train'], delimiter=',', skip_header=1)
    test_data = np.genfromtxt(paths['test'], delimiter=',', skip_header=1)

    # Prepare data
    # Training columns: 0 -> Z, 1-2 -> X, 3 -> Y.
    Z, X, Y = train_data[:, 0], train_data[:, 1:3], train_data[:, 3]
    # Test columns: 0-1 -> X_test, 3-4 -> X_test_grid (column 2 is not used here).
    X_test = test_data[:, 0:2].astype(np.float32)
    X_test_grid = test_data[:, 3:5].astype(np.float32)

    # Run the repetitions in parallel; the index i is passed to run_single_rep.
    results = Parallel(n_jobs=n_reps)(
        delayed(run_single_rep)(i, X, Y, Z, X_test, X_test_grid, config)
        for i in range(n_reps)
    )

    # Combine results into DataFrames, one column per run.
    # Despite its name, df_mse holds the X_test predictions (result[0]); it is
    # kept in memory only and is not written to disk.
    df_mse = pd.DataFrame({f'Run_{i+1}': result[0] for i, result in enumerate(results)})
    # df_beta holds the fitted weights (result[2]).
    df_beta = pd.DataFrame({f'Run_{i+1}': result[2] for i, result in enumerate(results)})

    # Save results for this dataset size
    df_beta.to_csv(beta_path, index=False)

    print(f"Finished processing dataset size {size}.")