In [10]:
import os

import numpy as np
import pandas as pd
import sklearn.model_selection

# Seed NumPy's global RNG so the np.random draws that generate the synthetic
# data in the next cell are reproducible from a fresh kernel.
np.random.seed(1)

In [11]:
# --- Synthetic data -------------------------------------------------------
# Dimensions of the toy problem.
N_samples = 10000
N_features = 2
N_covariates = 2

# NOTE: the order of the four np.random calls below determines the generated
# values (they share NumPy's global RNG), so it must not be changed.
X = np.random.randn(N_samples, N_covariates)
Y = np.random.randn(N_samples, N_features)
sites = np.random.choice(a=3, size=N_samples, replace=True) + 1  # values 1..3
sex = np.random.choice(a=2, size=N_samples, replace=True) + 1  # values 1..2
batch_effects = np.stack([sites, sex], axis=1)  # column 0: site, column 1: sex

# --- Original vs. transfer cohorts ----------------------------------------
# Site 3 is held out as the "transfer" cohort; every other site forms the
# "original" cohort.
site_column = batch_effects[:, 0]
original_data_mask = site_column != 3
transfer_data_mask = site_column == 3

X_or, Y_or, be_or = (
    X[original_data_mask],
    Y[original_data_mask],
    batch_effects[original_data_mask],
)
X_tr, Y_tr, be_tr = (
    X[transfer_data_mask],
    Y[transfer_data_mask],
    batch_effects[transfer_data_mask],
)

# --- Train / test split per cohort ----------------------------------------
# Both cohorts use the same 80/20 split settings and are stratified on the
# sex column (column 1 of the batch effects).
split_options = {"test_size": 0.2, "random_state": 1}

X_tr_or, X_ts_or, Y_tr_or, Y_ts_or, be_tr_or, be_ts_or = (
    sklearn.model_selection.train_test_split(
        X_or, Y_or, be_or, stratify=be_or[:, 1], **split_options
    )
)

X_tr_tr, X_ts_tr, Y_tr_tr, Y_ts_tr, be_tr_tr, be_ts_tr = (
    sklearn.model_selection.train_test_split(
        X_tr, Y_tr, be_tr, stratify=be_tr[:, 1], **split_options
    )
)


In [12]:
import pickle

# Save to file
# Scratch directory for the pickled inputs. It is derived from the current
# user's home directory instead of a hard-coded "/Users/<name>/temp", so the
# notebook runs on any machine (for the original author the path is unchanged).
tempdir = os.path.join(os.path.expanduser("~"), "temp")
os.makedirs(tempdir, exist_ok=True)

# File stem -> array. Each array is wrapped in a DataFrame and pickled to
# "<tempdir>/<stem>.pkl"; later cells pass these paths to pcntoolkit.
arrays_to_save = {
    # Original data
    "X_tr_or": X_tr_or,
    "Y_tr_or": Y_tr_or,
    "be_tr_or": be_tr_or,
    "X_ts_or": X_ts_or,
    "Y_ts_or": Y_ts_or,
    "be_ts_or": be_ts_or,
    # Transfer data
    "X_tr_tr": X_tr_tr,
    "Y_tr_tr": Y_tr_tr,
    "be_tr_tr": be_tr_tr,
    "X_ts_tr": X_ts_tr,
    "Y_ts_tr": Y_ts_tr,
    "be_ts_tr": be_ts_tr,
}

for stem, array in arrays_to_save.items():
    with open(os.path.join(tempdir, f"{stem}.pkl"), "wb") as f:
        pickle.dump(pd.DataFrame(array), f)


In [13]:
import pcntoolkit as ptk

# Scaling method applied to both covariates and responses; also read by the
# transfer call and by the scaling checks further down.
scaler = "minmax"

# estimate() file argument -> pickle written to `tempdir` for the original cohort.
original_files = {
    "covfile": "X_tr_or.pkl",
    "respfile": "Y_tr_or.pkl",
    "trbefile": "be_tr_or.pkl",
    "testcov": "X_ts_or.pkl",
    "testresp": "Y_ts_or.pkl",
    "tsbefile": "be_ts_or.pkl",
}

# Fit the HBR normative model on the original cohort and save it.
ptk.normative.estimate(
    **{
        argument: os.path.join(tempdir, filename)
        for argument, filename in original_files.items()
    },
    inscaler=scaler,
    outscaler=scaler,
    savemodel=True,
    alg="hbr",
)


inscaler: minmax
outscaler: minmax
Processing data in /Users/stijndeboer/temp/Y_tr_or.pkl
Estimating model  1 of 2


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag_grad...
Sequential sampling (1 chains in 1 job)
NUTS: [mu_slope_mu, sigma_slope_mu, offset_slope_mu, mu_intercept_mu, sigma_intercept_mu, offset_intercept_mu, mu_sigma, sigma_sigma, sigma]


Output()

Sampling 1 chain for 500 tune and 1_000 draw iterations (500 + 1_000 draws total) took 71 seconds.
There were 3 divergences after tuning. Increase `target_accept` or reparameterize.
Only one chain was sampled, this makes it impossible to run some convergence checks
Sampling: [y_like]


Output()

Sampling: [y_like]


Output()

Normal


Estimating model  2 of 2


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag_grad...
Sequential sampling (1 chains in 1 job)
NUTS: [mu_slope_mu, sigma_slope_mu, offset_slope_mu, mu_intercept_mu, sigma_intercept_mu, offset_intercept_mu, mu_sigma, sigma_sigma, sigma]


Output()

Sampling 1 chain for 500 tune and 1_000 draw iterations (500 + 1_000 draws total) took 31 seconds.
There were 198 divergences after tuning. Increase `target_accept` or reparameterize.
Only one chain was sampled, this makes it impossible to run some convergence checks
Sampling: [y_like]


Output()

Sampling: [y_like]


Output()

Normal


Saving model meta-data...
Evaluating the model ...
Writing outputs ...


In [14]:
# The metadata-loading cell further down reads "Models/meta_data.md" and
# "Models/transfer/meta_data.md" relative to the working directory. Resolve the
# same directories here instead of hard-coding one user's checkout path, so
# both cells point at the same place on any machine.
# NOTE(review): this assumes the notebook is run from the directory that
# contains "Models" (as the loading cell already does) — confirm.
models_dir = os.path.abspath("Models")
transfer_dir = os.path.join(models_dir, "transfer")

# Transfer the previously estimated HBR model to the held-out (transfer) cohort.
ptk.normative.transfer(
    covfile=os.path.join(tempdir, "X_tr_tr.pkl"),
    respfile=os.path.join(tempdir, "Y_tr_tr.pkl"),
    trbefile=os.path.join(tempdir, "be_tr_tr.pkl"),
    testcov=os.path.join(tempdir, "X_ts_tr.pkl"),
    testresp=os.path.join(tempdir, "Y_ts_tr.pkl"),
    tsbefile=os.path.join(tempdir, "be_ts_tr.pkl"),
    alg="hbr",
    inscaler=scaler,
    outscaler=scaler,
    model_path=models_dir,
    output_path=transfer_dir,
    outputsuffix="_transfer",
)


Loading data ...
Using HBR transform...
Transferring model  1 of 2


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag_grad...
Sequential sampling (1 chains in 1 job)
NUTS: [mu_slope_mu, sigma_slope_mu, offset_slope_mu, mu_intercept_mu, sigma_intercept_mu, offset_intercept_mu, mu_sigma, sigma_sigma, sigma]


Output()

Sampling 1 chain for 500 tune and 1_000 draw iterations (500 + 1_000 draws total) took 14 seconds.
There were 16 divergences after tuning. Increase `target_accept` or reparameterize.
Only one chain was sampled, this makes it impossible to run some convergence checks
Sampling: [y_like]


Output()

Using HBR transform...
Transferring model  2 of 2


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag_grad...
Sequential sampling (1 chains in 1 job)
NUTS: [mu_slope_mu, sigma_slope_mu, offset_slope_mu, mu_intercept_mu, sigma_intercept_mu, offset_intercept_mu, mu_sigma, sigma_sigma, sigma]


Output()

Sampling 1 chain for 500 tune and 1_000 draw iterations (500 + 1_000 draws total) took 12 seconds.
There were 10 divergences after tuning. Increase `target_accept` or reparameterize.
Only one chain was sampled, this makes it impossible to run some convergence checks
Sampling: [y_like]


Output()

Evaluating the model ...
Writing outputs ...


(array([[-0.02886982, -0.03568566],
        [-0.03630813,  0.0236544 ],
        [ 0.06204489, -0.00776436],
        ...,
        [ 0.07300818, -0.02742723],
        [-0.05786122, -0.07673008],
        [-0.04668915, -0.01009208]]),
 array([[1.04617948, 0.92042474],
        [0.98979401, 0.92901113],
        [1.07487888, 0.97092527],
        ...,
        [1.00533863, 1.01534088],
        [0.99352799, 1.04575208],
        [1.02937279, 1.00084325]]),
 array([[ 1.29090397e+00, -2.48072835e-01],
        [-1.25139043e+00, -5.92822900e-01],
        [-1.10995884e-01, -1.29110422e-01],
        ...,
        [-1.59678931e+00, -1.18325424e-04],
        [-7.92115492e-01,  1.04373453e-01],
        [-2.08458127e+00, -1.49754190e-01]]))

In [16]:
# Locations of the pickled metadata, relative to the working directory.
# NOTE: pickle.load executes arbitrary code from the file; only load files
# produced by this notebook.
original_meta_path = os.path.join("Models", "meta_data.md")
transfer_meta_path = os.path.join("Models", "transfer", "meta_data.md")

# Metadata of the originally estimated model
with open(original_meta_path, "rb") as meta_file:
    meta_data = pickle.load(meta_file)

print(meta_data)

# Metadata of the transferred model
with open(transfer_meta_path, "rb") as meta_file:
    meta_data_transfer = pickle.load(meta_file)

print(meta_data_transfer)


{'valid_voxels': array([0, 1]), 'fold_num': 1, 'mean_resp': [array([0.01032056, 0.00951302])], 'std_resp': [array([1.01259045, 1.00453476])], 'scaler_cov': [<pcntoolkit.util.utils.scaler object at 0x30be80a10>], 'scaler_resp': [<pcntoolkit.util.utils.scaler object at 0x30bd585c0>], 'regressor': 'hbr', 'inscaler': 'minmax', 'outscaler': 'minmax', 'versions': {'Python': '3.12.0', 'pytensor': '2.26.3', 'PyMC': '5.18.2', 'PCNtoolkit': ''}}
{'valid_voxels': array([0, 1]), 'fold_num': 1, 'mean_resp': [array([0.01032056, 0.00951302])], 'std_resp': [array([1.01259045, 1.00453476])], 'scaler_cov': <pcntoolkit.util.utils.scaler object at 0x30432e600>, 'scaler_resp': <pcntoolkit.util.utils.scaler object at 0x30bfb7740>, 'regressor': 'hbr', 'inscaler': 'minmax', 'outscaler': 'minmax', 'versions': {'Python': '3.12.0', 'pytensor': '2.26.3', 'PyMC': '5.18.2', 'PCNtoolkit': ''}}


In [18]:
# Original model: the scalers are stored in one-element lists, so take entry 0.
original_cov_scaler = meta_data["scaler_cov"][0]
original_resp_scaler = meta_data["scaler_resp"][0]
X_tr_or_scaled = original_cov_scaler.transform(X_tr_or)
Y_tr_or_scaled = original_resp_scaler.transform(Y_tr_or)

# Transferred model: the scalers are stored directly (not wrapped in a list).
# They are applied to the original and transfer training rows stacked together.
transfer_cov_scaler = meta_data_transfer["scaler_cov"]
transfer_resp_scaler = meta_data_transfer["scaler_resp"]
X_train_combined = np.concatenate([X_tr_or, X_tr_tr], axis=0)
Y_train_combined = np.concatenate([Y_tr_or, Y_tr_tr], axis=0)
X_all_scaled = transfer_cov_scaler.transform(X_train_combined)
Y_all_scaled = transfer_resp_scaler.transform(Y_train_combined)


In [19]:
# Sanity checks: after scaling, each column of the training data should have
# the summary statistics the chosen scaler targets (within `atol`).
atol = 1e-3

# Kept so any other cell referring to these names still works. They are sized
# from X's column count, so the checks below compare against scalars instead:
# that broadcasts over any number of columns and no longer breaks when
# N_features != N_covariates.
zeros = np.zeros(X_tr_or_scaled.shape[1])
ones = np.ones(X_tr_or_scaled.shape[1])

# Scaler name -> (column-wise statistic, expected value) pairs, in print order.
expected_statistics = {
    "standardize": (("mean", 0.0), ("std", 1.0)),
    "minmax": (("min", 0.0), ("max", 1.0)),
}

# Same order as before: original-scaler X, Y, then transfer-scaler X, Y.
scaled_arrays = (X_tr_or_scaled, Y_tr_or_scaled, X_all_scaled, Y_all_scaled)

# An unrecognised scaler name prints nothing, matching the previous behaviour.
for scaled in scaled_arrays:
    for statistic, expected in expected_statistics.get(scaler, ()):
        observed = getattr(scaled, statistic)(axis=0)
        print(np.allclose(observed, expected, atol=atol))


True
True
True
True
True
True
True
True
