# Test Runner


### Imports

In [2]:
import os

# os.environ["OPENBLAS_NUM_THREADS"] = "1"
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pcntoolkit.dataio.norm_data import NormData
from pcntoolkit.normative_model.norm_conf import NormConf
from pcntoolkit.normative_model.norm_blr import NormBLR
from pcntoolkit.regression_model.blr.blr_conf import BLRConf
from pcntoolkit.normative_model.norm_factory import load_normative_model
from pcntoolkit.normative_model.norm_factory import create_normative_model
from pcntoolkit.regression_model.hbr.hbr_conf import HBRConf
from pcntoolkit.runner import Runner
from pcntoolkit.regression_model.hbr.param import Param

import seaborn as sns
import arviz as az


# Load data

First we download a small example dataset from github. Saving this dataset on your local device (under 'resources/data/fcon1000.csv' for example) saves time and bandwidth if you re-run this notebook.

First we download a small example dataset from github. Saving this dataset on your local device (under 'resources/data/fcon1000.csv' for example) saves time and bandwidth if you re-run this notebook.

In [3]:
# First download the dataset from github
# fcon=pd.read_csv("https://raw.githubusercontent.com/pcn-toolkit/pcn-toolkit/master/resources/data/fcon1000.csv")
data = pd.read_csv("resources/data/fcon1000.csv")

Plot the distribution of sex and site in the data.

In [4]:
data["sex "] = np.where(data["sex"] == 1, ["male"], ["female"])

Our HBR models will use random effects to model differences between sites. Because the random effects are best captured when there are enough samples of each effect in the data, we will have to remove some sites that are too small. We will filter out sites for which any of the sexes is represented by less than 10 samples

In [5]:
# Group the data by site and sex
site_counts = data.groupby(["site", "sex"]).size().reset_index(name="counts")  # type: ignore

# Get the sites with only one sex present
sex_count_per_site = site_counts["site"].value_counts()
sites_with_one_sex = sex_count_per_site[sex_count_per_site == 1]
sites_with_one_sex.index

# remove the sites with less than 10 samples
data = data[~data["site"].isin(sites_with_one_sex.index)]


# find the sites that have less than 10 samples
site_counts = site_counts[site_counts["counts"] < 10]

# remove the sites with less than 10 samples
data = data[~data["site"].isin(site_counts["site"])]


We find the seven largest sites, which we will use for train and transfer. Two of those are randomly selected for transfering later.

In [6]:
site_counts = data.groupby(["site"]).size().reset_index(name="counts")  # type: ignore
site_counts = site_counts.sort_values("counts", ascending=False)
site_counts = site_counts.head(7)

np.random.seed(45)
# randomly select 2 sites from the top 7 sites for transfering
transfer_sites = site_counts.sample(2)["site"]
transfer_data = data[data["site"].isin(transfer_sites)]

# The remaining sites are used for training the model
fit_sites = site_counts[~site_counts.isin(transfer_sites)]["site"]
fit_sites.dropna(inplace=True)
fit_data = data[data["site"].isin(fit_sites)]

Next, we load the data into `NormData` objects. All functions in the PCNtoolkit expect the data to be provided as instances of the `NormData` class. The class manages all preprocessing, basis expansions, and dimensions. 

In [7]:
covariates = ["age"]
batch_effects = ["sex", "site"]
response_vars = ["rh_MeanThickness_thickness", "WM-hypointensities"]

# Create a normdata object from the downloaded data
normdata = NormData.from_dataframe(
    name="fit",  # name of the dataset
    dataframe=fit_data,  # pandas dataframe
    covariates=covariates,
    batch_effects=batch_effects,
    response_vars=response_vars,
)

# Create a transfer data object from the downloaded data
transfer_data = NormData.from_dataframe(
    name="transfer",
    dataframe=transfer_data,
    covariates=covariates,
    batch_effects=batch_effects,
    response_vars=response_vars,
)

fit_data, predict_data = normdata.train_test_split(splits=(0.8, 0.2))


## Configure the normative model

The normative model will be configured using a `NormConf` object, containing save and log paths and the preprocessing configurations, and a `RegConf` object, specific to the regression model type. Our `NormConf` configuration contains canonical paths, a standardization step for both the input as as the output data, and a Bspline basis expansion.

In [8]:
# Create a NormConf object
norm_conf = NormConf(
    savemodel=True,
    saveresults=True,
    # save_dir="/project/3022000.05/projects/stijdboe/wdir/save_dir",
    save_dir="/Users/stijndeboer/Projects/PCN/PCNtoolkit/example_notebooks/save_dir",
    inscaler="none",
    outscaler="none",
    basis_function="bspline",
    order=3,
    nknots=5,
)

Configuration of normative model is valid.


## Configure the regression model

HBR models need to specificy (possibly recursive) parameter configurations. Here, we configure a HBR model with a SHASHb likelihood, a bspline regression in `mu` and `sigma`, and a random effect in the intercept of `mu`. Note that because sigma has to be strictly positive, we specify a `softplus` mapping, so that the output of the linear regression is mapped to the positive domain. 

In [9]:
blr_conf = BLRConf(
    intercept=True,
    random_intercept=False,
    heteroskedastic=False,
    intercept_var=False,
    n_iter=1000,
)

Configuration of regression model is valid.


## Combine normative and hbr conf in normative model
We can either use the NormHBR constructor, or the factory method to create a normative HBR model

In [10]:
# Using the constructor
norm_blr = NormBLR(norm_conf=norm_conf, reg_conf=blr_conf)

## Create a runner

In [11]:
runner = Runner(
    norm_blr,
    cross_validate=False,
    cv_folds=10,
    parallelize=False,
    job_type="local",
    n_jobs=2,
    # log_dir="/project/3022000.05/projects/stijdboe/wdir/log_dir",
    # temp_dir="/project/3022000.05/projects/stijdboe/wdir/temp_dir",
    log_dir="/Users/stijndeboer/Projects/PCN/PCNtoolkit/example_notebooks/log_dir",
    temp_dir="/Users/stijndeboer/Projects/PCN/PCNtoolkit/example_notebooks/temp_dir",
)

No python path specified. Using interpreter path of current process: /opt/anaconda3/envs/ptk_dev/bin/python


In [12]:
runner.fit_predict(fit_data, predict_data)

Going to fit and predict 2 models
Fitting and predicting model for rh_MeanThickness_thickness
Fitting and predicting model for WM-hypointensities
93103 Saving model to /Users/stijndeboer/Projects/PCN/PCNtoolkit/example_notebooks/save_dir
Computing zscores for rh_MeanThickness_thickness
Computing zscores for WM-hypointensities
Computing centiles for rh_MeanThickness_thickness
Computing centiles for WM-hypointensities
Computing centiles for rh_MeanThickness_thickness
Computing centiles for WM-hypointensities


  nll = -np.mean(y * np.log(yhat) + (1 - y) * np.log(1 - yhat))



All jobs completed!


In [13]:
new_nm = load_normative_model(os.path.join(norm_conf.save_dir, "folds", "fold_0"))


FileNotFoundError: Path /Users/stijndeboer/Projects/PCN/PCNtoolkit/example_notebooks/save_dir/folds/fold_0 does not exist.

In [14]:
print(os.listdir(os.path.join(norm_conf.save_dir, "folds", "fold_0")))

FileNotFoundError: [Errno 2] No such file or directory: '/Users/stijndeboer/Projects/PCN/PCNtoolkit/example_notebooks/save_dir/folds/fold_0'

In [25]:
print(norm_conf.save_dir)

/project/3022000.05/projects/stijdboe/wdir/save_dir
