In [1]:
import afqinsight.nn.tf_models as nn
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from afqinsight.datasets import AFQDataset
from sklearn.impute import SimpleImputer
import os.path
# Harmonization
from sklearn.model_selection import train_test_split
from neurocombat_sklearn import CombatModel
import pickle

from xgboost import XGBRegressor
from skopt import BayesSearchCV


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the dataset from the tract-profile (nodes) CSV and the participants
# (subjects) CSV, restricted to three DKI metrics (FA, MD, MK).
# Rows are keyed by "subject_id". Three target columns are requested, in this
# order: age, dl_qc_score, scan_site_id.
# scan_site_id is passed as a label-encoded column -- presumably so that it is
# returned as integer codes rather than strings; confirm against afqinsight.
afq_dataset = AFQDataset.from_files(
    fn_nodes="../data/raw/combined_tract_profiles.csv",
    fn_subjects="../data/raw/participants_updated_id.csv",
    dwi_metrics=["dki_fa", "dki_md", "dki_mk"],
    index_col="subject_id",
    target_cols=["age", "dl_qc_score", "scan_site_id"],
    label_encode_cols=["scan_site_id"]
)

In [3]:
# The return value is discarded, so this is relied on to modify afq_dataset
# in place. Presumably it removes subjects whose target values are missing
# -- confirm against the afqinsight AFQDataset documentation.
afq_dataset.drop_target_na()

In [4]:
# Sanity check: subject count, feature-matrix shape and target-matrix shape,
# one per line.
print(
    len(afq_dataset.subjects),
    afq_dataset.X.shape,
    afq_dataset.y.shape,
    sep="\n",
)

1865
(1865, 7200)
(1865, 3)


In [5]:
# Materialise the TensorFlow view of the dataset as an in-memory list of
# NumPy samples so that it can be indexed repeatedly below.
full_dataset = [
    sample for sample in afq_dataset.as_tensorflow_dataset().as_numpy_iterator()
]

In [6]:
# Each sample is indexed as (features, targets). Stack the per-subject feature
# arrays along a new leading axis and split the targets into one vector each.
# Target positions follow the target_cols order requested when loading:
# 0 -> age, 1 -> dl_qc_score, 2 -> scan_site_id.
X = np.stack([sample[0] for sample in full_dataset], axis=0)
y = np.array([sample[1][0] for sample in full_dataset])
qc = np.array([sample[1][1] for sample in full_dataset])
site = np.array([sample[1][2] for sample in full_dataset])

In [7]:
# Keep only subjects whose QC score is strictly positive.
# The mask is computed once and applied to every aligned array, including
# `qc` itself: that keeps all four arrays the same length, so re-running this
# cell is a harmless no-op instead of failing with a boolean-index length
# mismatch (previously X/y/site shrank while qc kept its original length).
qc_pass = qc > 0
X, y, site, qc = X[qc_pass], y[qc_pass], site[qc_pass], qc[qc_pass]

In [8]:
# Keras training settings.
# NOTE(review): none of the cells visible in this notebook reads `n_epochs`,
# `early_stopping` or `reduce_lr` (the model fitted below is XGBoost) --
# confirm whether this cell is still needed.
n_epochs = 1000

# Stop once the validation loss has failed to improve by at least 0.001
# for 100 consecutive epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(
    patience=100,
    min_delta=0.001,
    monitor="val_loss",
    mode="min",
)

# Halve the learning rate whenever the validation loss has plateaued
# for 20 epochs, reporting each reduction.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    patience=20,
    factor=0.5,
    monitor="val_loss",
    verbose=1,
)

In [9]:
from afqinsight.augmentation import jitter, time_warp, scaling

In [10]:
def augment_this(X, y, rounds=2):
    """Append `rounds` randomly perturbed copies of `X` to the original data.

    Every channel (last axis of `X`) is perturbed on its own by applying
    `jitter`, `scaling` and `time_warp` in that order. The `sigma` handed to
    each transform is the mean of the data *entering that transform*
    divided by 25, so it is recomputed after every step.

    Parameters
    ----------
    X : array
        Input data; the last axis is iterated as channels and each channel
        is passed to the transforms with a trailing singleton axis.
    y : array
        Targets aligned with the first axis of `X`; repeated unchanged once
        per augmentation round.
    rounds : int
        Number of augmented copies to append (0 returns the inputs as-is).

    Returns
    -------
    (out_X, out_y)
        The original data followed by `rounds` augmented copies, and the
        targets repeated to match.
    """
    out_X = X[:]
    out_y = y[:]
    for _ in range(rounds):
        perturbed = np.zeros_like(X)
        for ch in range(perturbed.shape[-1]):
            # Keep a trailing singleton axis so the transforms see one channel.
            profile = X[..., ch][..., np.newaxis]
            for transform in (jitter, scaling, time_warp):
                profile = transform(profile, sigma=np.mean(profile) / 25)
            perturbed[..., ch] = profile[..., 0]
        out_X = np.concatenate([out_X, perturbed])
        out_y = np.concatenate([out_y, y])
    return out_X, out_y

In [11]:
from sklearn.utils import shuffle, resample

In [12]:
from sklearn.metrics import r2_score, median_absolute_error, mean_absolute_error

In [13]:
# Tune an XGBoost regressor on one train/test split and return its
# test-set error metrics.
def fit_model(X, y, random_state, augment=True, site_labels=None):
    """Fit a Bayesian-optimised XGBoost regressor on one random split.

    Pipeline: 80/20 train/test split -> per-channel median imputation ->
    per-channel ComBat harmonisation by site -> optional augmentation of the
    training split -> flatten to 2-D -> `BayesSearchCV` over an
    `XGBRegressor` -> score on the held-out split.

    Parameters
    ----------
    X : array
        Features; the last axis is iterated as channels and every
        ``X[..., c]`` slice is passed to the imputer as a 2-D matrix.
    y : array
        Regression target aligned with the first axis of `X`.
    random_state : int
        Seed for the train/test split; also used for the post-augmentation
        shuffle and the hyper-parameter search so a run can be reproduced.
    augment : bool
        If true, extend the training split with 6 rounds of `augment_this`.
    site_labels : array, optional
        Per-subject site codes aligned with `X`. When omitted, the
        notebook-level `site` array is used (the original behaviour).

    Returns
    -------
    list
        ``[mean absolute error, median absolute error, R^2]`` on the test split.
    """
    if site_labels is None:
        # Backward compatible: earlier callers relied on the global `site`.
        site_labels = site

    # Split the data into train and test sets:
    X_train, X_test, y_train, y_test, site_train, site_test = train_test_split(
        X, y, site_labels, test_size=0.2, random_state=random_state
    )
    channels = range(X_train.shape[-1])

    # Impute each channel with medians learned on the training split only and
    # reuse those medians for the test split, so the test data never drives
    # its own preprocessing.
    imputed_train, imputed_test = [], []
    for ch in channels:
        imputer = SimpleImputer(strategy="median")
        imputed_train.append(imputer.fit_transform(X_train[..., ch]))
        imputed_test.append(imputer.transform(X_test[..., ch]))
    X_train = np.stack(imputed_train, axis=-1)
    X_test = np.stack(imputed_test, axis=-1)

    # ComBat harmonisation, one model per channel and per split.
    # NOTE(review): the test split is still harmonised with a model fitted on
    # the test split itself. Fitting on train and calling `transform` on test
    # would be stricter, but presumably requires every test site to occur in
    # the training split -- confirm before changing.
    X_train = np.stack(
        [
            CombatModel().fit_transform(X_train[..., ch], site_train[:, None], None, None)
            for ch in channels
        ],
        axis=-1,
    )
    X_test = np.stack(
        [
            CombatModel().fit_transform(X_test[..., ch], site_test[:, None], None, None)
            for ch in channels
        ],
        axis=-1,
    )

    # Augment BEFORE flattening: augment_this slices the array per channel
    # and therefore needs the unflattened (..., channel) layout.
    if augment:
        X_train, y_train = augment_this(X_train, y_train, rounds=6)
        X_train, y_train = shuffle(X_train, y_train, random_state=random_state)

    # Reshape for xgboost: one flat feature vector per subject.
    X_train = X_train.reshape((X_train.shape[0], -1))
    X_test = X_test.reshape((X_test.shape[0], -1))

    # Search space for the Bayesian hyper-parameter optimisation.
    params = {
        "n_estimators": (50, 2001),
        "min_child_weight": (1, 11),
        "gamma": (0.01, 5.0, "log-uniform"),
        "eta": (0.005, 0.5, "log-uniform"),
        "subsample": (0.2, 1.0),
        "colsample_bytree": (0.2, 1.0),
        "max_depth": (2, 6),
    }
    xgb = XGBRegressor(
        objective="reg:squarederror",
        nthread=32,
        verbosity=1,
    )
    opt = BayesSearchCV(
        xgb,
        params,
        n_iter=100,
        random_state=random_state,  # make the search itself repeatable
    )
    opt.fit(X_train, y_train)

    y_pred = opt.predict(X_test)
    return [
        mean_absolute_error(y_test, y_pred),
        median_absolute_error(y_test, y_pred),
        r2_score(y_test, y_pred),
    ]

In [14]:
# Per-run evaluation metrics. A list, because the run loop appends one entry
# per run (a dict has no `append`).
results = []

In [15]:
# Number of independent train/test splits to evaluate.
n_runs = 10
# Passed to fit_model: whether to augment the training split before fitting.
augment = False

In [16]:
# One split seed per run, drawn from a generator with a fixed seed so the
# same sequence of train/test splits is produced after a kernel restart.
# Same distribution as before: |floor(1000 * standard normal)| as integers.
MASTER_SEED = 0
seed_rng = np.random.default_rng(MASTER_SEED)
seeds = np.abs(np.floor(seed_rng.standard_normal(n_runs) * 1000)).astype(int)

In [17]:
import tempfile

In [None]:
print("##################################################")
results = []

# Evaluate one split per seed. The full results list is re-pickled after
# every run, so an interrupted session still leaves the completed runs on disk.
for run_idx in range(n_runs):
    print(run_idx)
    results.append(
        fit_model(X, y, random_state=seeds[run_idx], augment=augment)
    )
    with open('results_xgboost.pickle', 'wb') as checkpoint:
        pickle.dump(results, checkpoint, protocol=pickle.HIGHEST_PROTOCOL)

##################################################
0


In [None]:
# debug