In [1]:
import afqinsight.nn.tf_models as nn
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from afqinsight.datasets import AFQDataset
from afqinsight.nn.tf_models import cnn_lenet, mlp4, cnn_vgg, lstm1v0, lstm1, lstm2, blstm1, blstm2, lstm_fcn, cnn_resnet
from sklearn.impute import SimpleImputer
import os.path
# Harmonization
from sklearn.model_selection import train_test_split
from neurocombat_sklearn import CombatModel
import pandas as pd
from sklearn.utils import shuffle, resample
from afqinsight.augmentation import jitter, time_warp, scaling
import tempfile

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the dataset from the node-level tract profiles and the subject table.
# Three targets are kept per subject (age, QC score, scan site); the scan site
# is passed through `label_encode_cols`.
afq_dataset = AFQDataset.from_files(
    fn_nodes="../data/raw/combined_tract_profiles.csv",
    fn_subjects="../data/raw/participants_updated_id.csv",
    index_col="subject_id",
    dwi_metrics=["dki_fa", "dki_md", "dki_mk"],
    target_cols=["age", "dl_qc_score", "scan_site_id"],
    label_encode_cols=["scan_site_id"],
)

In [3]:
# Drop subjects with missing target values. The return value is discarded, so
# this presumably modifies `afq_dataset` in place -- re-running cells above
# this one requires reloading the dataset first.
afq_dataset.drop_target_na()

In [4]:
# Report the number of subjects and the feature / target matrix shapes.
for size in (len(afq_dataset.subjects), afq_dataset.X.shape, afq_dataset.y.shape):
    print(size)

1865
(1865, 7200)
(1865, 3)


In [5]:
# Materialise the whole dataset as a list of NumPy samples.
tf_dataset = afq_dataset.as_tensorflow_dataset()
full_dataset = list(tf_dataset.as_numpy_iterator())

In [6]:
# Each sample is indexed as (features, targets); targets are ordered as
# requested in `target_cols`: age, QC score, scan site.
X = np.stack([sample[0] for sample in full_dataset], axis=0)
sample_targets = [sample[1] for sample in full_dataset]
y = np.array([targets[0] for targets in sample_targets])
qc = np.array([targets[1] for targets in sample_targets])
site = np.array([targets[2] for targets in sample_targets])

In [7]:
# Keep only samples with a positive QC score. `qc` itself is left unfiltered,
# so this cell must not be re-run without first re-running the cell above.
passes_qc = qc > 0
X, y, site = X[passes_qc], y[passes_qc], site[passes_qc]

In [8]:
# Upper bound on training epochs; early stopping normally ends training sooner.
n_epochs = 1000

# Stop once validation loss has not improved by at least 0.001 for 100 epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", mode="min", min_delta=0.001, patience=100
)

# Halve the learning rate after 20 epochs without validation-loss improvement.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss", factor=0.5, patience=20, verbose=1
)

In [9]:
def augment_this(X, y, rounds=2):
    """Extend a dataset with randomly perturbed copies of itself.

    Every round builds one perturbed copy of ``X`` by passing each channel
    (last axis) through ``jitter``, ``scaling`` and ``time_warp`` in that
    order, then appends the copy to the output together with an unmodified
    copy of ``y``.

    Parameters
    ----------
    X : array
        Input data; the last axis is treated as the channel axis.
    y : array
        Targets aligned with the first axis of ``X``.
    rounds : int, default 2
        Number of perturbed copies to append.

    Returns
    -------
    tuple
        ``(new_X, new_y)``: the original data followed by ``rounds``
        perturbed copies, and the targets repeated to match.
    """
    out_X, out_y = X[:], y[:]
    for _ in range(rounds):
        perturbed = np.zeros_like(X)
        n_channels = perturbed.shape[-1]
        for ch in range(n_channels):
            # Add a trailing singleton axis (stripped again below).
            profile = X[..., ch][..., np.newaxis]
            # sigma is re-derived from the current mean before each step.
            for transform in (jitter, scaling, time_warp):
                profile = transform(profile, sigma=np.mean(profile) / 25)
            perturbed[..., ch] = profile[..., 0]
        out_X = np.concatenate([out_X, perturbed])
        out_y = np.concatenate([out_y, y])
    return out_X, out_y

In [10]:
# Generate evaluation results and correlation coefficients combined in a dataframe, and history
def single_cross_site(model_name, name_str, lr,
                      site_1, site_2, site_3, X, y):
    """Train a model on one site and evaluate it on held-out data from three sites.

    Relies on module-level names defined elsewhere in the notebook: ``site``
    (per-sample site labels aligned with ``X`` and ``y``), ``n_epochs``,
    ``early_stopping`` and ``reduce_lr``.

    Parameters
    ----------
    model_name : callable
        Model factory, called as ``model_name(input_shape=(100, 72),
        n_classes=1, output_activation=None, verbose=True)``.
    name_str : str
        Label stored in the ``Model`` column; also the stem of the CSV
        training log (``name_str + '.csv'``, opened in append mode).
    lr : float
        Learning rate passed to the Adam optimizer.
    site_1
        Site label used for training; its held-out 20% is the first test set.
    site_2, site_3
        Site labels used for testing only (a 20% split of each).
    X, y : array
        Features and targets indexed by sample on the first axis; the last
        axis of ``X`` is treated as the channel axis during imputation.

    Returns
    -------
    df : pandas.DataFrame
        Twelve rows (MSE, RMSE, MAE and ``coef`` for each of the three test
        sites). ``coef`` is the squared Pearson correlation between true and
        predicted targets.
    history
        The object returned by ``model.fit``.
    """
    def _impute_per_channel(data):
        # Median-impute every channel on its own; the imputer is fitted on
        # the very array it transforms (train and test are imputed separately).
        imputer = SimpleImputer(strategy="median")
        return np.concatenate(
            [imputer.fit_transform(data[..., ii])[:, :, None]
             for ii in range(data.shape[-1])], -1)

    # Split the data by sites
    X_1, y_1 = X[site == site_1], y[site == site_1]
    X_2, y_2 = X[site == site_2], y[site == site_2]
    X_3, y_3 = X[site == site_3], y[site == site_3]

    # Split the data into train and test sets; only site_1 contributes
    # training data, the other two sites keep just their test portion.
    X_train, X_test1, y_train, y_test1 = train_test_split(X_1, y_1, test_size=0.2)
    _, X_test2, _, y_test2 = train_test_split(X_2, y_2, test_size=0.2)
    _, X_test3, _, y_test3 = train_test_split(X_3, y_3, test_size=0.2)

    # Impute train and test separately:
    X_train = _impute_per_channel(X_train)
    X_test1 = _impute_per_channel(X_test1)
    X_test2 = _impute_per_channel(X_test2)
    X_test3 = _impute_per_channel(X_test3)

    model = model_name(input_shape=(100, 72), n_classes=1, output_activation=None, verbose=True)
    model.compile(loss='mean_squared_error',
                  optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
                  metrics=['mean_squared_error',
                           tf.keras.metrics.RootMeanSquaredError(name='rmse'),
                           'mean_absolute_error'])

    # Keep the best-weights checkpoint in a private temporary directory that
    # is removed as soon as the weights have been reloaded, so no stray .h5
    # file is left behind after each call.
    with tempfile.TemporaryDirectory() as ckpt_dir:
        ckpt_filepath = os.path.join(ckpt_dir, 'best_weights.h5')
        # ModelCheckpoint
        ckpt = tf.keras.callbacks.ModelCheckpoint(
            filepath=ckpt_filepath,
            monitor="val_loss",
            verbose=1,
            save_best_only=True,
            save_weights_only=True,
            mode="auto",
        )
        # CSVLogger
        log = tf.keras.callbacks.CSVLogger(filename=(name_str + '.csv'), append=True)
        callbacks = [early_stopping, ckpt, reduce_lr, log]
        history = model.fit(X_train, y_train, epochs=n_epochs, batch_size=128,
                            validation_split=0.2, callbacks=callbacks)
        # Evaluate with the best checkpointed weights, not the last epoch's.
        model.load_weights(ckpt_filepath)

    test_sets = [(X_test1, y_test1), (X_test2, y_test2), (X_test3, y_test3)]

    # Squared Pearson correlation between true and predicted targets.
    coefs = []
    for X_te, y_te in test_sets:
        y_pred = model.predict(X_te).reshape(y_te.shape)
        coefs.append(np.corrcoef(y_te, y_pred)[0, 1] ** 2)

    # evaluate() returns [loss, MSE, RMSE, MAE]; entries 1-3 are reported.
    evals = [model.evaluate(X_te, y_te) for X_te, y_te in test_sets]

    values = []
    for scores, coef in zip(evals, coefs):
        values += [scores[1], scores[2], scores[3], coef]

    result = {'Model': [name_str] * 12,
              'Train_site': [site_1] * 12,
              'Test_site': [site_1] * 4 + [site_2] * 4 + [site_3] * 4,
              'Metric': ['MSE', 'RMSE', 'MAE', 'coef'] * 3,
              'Value': values}
    df = pd.DataFrame(result)
    return df, history

In [11]:
# Generate evaluation results and correlation coefficients combined in a dataframe, and history
def double_cross_site(model_name, name_str, lr,
                      site_1, site_2, site_3, X, y):
    """Train a model on two sites and evaluate it on held-out data from three sites.

    Equal-sized subsamples of the site_1 and site_2 training splits are
    pooled, shuffled, augmented with ``augment_this`` and shuffled again
    before fitting.

    Relies on module-level names defined elsewhere in the notebook: ``site``
    (per-sample site labels aligned with ``X`` and ``y``), ``n_epochs``,
    ``early_stopping``, ``reduce_lr`` and ``augment_this``.

    Parameters
    ----------
    model_name : callable
        Model factory, called as ``model_name(input_shape=(100, 72),
        n_classes=1, output_activation=None, verbose=True)``.
    name_str : str
        Label stored in the ``Model`` column; also the stem of the CSV
        training log (``name_str + '.csv'``, opened in append mode).
    lr : float
        Learning rate passed to the Adam optimizer.
    site_1, site_2
        Site labels used for training; a held-out 20% of each is also tested.
    site_3
        Site label used for testing only (a 20% split).
    X, y : array
        Features and targets indexed by sample on the first axis; the last
        axis of ``X`` is treated as the channel axis during imputation.

    Returns
    -------
    df : pandas.DataFrame
        Twelve rows (MSE, RMSE, MAE and ``coef`` for each of the three test
        sites). ``Train_site`` holds the string ``'{site_1}, {site_2}'``;
        ``coef`` is the squared Pearson correlation between true and
        predicted targets.
    history
        The object returned by ``model.fit``.
    """
    def _impute_per_channel(data):
        # Median-impute every channel on its own; the imputer is fitted on
        # the very array it transforms (train and test are imputed separately).
        imputer = SimpleImputer(strategy="median")
        return np.concatenate(
            [imputer.fit_transform(data[..., ii])[:, :, None]
             for ii in range(data.shape[-1])], -1)

    # Split the data by sites
    X_1, y_1 = X[site == site_1], y[site == site_1]
    X_2, y_2 = X[site == site_2], y[site == site_2]
    X_3, y_3 = X[site == site_3], y[site == site_3]

    # Split the data into train and test sets; site_3 keeps only its test part.
    X_train1, X_test1, y_train1, y_test1 = train_test_split(X_1, y_1, test_size=0.2)
    X_train2, X_test2, y_train2, y_test2 = train_test_split(X_2, y_2, test_size=0.2)
    _, X_test3, _, y_test3 = train_test_split(X_3, y_3, test_size=0.2)

    # Impute train and test separately:
    X_train1 = _impute_per_channel(X_train1)
    X_train2 = _impute_per_channel(X_train2)
    X_test1 = _impute_per_channel(X_test1)
    X_test2 = _impute_per_channel(X_test2)
    X_test3 = _impute_per_channel(X_test3)

    # size down evenly
    # NOTE(review): the per-site sample count is taken from site_3's *test*
    # split, and site_3's training split is otherwise unused -- confirm this
    # is the intended training-set size.
    sample = y_test3.shape[0] // 2
    sample1 = resample(X_train1, y_train1, n_samples=sample, replace=False)
    sample2 = resample(X_train2, y_train2, n_samples=sample, replace=False)
    X_train = np.concatenate((sample1[0], sample2[0]), axis=0)
    y_train = np.concatenate((sample1[1], sample2[1]), axis=0)
    # shuffle
    X_train, y_train = shuffle(X_train, y_train)
    X_train, y_train = augment_this(X_train, y_train)
    X_train, y_train = shuffle(X_train, y_train)

    model = model_name(input_shape=(100, 72), n_classes=1, output_activation=None, verbose=True)
    model.compile(loss='mean_squared_error',
                  optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
                  metrics=['mean_squared_error',
                           tf.keras.metrics.RootMeanSquaredError(name='rmse'),
                           'mean_absolute_error'])

    # Keep the best-weights checkpoint in a private temporary directory that
    # is removed as soon as the weights have been reloaded, so no stray .h5
    # file is left behind after each call.
    with tempfile.TemporaryDirectory() as ckpt_dir:
        ckpt_filepath = os.path.join(ckpt_dir, 'best_weights.h5')
        # ModelCheckpoint
        ckpt = tf.keras.callbacks.ModelCheckpoint(
            filepath=ckpt_filepath,
            monitor="val_loss",
            verbose=1,
            save_best_only=True,
            save_weights_only=True,
            mode="auto",
        )
        # CSVLogger
        log = tf.keras.callbacks.CSVLogger(filename=(name_str + '.csv'), append=True)
        callbacks = [early_stopping, ckpt, reduce_lr, log]
        # NOTE(review): validation_split is applied after augmentation and
        # shuffling, so the validation subset can contain perturbed copies of
        # training samples -- confirm this is acceptable for early stopping
        # and checkpoint selection.
        history = model.fit(X_train, y_train, epochs=n_epochs, batch_size=128,
                            validation_split=0.2, callbacks=callbacks)
        # Evaluate with the best checkpointed weights, not the last epoch's.
        model.load_weights(ckpt_filepath)

    test_sets = [(X_test1, y_test1), (X_test2, y_test2), (X_test3, y_test3)]

    # Squared Pearson correlation between true and predicted targets.
    coefs = []
    for X_te, y_te in test_sets:
        y_pred = model.predict(X_te).reshape(y_te.shape)
        coefs.append(np.corrcoef(y_te, y_pred)[0, 1] ** 2)

    # evaluate() returns [loss, MSE, RMSE, MAE]; entries 1-3 are reported.
    evals = [model.evaluate(X_te, y_te) for X_te, y_te in test_sets]

    values = []
    for scores, coef in zip(evals, coefs):
        values += [scores[1], scores[2], scores[3], coef]

    result = {'Model': [name_str] * 12,
              'Train_site': [f'{site_1}, {site_2}'] * 12,
              'Test_site': [site_1] * 4 + [site_2] * 4 + [site_3] * 4,
              'Metric': ['MSE', 'RMSE', 'MAE', 'coef'] * 3,
              'Value': values}
    df = pd.DataFrame(result)
    return df, history

### cnn_resnet

#### single-cross-site

In [12]:
# One run per training site; the first site label is the training site and
# all three sites are used for testing.
df_resnet1, history_resnet1 = single_cross_site(
    cnn_resnet, 'cnn_resnet', 0.01, site_1=0, site_2=3, site_3=4, X=X, y=y)
df_resnet2, history_resnet2 = single_cross_site(
    cnn_resnet, 'cnn_resnet', 0.01, site_1=3, site_2=0, site_3=4, X=X, y=y)
df_resnet3, history_resnet3 = single_cross_site(
    cnn_resnet, 'cnn_resnet', 0.01, site_1=4, site_2=0, site_3=3, X=X, y=y)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 100, 72)]    0           []                               
                                                                                                  
 conv1d (Conv1D)                (None, 100, 64)      36928       ['input_1[0][0]']                
                                                                                                  
 batch_normalization (BatchNorm  (None, 100, 64)     256         ['conv1d[0][0]']                 
 alization)                                                                                       
                                                                                                  
 activation (Activation)        (None, 100, 64)      0           ['batch_normalization[0][0]']

In [13]:
# Stack the three single-site result tables row-wise. The frames share the
# same columns, so concatenation is the direct operation; an outer merge would
# instead join on every column (including the float `Value`) and leaves the
# row order up to the merge implementation.
df_resnet_1 = pd.concat([df_resnet1, df_resnet2, df_resnet3], ignore_index=True)

#### double-cross-site

In [14]:
# One run per pair of training sites; the third site label is test-only.
# NOTE(review): `hisory_*` looks like a typo for `history_*`; the names are
# kept unchanged in case later cells refer to them.
df_resnet4, hisory_resnet4 = double_cross_site(
    cnn_resnet, 'cnn_resnet', 0.01, site_1=3, site_2=4, site_3=0, X=X, y=y)
df_resnet5, hisory_resnet5 = double_cross_site(
    cnn_resnet, 'cnn_resnet', 0.01, site_1=0, site_2=4, site_3=3, X=X, y=y)
df_resnet6, hisory_resnet6 = double_cross_site(
    cnn_resnet, 'cnn_resnet', 0.01, site_1=0, site_2=3, site_3=4, X=X, y=y)

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 100, 72)]    0           []                               
                                                                                                  
 conv1d_33 (Conv1D)             (None, 100, 64)      36928       ['input_4[0][0]']                
                                                                                                  
 batch_normalization_36 (BatchN  (None, 100, 64)     256         ['conv1d_33[0][0]']              
 ormalization)                                                                                    
                                                                                                  
 activation_36 (Activation)     (None, 100, 64)      0           ['batch_normalization_36[0]

In [15]:
# Stack the three double-site result tables row-wise. The frames share the
# same columns, so concatenation is the direct operation; an outer merge would
# instead join on every column (including the float `Value`) and leaves the
# row order up to the merge implementation.
df_resnet_2 = pd.concat([df_resnet4, df_resnet5, df_resnet6], ignore_index=True)

#### cost0

In [16]:
# MAE rows of the single-site runs that were evaluated on site 0.
resnet_cost0 = df_resnet_1.loc[
    df_resnet_1['Metric'].eq('MAE') & df_resnet_1["Test_site"].eq(0)
]

In [17]:
# Show the site-0 MAE for each single training site.
resnet_cost0

Unnamed: 0,Model,Train_site,Test_site,Metric,Value
2,cnn_resnet,0,0,MAE,1.710442
18,cnn_resnet,3,0,MAE,3.654546
30,cnn_resnet,4,0,MAE,2.705386


In [18]:
# Increase in site-0 MAE when training on site 3 (resp. site 4) instead of
# on site 0 itself.
resnet_cost0_3 = (resnet_cost0.loc[resnet_cost0['Train_site'].eq(3), 'Value'].values
                  - resnet_cost0.loc[resnet_cost0['Train_site'].eq(0), 'Value'].values)
resnet_cost0_4 = (resnet_cost0.loc[resnet_cost0['Train_site'].eq(4), 'Value'].values
                  - resnet_cost0.loc[resnet_cost0['Train_site'].eq(0), 'Value'].values)

In [19]:
# Show both cross-site MAE differences for test site 0.
resnet_cost0_3, resnet_cost0_4

(array([1.94410467]), array([0.9949441]))

In [20]:
# Average of the two single-site cross-site MAE differences for test site 0.
resnet_cost0_mean = np.mean([resnet_cost0_3, resnet_cost0_4])

In [21]:
# MAE rows of the double-site runs that were evaluated on site 0.
resnet_cost0_0 = df_resnet_2.loc[
    df_resnet_2['Metric'].eq('MAE') & df_resnet_2["Test_site"].eq(0)
]

In [22]:
# Show the site-0 MAE for each pair of training sites.
resnet_cost0_0

Unnamed: 0,Model,Train_site,Test_site,Metric,Value
10,cnn_resnet,"3, 4",0,MAE,3.448604
14,cnn_resnet,"0, 4",0,MAE,2.028689
26,cnn_resnet,"0, 3",0,MAE,2.582357


In [23]:
# Increase in site-0 MAE when training on sites 3 and 4 together instead of
# on site 0 itself (single-site baseline).
resnet_cost0_34 = (resnet_cost0_0.loc[resnet_cost0_0['Train_site'].eq('3, 4'), 'Value'].values
                   - resnet_cost0.loc[resnet_cost0['Train_site'].eq(0), 'Value'].values)

In [24]:
# Compare the mean single-site difference with the double-site difference for test site 0.
resnet_cost0_mean, resnet_cost0_34

(1.4695243835449219, array([1.73816276]))

#### cost3

In [25]:
# MAE rows of the single-site runs that were evaluated on site 3.
resnet_cost3 = df_resnet_1.loc[
    df_resnet_1['Metric'].eq('MAE') & df_resnet_1["Test_site"].eq(3)
]

In [26]:
# Increase in site-3 MAE when training on site 0 (resp. site 4) instead of
# on site 3 itself.
resnet_cost3_0 = (resnet_cost3.loc[resnet_cost3['Train_site'].eq(0), 'Value'].values
                  - resnet_cost3.loc[resnet_cost3['Train_site'].eq(3), 'Value'].values)
resnet_cost3_4 = (resnet_cost3.loc[resnet_cost3['Train_site'].eq(4), 'Value'].values
                  - resnet_cost3.loc[resnet_cost3['Train_site'].eq(3), 'Value'].values)

In [27]:
# Show both cross-site MAE differences for test site 3.
resnet_cost3_0, resnet_cost3_4

(array([0.71193767]), array([1.04738188]))

In [28]:
# Average of the two single-site cross-site MAE differences for test site 3.
resnet_cost3_mean = np.mean([resnet_cost3_0, resnet_cost3_4])

In [29]:
# MAE rows of the double-site runs that were evaluated on site 3.
resnet_cost3_3 = df_resnet_2.loc[
    df_resnet_2['Metric'].eq('MAE') & df_resnet_2["Test_site"].eq(3)
]

In [30]:
# Show the site-3 MAE for each pair of training sites.
resnet_cost3_3

Unnamed: 0,Model,Train_site,Test_site,Metric,Value
2,cnn_resnet,"3, 4",3,MAE,2.09307
22,cnn_resnet,"0, 4",3,MAE,2.882586
30,cnn_resnet,"0, 3",3,MAE,3.038788


In [31]:
# Increase in site-3 MAE when training on sites 0 and 4 together instead of
# on site 3 itself (single-site baseline).
resnet_cost3_04 = (resnet_cost3_3.loc[resnet_cost3_3['Train_site'].eq('0, 4'), 'Value'].values
                   - resnet_cost3.loc[resnet_cost3['Train_site'].eq(3), 'Value'].values)

In [32]:
# Compare the mean single-site difference with the double-site difference for test site 3.
resnet_cost3_mean, resnet_cost3_04

(0.8796597719192505, array([0.83564425]))

#### cost4

In [33]:
# MAE rows of the single-site runs that were evaluated on site 4.
resnet_cost4 = df_resnet_1.loc[
    df_resnet_1['Metric'].eq('MAE') & df_resnet_1["Test_site"].eq(4)
]

In [34]:
# Show the site-4 MAE for each single training site.
resnet_cost4

Unnamed: 0,Model,Train_site,Test_site,Metric,Value
10,cnn_resnet,0,4,MAE,2.44462
22,cnn_resnet,3,4,MAE,2.514535
26,cnn_resnet,4,4,MAE,2.207601


In [35]:
# Increase in site-4 MAE when training on site 0 (resp. site 3) instead of
# on site 4 itself.
resnet_cost4_0 = (resnet_cost4.loc[resnet_cost4['Train_site'].eq(0), 'Value'].values
                  - resnet_cost4.loc[resnet_cost4['Train_site'].eq(4), 'Value'].values)
resnet_cost4_3 = (resnet_cost4.loc[resnet_cost4['Train_site'].eq(3), 'Value'].values
                  - resnet_cost4.loc[resnet_cost4['Train_site'].eq(4), 'Value'].values)

In [36]:
# Average of the two single-site cross-site MAE differences for test site 4.
resnet_cost4_mean = np.mean([resnet_cost4_0, resnet_cost4_3])

In [37]:
# MAE rows of the double-site runs that were evaluated on site 4.
resnet_cost4_4 = df_resnet_2.loc[
    df_resnet_2['Metric'].eq('MAE') & df_resnet_2["Test_site"].eq(4)
]

In [38]:
# Show the site-4 MAE for each pair of training sites.
resnet_cost4_4

Unnamed: 0,Model,Train_site,Test_site,Metric,Value
6,cnn_resnet,"3, 4",4,MAE,1.969359
18,cnn_resnet,"0, 4",4,MAE,2.081152
34,cnn_resnet,"0, 3",4,MAE,2.765437


In [39]:
# Increase in site-4 MAE when training on sites 0 and 3 together instead of
# on site 4 itself (single-site baseline).
resnet_cost4_03 = (resnet_cost4_4.loc[resnet_cost4_4['Train_site'].eq('0, 3'), 'Value'].values
                   - resnet_cost4.loc[resnet_cost4['Train_site'].eq(4), 'Value'].values)

In [40]:
# Compare the mean single-site difference with the double-site difference for test site 4.
resnet_cost4_mean, resnet_cost4_03

(0.2719759941101074, array([0.55783606]))