In [1]:
import afqinsight.nn.tf_models as nn
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from afqinsight.datasets import AFQDataset
from afqinsight.nn.tf_models import cnn_lenet, mlp4, cnn_vgg, lstm1v0, lstm1, lstm2, blstm1, blstm2, lstm_fcn, cnn_resnet
from sklearn.impute import SimpleImputer
import os.path
# Harmonization
from sklearn.model_selection import train_test_split
from neurocombat_sklearn import CombatModel
import pandas as pd
from sklearn.utils import shuffle, resample
from afqinsight.augmentation import jitter, time_warp, scaling
import tempfile
from sklearn.metrics import r2_score, median_absolute_error, mean_absolute_error

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the dataset from the tract-profile (nodes) CSV and the participants CSV.
# Three diffusion metrics are requested as features; age, the QC score and the
# scan site are the target columns, with the scan site label-encoded.
afq_dataset = AFQDataset.from_files(
    fn_nodes="../data/raw/combined_tract_profiles.csv",
    fn_subjects="../data/raw/participants_updated_id.csv",
    dwi_metrics=["dki_fa", "dki_md", "dki_mk"],
    index_col="subject_id",
    target_cols=["age", "dl_qc_score", "scan_site_id"],
    label_encode_cols=["scan_site_id"]
)

In [3]:
# NOTE(review): presumably removes subjects with missing target values in place
# (the return value is discarded) -- confirm against the afqinsight docs.
afq_dataset.drop_target_na()

In [4]:
# Sanity check: subject count, feature-matrix shape and target-matrix shape.
for summary in (len(afq_dataset.subjects), afq_dataset.X.shape, afq_dataset.y.shape):
    print(summary)

1865
(1865, 7200)
(1865, 3)


In [5]:
# Materialise the TensorFlow dataset as a list of NumPy items; the following
# cell reads item[0] as the features and item[1] as the targets.
full_dataset = list(afq_dataset.as_tensorflow_dataset().as_numpy_iterator())

In [6]:
# Stack the per-subject feature arrays along a new leading axis, and split the
# three target entries (indices 0, 1, 2) into separate arrays: y, qc and site.
X = np.stack([sample[0] for sample in full_dataset], axis=0)
y, qc, site = (
    np.array([sample[1][col] for sample in full_dataset]) for col in range(3)
)

In [7]:
# Keep only the scans with a positive QC score. The mask is computed once and
# is also applied to `qc` itself, so the four arrays stay aligned and re-running
# this cell is a no-op instead of failing on a stale, longer mask.
passed_qc = qc > 0
X, y, site, qc = X[passed_qc], y[passed_qc], site[passed_qc], qc[passed_qc]

In [8]:
# Shape of the feature array after the QC filter; the last axis is the channel axis.
X.shape

(1817, 100, 72)

In [9]:
# Hard cap on the number of training epochs passed to `model.fit`.
n_epochs = 1000

# Stop training once the validation loss has not improved by at least 0.001
# for 100 consecutive epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", mode="min", min_delta=0.001, patience=100
)

# Halve the learning rate when the validation loss has plateaued for 20 epochs.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss", factor=0.5, patience=20, verbose=1
)

In [10]:
def augment_this(X, y, rounds=2):
    """Extend (X, y) with `rounds` augmented copies of X.

    In every round each channel (last axis of X) is perturbed on its own by
    jitter, then scaling, then time warping. Each step uses a sigma equal to
    the mean of its current input divided by 25. The targets are repeated
    unchanged for every augmented copy.

    Returns
    -------
    (new_X, new_y) : the original samples followed by the augmented ones.
    """
    out_X, out_y = X[:], y[:]
    for _ in range(rounds):
        augmented = np.zeros_like(X)
        # Channels are augmented independently of one another.
        for ch in range(augmented.shape[-1]):
            signal = X[..., ch][..., np.newaxis]
            for transform in (jitter, scaling, time_warp):
                signal = transform(signal, sigma=np.mean(signal) / 25)
            augmented[..., ch] = signal[..., 0]
        out_X = np.concatenate([out_X, augmented])
        out_y = np.concatenate([out_y, y])
    return out_X, out_y

In [11]:
# Per-site subsets of features and targets for site codes 0, 3 and 4.
X0, y0 = X[site == 0], y[site == 0]
X3, y3 = X[site == 3], y[site == 3]
X4, y4 = X[site == 4], y[site == 4]


In [12]:
# Compare how many samples each of the three sites contributes.
X0.shape, X3.shape, X4.shape

((755, 100, 72), (743, 100, 72), (253, 100, 72))

In [13]:
def model_fit(model_func, X_train, y_train, lr=None):
    """Build, compile and train a regression model, then restore its best weights.

    Parameters
    ----------
    model_func : callable
        Model factory. It is called with ``input_shape=(100, n_channels)``,
        ``n_classes=1``, ``output_activation=None`` and ``verbose=True``.
    X_train, y_train : array-like
        Training data; 20% of it is held out by Keras for validation
        (``validation_split=0.2``).
    lr : float, optional
        Learning rate for the Adam optimizer. When omitted, the notebook-level
        global ``lr`` is used, which is what existing callers rely on.

    Returns
    -------
    The trained model, loaded with the weights of the epoch that had the best
    monitored ``val_loss``.
    """
    if lr is None:
        # Backward-compatible fallback for callers that set `lr` globally.
        lr = globals()["lr"]

    model = model_func(input_shape=(100, X_train.shape[-1]), n_classes=1,
                       output_activation=None, verbose=True)
    model.compile(loss='mean_squared_error',
                  optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
                  metrics=['mean_squared_error',
                           tf.keras.metrics.RootMeanSquaredError(name='rmse'),
                           'mean_absolute_error'])

    # Keep the checkpoint in a temporary directory that is removed as soon as
    # the best weights are back in memory, so repeated fits do not leave
    # orphaned .h5 files in the system temp folder.
    with tempfile.TemporaryDirectory() as ckpt_dir:
        ckpt_filepath = os.path.join(ckpt_dir, 'best_weights.h5')
        ckpt = tf.keras.callbacks.ModelCheckpoint(
            filepath=ckpt_filepath,
            monitor="val_loss",
            verbose=0,
            save_best_only=True,
            save_weights_only=True,
            mode="auto",
        )
        callbacks = [early_stopping, ckpt, reduce_lr]
        model.fit(X_train, y_train, epochs=n_epochs, batch_size=128, validation_split=0.2,
                  callbacks=callbacks, verbose=0, use_multiprocessing=True)
        # Must happen before the temporary directory is cleaned up.
        model.load_weights(ckpt_filepath)
    return model


In [14]:
def _impute_channels(X_site):
    """Median-impute a 3-D array one channel (last axis) at a time."""
    imputer = SimpleImputer(strategy="median")
    return np.concatenate(
        [imputer.fit_transform(X_site[..., ch])[:, :, None]
         for ch in range(X_site.shape[-1])],
        -1,
    )


def multi_site(model_func, name_str, lr, X, y, random_states, augment=True):
    """Train on pairs of sites and evaluate on every site's held-out test set.

    Sites 0, 3 and 4 are selected with the notebook-level ``site`` array, which
    must be aligned with ``X`` and ``y``. Each site is downsampled to the size
    of the smallest site and split 80/20 into train and test. Train and test
    are median-imputed separately. Each site's training set is then downsampled
    to half the per-site training size, so a pooled two-site training set has
    the size of one single-site training set. Models are trained on the site
    pairs (0, 3), (0, 4) and (4, 3) and scored on the test set of every site.

    Parameters
    ----------
    model_func : callable
        Model factory forwarded to ``model_fit``.
    name_str : str
        Label written to the ``Model`` column of the result.
    lr : float
        Accepted for interface compatibility; it is not forwarded here, because
        ``model_fit`` as called below takes its learning rate from the
        notebook-level ``lr``.
    X, y : array-like
        Features and regression targets.
    random_states : sequence of three ints
        Seeds used for resampling and splitting sites 0, 3 and 4, in that order.
    augment : bool, default True
        When true, every pooled training set is extended with six augmentation
        rounds and then shuffled.

    Returns
    -------
    pandas.DataFrame
        One row per (training pair, test site, metric) with the columns
        ``Model``, ``Train_site``, ``Test_site``, ``Metric`` and ``Value``.
    """
    site_ids = (0, 3, 4)
    site_pairs = ((0, 3), (0, 4), (4, 3))
    metrics = (("mae", mean_absolute_error),
               ("mad", median_absolute_error),
               ("r2", r2_score))
    seeds = dict(zip(site_ids, random_states))

    # Split the data by sites and downsample each one to the smallest site.
    subsets = {s: (X[site == s], y[site == s]) for s in site_ids}
    n_per_site = min(len(X_s) for X_s, _ in subsets.values())

    train, test = {}, {}
    for s in site_ids:
        X_s, y_s = resample(*subsets[s], n_samples=n_per_site, replace=False,
                            random_state=seeds[s])
        X_tr, X_te, y_tr, y_te = train_test_split(X_s, y_s, test_size=0.2,
                                                  random_state=seeds[s])
        # Impute train and test separately.
        train[s] = (_impute_channels(X_tr), y_tr)
        test[s] = (_impute_channels(X_te), y_te)

    # Downsample again so that two pooled sites match one single-site training set.
    n_half = min(len(X_tr) for X_tr, _ in train.values()) // 2
    for s in site_ids:
        train[s] = tuple(resample(*train[s], n_samples=n_half, replace=False,
                                  random_state=seeds[s]))

    # NOTE(review): without augmentation the pooled arrays are not shuffled.
    # Per the Keras docs, validation_split takes the trailing samples, so the
    # validation data would come only from the second site of each pair --
    # confirm this is intended.
    train_data = {}
    for first, second in site_pairs:
        X_pair = np.concatenate([train[first][0], train[second][0]])
        y_pair = np.concatenate([train[first][1], train[second][1]])
        if augment:
            # Augment the pooled set that is actually used for training.
            X_pair, y_pair = augment_this(X_pair, y_pair, rounds=6)
            X_pair, y_pair = shuffle(X_pair, y_pair)
        train_data[(first, second)] = (X_pair, y_pair)

    # Train on each pair of sites and test on all of the sites.
    records = []
    for (first, second), (X_train, y_train) in train_data.items():
        trained = model_fit(model_func, X_train, y_train)
        for test_id, (X_test, y_test) in test.items():
            y_pred = trained.predict(X_test)
            for metric_name, metric_func in metrics:
                records.append({'Model': name_str,
                                'Train_site': f"{first}, {second}",
                                'Test_site': test_id,
                                'Metric': metric_name,
                                'Value': metric_func(y_test, y_pred)})

    return pd.DataFrame(
        records, columns=['Model', 'Train_site', 'Test_site', 'Metric', 'Value'])

In [15]:
# Architectures to benchmark, each with the learning rate used for its Adam
# optimizer. Every key maps to the model factory of the same name.
model_dict = {"cnn_lenet": {"model": cnn_lenet, "lr": 0.001},
              "mlp4": {"model": mlp4, "lr": 0.001},
              "cnn_vgg": {"model": cnn_vgg, "lr": 0.001},
              "lstm1v0": {"model": lstm1v0, "lr": 0.01},
              "lstm1": {"model": lstm1, "lr": 0.01},
              "lstm2": {"model": lstm2, "lr": 0.01},
              "blstm1": {"model": blstm1, "lr": 0.01},
              "blstm2": {"model": blstm2, "lr": 0.01},
              "lstm_fcn": {"model": lstm_fcn, "lr": 0.01},
              "cnn_resnet": {"model": cnn_resnet, "lr": 0.01}
             }

In [16]:
# Number of repeated train/evaluate runs per model; each run gets its own row
# of random states.
n_runs = 10

In [17]:
# One row of three integer seeds per run (one seed per site). They come from a
# fixed-seed generator so that a fresh "Restart & Run All" reproduces the same
# resampling and train/test splits.
random_states = np.random.default_rng(0).integers(0, 2**31 - 1, size=(n_runs, 3))

In [18]:
# Benchmark every model `n_runs` times without augmentation.
# `lr` is assigned at notebook level on purpose: `model_fit` reads it from there.
# The CSV is rewritten after every run so partial results survive an interruption.
dfs = []
for model_name, model_spec in model_dict.items():
    model_func, lr = model_spec["model"], model_spec["lr"]
    print("##################################################")
    print("model: ", model_name)
    for ii in range(n_runs):
        run_df = multi_site(model_func, model_name, lr, X, y, random_states[ii], augment=False)
        dfs.append(run_df)
        one_df = pd.concat(dfs)
        one_df.to_csv("multi_site_noaug.csv")

##################################################
model:  cnn_lenet
pooling layers: 4
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100, 72)]         0         
                                                                 
 conv1d (Conv1D)             (None, 100, 6)            1302      
                                                                 
 max_pooling1d (MaxPooling1D  (None, 50, 6)            0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 50, 16)            304       
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 25, 16)           0         
 1D)                                                             
                                        

In [19]:
# Leftover interactive `debug` invocation removed: on a clean run there is no
# traceback to inspect, so it only logged an error.


ERROR:root:No traceback has been produced, nothing to debug.
