# Conditional density estimation with noise regularization

In [1]:
import warnings
warnings.filterwarnings('ignore')
from utils import create_dataset_mri, cv_for_cde, create_dataset_eeg
from cde.density_estimator import MixtureDensityNetwork
import numpy as np
import tensorflow as tf
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from tensorflow.python.keras.activations import tanh
from sklearn.impute import SimpleImputer
import pandas as pd

Instructions for updating:
Use the retry module or similar alternatives.


In [2]:
import tensorflow as tf
# torch version
print(tf.__version__)

1.7.0


## Structural MRI

In [3]:
# Create dataset
target = 'Age'
# use all MRI high-level features, no DTI
data = create_dataset_mri(SCORE = target)
# for the moment, remove the diagnosis colums
data.drop(columns=['DX_01_Cat', 'DX_01_Sub', 'DX_01'], inplace=True)

In [4]:
test_indices = pd.read_csv('data/test_IDS.csv')
# Separate test and train set
data_test = pd.merge(data, test_indices, on='ID', how='inner')
data_train = data.loc[~data['ID'].isin(list((set(test_indices['ID']))))]
# labels and features
y_train = data_train[target]
X_train = data_train.drop([target, 'ID'], axis=1)
y_test = data_test[target]
X_test = data_test.drop([target, 'ID'], axis=1)


X_test = np.array(X_test)
X_train = np.array(X_train)
y_test = np.array(y_test)
y_train = np.array(y_train)
y_train = y_train.reshape((-1,1))
y_test = y_test.reshape((-1,1))

In [5]:
# Set model parameters
ndim_x=X_train.shape[1]
ndim_y=y_train.shape[1]
# We try the "faster decay rate for non-gaussian data" proposed in the paper: h = n^(-1/(d+1))
n = X_train.shape[0]
d = X_train.shape[1]+y_train.shape[1]
h = n**(-1/(d+1))

print('h = {}'.format(h))

# Define the model
model = MixtureDensityNetwork('MDN_mri', ndim_x, ndim_y, n_centers=10, hidden_sizes=(16, 16), hidden_nonlinearity=tf.nn.tanh,
               n_training_epochs=1000, x_noise_std=h, y_noise_std=h, adaptive_noise_fn=None, entropy_reg_coef=0.0,
               weight_decay=0.0, weight_normalization=True, data_normalization=True, dropout=0.0, l2_reg=0.0, l1_reg=0.0,
               random_seed=42)
# Fit
model.fit(X_train, y_train)
# Predict
y_pred = model.mean_(X_test)
y_pred = y_pred.reshape((-1,1))
y_pred.shape
print('Test MSE: {}'.format(mean_squared_error(y_pred, y_test)))

h = 0.9814850749319165
1000/1000 [100%] ██████████████████████████████ Elapsed: 11s | loss: 1500.004
mean log-loss train: 1.4620
Test MSE: 5.612370445606043


In [6]:
# CV results
res = cv_for_cde(X_train, y_train.flatten(), 'mri', h, n_splits=5)
np.mean(res)

1000/1000 [100%] ██████████████████████████████ Elapsed: 11s | loss: 1190.518
mean log-loss train: 1.4519
MSE: 5.463922072129362
1000/1000 [100%] ██████████████████████████████ Elapsed: 11s | loss: 1210.741
mean log-loss train: 1.4747
MSE: 5.0702302959467325
1000/1000 [100%] ██████████████████████████████ Elapsed: 11s | loss: 1192.160
mean log-loss train: 1.4521
MSE: 4.651610983115361
1000/1000 [100%] ██████████████████████████████ Elapsed: 12s | loss: 1204.898
mean log-loss train: 1.4676
MSE: 5.1260233999718965
1000/1000 [100%] ██████████████████████████████ Elapsed: 12s | loss: 1217.456
mean log-loss train: 1.4829
MSE: 5.625973580891862


5.187552066411043

Do the same, without noise regularization

## Structural MRI + DTI

In [3]:
# Create dataset
target = 'Age'
# use all MRI high-level features, plus DTI
data = create_dataset_mri(SCORE = target, DTI = True)
# for the moment, remove the diagnosis colums
data.drop(columns=['DX_01_Cat', 'DX_01_Sub', 'DX_01'], inplace=True)

In [4]:
test_indices = pd.read_csv('data/test_IDS.csv')
# Separate test and train set
data_test = pd.merge(data, test_indices, on='ID', how='inner')
data_train = data.loc[~data['ID'].isin(list((set(test_indices['ID']))))]
# labels and features
y_train = data_train[target]
X_train = data_train.drop([target, 'ID'], axis=1)
y_test = data_test[target]
X_test = data_test.drop([target, 'ID'], axis=1)


X_test = np.array(X_test)
X_train = np.array(X_train)
y_test = np.array(y_test)
y_train = np.array(y_train)
y_train = y_train.reshape((-1,1))
y_test = y_test.reshape((-1,1))

In [10]:
# Impute missing DTI values
imp = SimpleImputer(strategy = 'median')
X_train = imp.fit_transform(X_train)
X_test = imp.fit_transform(X_test)

In [12]:
# Set model parameters
ndim_x=X_train.shape[1]
ndim_y=y_train.shape[1]
# We try the "faster decay rate for non-gaussian data" proposed in the paper: h = n^(-1/(d+1))
n = X_train.shape[0]
d = X_train.shape[1]+y_train.shape[1]
h = n**(-1/(d+1))
# Define the model
model = MixtureDensityNetwork('MDN_dti', ndim_x, ndim_y, n_centers=10, hidden_sizes=(16, 16), hidden_nonlinearity=tf.nn.tanh,
               n_training_epochs=1000, x_noise_std=h, y_noise_std=h, adaptive_noise_fn=None, entropy_reg_coef=0.0,
               weight_decay=0.0, weight_normalization=True, data_normalization=True, dropout=0.0, l2_reg=0.0, l1_reg=0.0,
               random_seed=42)
# Fit
model.fit(X_train, y_train)
# Predict
y_pred = model.mean_(X_test)
y_pred = y_pred.reshape((-1,1))
y_pred.shape
print('Test MSE: {}'.format(mean_squared_error(y_pred, y_test)))

1000/1000 [100%] ██████████████████████████████ Elapsed: 14s | loss: 1037.765
mean log-loss train: 1.4454
Test MSE: 5.858093329987868


In [13]:
# CV results
res = cv_for_cde(X_train, y_train.flatten(), 'dti', h, n_splits=5)
np.mean(res)

1000/1000 [100%] ██████████████████████████████ Elapsed: 13s | loss: 848.138
mean log-loss train: 1.4776
MSE: 5.405847079573783
1000/1000 [100%] ██████████████████████████████ Elapsed: 13s | loss: 838.366
mean log-loss train: 1.4606
MSE: 5.960919393053572
1000/1000 [100%] ██████████████████████████████ Elapsed: 13s | loss: 868.571
mean log-loss train: 1.5132
MSE: 4.763439354759247
1000/1000 [100%] ██████████████████████████████ Elapsed: 15s | loss: 840.152
mean log-loss train: 1.4611
MSE: 4.8010148320364685
1000/1000 [100%] ██████████████████████████████ Elapsed: 15s | loss: 856.639
mean log-loss train: 1.4898
MSE: 4.645745340876976


5.1153932000600095

## EEG

In [28]:
# Create dataset
target = 'Age'
# eeg cluster features
data = create_dataset_eeg(SCORE = target, clusters = True)
# for the moment, remove the diagnosis colums
data.drop(columns=['DX_01_Cat', 'DX_01_Sub', 'DX_01'], inplace=True)
data.rename(columns={"id": "ID"}, inplace=True)

In [19]:
test_indices = pd.read_csv('data/test_IDS.csv')
# Separate test and train set
data_test = pd.merge(data, test_indices, on='ID', how='inner')
data_train = data.loc[~data['ID'].isin(list((set(test_indices['ID']))))]
# labels and features
y_train = data_train[target]
X_train = data_train.drop([target, 'ID'], axis=1)
y_test = data_test[target]
X_test = data_test.drop([target, 'ID'], axis=1)


X_test = np.array(X_test)
X_train = np.array(X_train)
y_test = np.array(y_test)
y_train = np.array(y_train)
y_train = y_train.reshape((-1,1))
y_test = y_test.reshape((-1,1))

# Impute missing values
imp = SimpleImputer(strategy = 'median')
X_train = imp.fit_transform(X_train)
X_test = imp.fit_transform(X_test)

In [21]:
# Set model parameters
ndim_x=X_train.shape[1]
ndim_y=y_train.shape[1]
# We try the "faster decay rate for non-gaussian data" proposed in the paper: h = n^(-1/(d+1))
n = X_train.shape[0]
d = X_train.shape[1]+y_train.shape[1]
h = n**(-1/(d+1))
print('h = {}'.format(h))
# Define the model
model = MixtureDensityNetwork('MDN_eeg', ndim_x, ndim_y, n_centers=10, hidden_sizes=(16, 16), hidden_nonlinearity=tf.nn.tanh,
               n_training_epochs=1000, x_noise_std=h, y_noise_std=h, adaptive_noise_fn=None, entropy_reg_coef=0.0,
               weight_decay=0.0, weight_normalization=True, data_normalization=True, dropout=0.0, l2_reg=0.0, l1_reg=0.0,
               random_seed=42)
# Fit
model.fit(X_train, y_train)
# Predict
y_pred = model.mean_(X_test)
y_pred = y_pred.reshape((-1,1))
y_pred.shape
print('Test MSE: {}'.format(mean_squared_error(y_pred, y_test)))

h = 0.9768343244122416
1000/1000 [100%] ██████████████████████████████ Elapsed: 18s | loss: 1829.123
mean log-loss train: 1.5423
Test MSE: 6.244518418716428


In [22]:
# CV results
res = cv_for_cde(X_train, y_train.flatten(), 'eeg', h, n_splits=5)
np.mean(res)

1000/1000 [100%] ██████████████████████████████ Elapsed: 17s | loss: 1423.387
mean log-loss train: 1.5015
MSE: 6.639125967481024
1000/1000 [100%] ██████████████████████████████ Elapsed: 18s | loss: 1407.995
mean log-loss train: 1.4837
MSE: 5.6885191118183736
1000/1000 [100%] ██████████████████████████████ Elapsed: 19s | loss: 1472.065
mean log-loss train: 1.5512
MSE: 5.583397366047567
1000/1000 [100%] ██████████████████████████████ Elapsed: 21s | loss: 1492.313
mean log-loss train: 1.5725
MSE: 5.173750931694996
1000/1000 [100%] ██████████████████████████████ Elapsed: 21s | loss: 1450.329
mean log-loss train: 1.5283
MSE: 6.13996843113422


5.844952361635237

Let us see EEG with na removed rather than imputed

In [30]:
# Create dataset
target = 'Age'
# eeg cluster features
data = create_dataset_eeg(SCORE = target, clusters = True)
# for the moment, remove the diagnosis colums
data.drop(columns=['DX_01_Cat', 'DX_01_Sub', 'DX_01'], inplace=True)
data.rename(columns={"id": "ID"}, inplace=True)
data.dropna(axis  =0, inplace=True)
test_indices = pd.read_csv('data/test_IDS.csv')
# Separate test and train set
data_test = pd.merge(data, test_indices, on='ID', how='inner')
data_train = data.loc[~data['ID'].isin(list((set(test_indices['ID']))))]
# labels and features
y_train = data_train[target]
X_train = data_train.drop([target, 'ID'], axis=1)
y_test = data_test[target]
X_test = data_test.drop([target, 'ID'], axis=1)


X_test = np.array(X_test)
X_train = np.array(X_train)
y_test = np.array(y_test)
y_train = np.array(y_train)
y_train = y_train.reshape((-1,1))
y_test = y_test.reshape((-1,1))


In [31]:
# Set model parameters
ndim_x=X_train.shape[1]
ndim_y=y_train.shape[1]
# We try the "faster decay rate for non-gaussian data" proposed in the paper: h = n^(-1/(d+1))
n = X_train.shape[0]
d = X_train.shape[1]+y_train.shape[1]
h = n**(-1/(d+1))
print('h = {}'.format(h))
# Define the model
model = MixtureDensityNetwork('MDN_eeg_2', ndim_x, ndim_y, n_centers=10, hidden_sizes=(16, 16), hidden_nonlinearity=tf.nn.tanh,
               n_training_epochs=1000, x_noise_std=h, y_noise_std=h, adaptive_noise_fn=None, entropy_reg_coef=0.0,
               weight_decay=0.0, weight_normalization=True, data_normalization=True, dropout=0.0, l2_reg=0.0, l1_reg=0.0,
               random_seed=42)
# Fit
model.fit(X_train, y_train)
# Predict
y_pred = model.mean_(X_test)
y_pred = y_pred.reshape((-1,1))
y_pred.shape
print('Test MSE: {}'.format(mean_squared_error(y_pred, y_test)))

h = 0.9787417767039791
1000/1000 [100%] ██████████████████████████████ Elapsed: 22s | loss: 981.143
mean log-loss train: 1.4911
Test MSE: 7.931212357043834


In [33]:
# CV results
res = cv_for_cde(X_train, y_train.flatten(), 'eeg_2', h, n_splits=5)
np.mean(res)

1000/1000 [100%] ██████████████████████████████ Elapsed: 24s | loss: 804.114
mean log-loss train: 1.5287
MSE: 4.805054557384899
1000/1000 [100%] ██████████████████████████████ Elapsed: 24s | loss: 774.457
mean log-loss train: 1.4724
MSE: 6.072229746826974
1000/1000 [100%] ██████████████████████████████ Elapsed: 26s | loss: 754.245
mean log-loss train: 1.4339
MSE: 6.048366685498241
1000/1000 [100%] ██████████████████████████████ Elapsed: 27s | loss: 790.847
mean log-loss train: 1.5007
MSE: 6.1304244009061994
1000/1000 [100%] ██████████████████████████████ Elapsed: 30s | loss: 791.639
mean log-loss train: 1.5022
MSE: 5.654705608081803


5.7421561997396235

# Comments

Not necessary to scale the input data (the model already does normalization).
Note that with DTI we obtain results similar to pure MRI. EEG data gives the worst results.