In [1]:
# Import libraries

import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn.svm as svm
import sklearn.pipeline as spl
import sklearn.kernel_ridge as skr
import sklearn.model_selection as sms
import sklearn.linear_model as slm
import sklearn.preprocessing as skp
import sklearn.neural_network as snn
import sklearn.metrics as sme
import sklearn.decomposition as sdc
import sklearn.cross_decomposition as skd
import sklearn.feature_selection as skf
import sklearn.ensemble as ske
import sklearn.utils as sku
from sklearnex import patch_sklearn, config_context
from sklearn.cluster import DBSCAN
import numpy as np
import scipy.stats as stats
from IPython.display import HTML
import util

patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
HTML('''
<style>
.jupyter-matplotlib {
    background-color: #000;
}

.widget-label, .jupyter-matplotlib-header{
    color: #fff;
}

.jupyter-button {
    background-color: #333;
    color: #fff;
}
</style>
''')

In [3]:
# Maybe y should also be scaled? 
#     Performed poorly using Standard and MinMax scalers. Trying with LOOCV to see if predictions stabilize.
#     Does not appear to stabilize predictions with LOOCV (using StandardScaler())
# Perhaps a transform would be more effective, or scaling implemented with consistent cross-validation
# Different scaling methods? 
#     This seems most important for noise-sensitive models like LARS. All other use StandardScaler()
# Transformers?
# Model-specific scaling methods?
#     Yes, see above
# Common cross-validation function?
#     Use built-in functions wherever possible and `utils.gridsearch_pickparams()` elsewhere
# Quantile loss
# RANSAC
# Data augmentation? (Mixup)
# Data generation? (SMOGN)

In [4]:
# Get case IDs
case_list = open('/home/ali/RadDBS-QSM/data/docs/cases_90','r')
lines = case_list.read()
lists = np.loadtxt(case_list.name,comments="#", delimiter=",",unpack=False,dtype=str)
case_id = []
for lines in lists:     
    case_id.append(lines[-9:-7])

# Load scores
file_dir = '/home/ali/RadDBS-QSM/data/docs/QSM anonymus- 6.22.2023-1528.csv'
motor_df = util.filter_scores(file_dir,'pre-dbs updrs','stim','CORNELL ID')
# Find cases with all required scores
subs_init,pre_imp_init,post_imp_init,pre_updrs_off_init = util.get_full_cases(motor_df,
                                                          'CORNELL ID',
                                                          'OFF (pre-dbs updrs)',
                                                          'ON (pre-dbs updrs)',
                                                          'OFF meds ON stim 6mo')


In [5]:
# Load extracted features
npy_dir = '/home/ali/RadDBS-QSM/data/npy/slices/'
phi_dir = '/home/ali/RadDBS-QSM/data/phi/slices/'
roi_path = '/data/Ali/atlas/mcgill_pd_atlas/PD25-subcortical-labels.csv'
n_rois = 6
Phi_all, X_all, R_all, K_all, ID_all = util.load_featstruct(phi_dir,npy_dir+'X/',npy_dir+'R/',npy_dir+'K/',n_rois,939,True)
ids = np.asarray(ID_all).astype(int)

Appended 1000 slices
Appended 2000 slices
Appended 3000 slices
Appended 4000 slices
Appended 5000 slices
Appended 6000 slices
Allocated arrays


In [6]:
# Find overlap between scored subjects and feature extraction cases
c_cases = np.intersect1d(np.asarray(case_id).astype(int),np.asarray(subs_init).astype(int))
# Complete case indices with respect to feature matrix
c_cases_idx = np.in1d(ids,c_cases)
X_all_c = X_all[c_cases_idx,:,:]
# K_all_c = K_all[c_cases_idx,:,:]
# R_all_c = R_all[c_cases_idx,:,:]
# Re-index the scored subjects with respect to complete cases
s_cases_idx = np.in1d(subs_init,ids[c_cases_idx])
subs_init = subs_init[s_cases_idx]
pre_imp_init = pre_imp_init[s_cases_idx]
post_imp_init = post_imp_init[s_cases_idx]
pre_updrs_off_init = pre_updrs_off_init[s_cases_idx]
per_change_init = post_imp_init
subs = np.asarray(ID_all,dtype=float)[np.in1d(np.asarray(ID_all,dtype=float),subs_init)]

In [7]:
pre_imp = np.zeros((1,len(subs))).T
post_imp = np.zeros((1,len(subs))).T
pre_updrs_off = np.zeros((1,len(subs))).T
per_change = np.zeros((1,len(subs))).T
for j in np.arange(len(subs)):
    pre_imp[j] = pre_imp_init[subs_init == subs[j]]
    post_imp[j] = post_imp_init[subs_init == subs[j]]
    pre_updrs_off[j] = pre_updrs_off_init[subs_init == subs[j]]
    per_change[j] = per_change_init[subs_init == subs[j]]

In [8]:
test_id = np.random.choice(np.unique(subs))
test_index = subs == test_id
train_index = subs != test_id
X_train = X_all_c[train_index,:,:]
X_test = X_all_c[test_index,:,:]
y_train = per_change[train_index]
y_test = per_change[test_index]


In [9]:
X_train, y_train = sku.shuffle(X_train, y_train, random_state=1)

In [10]:
#X_train,X_test,y_train,y_test,train_index,test_index = util.set_split(X_all_c,per_change,1,10/len(X_all_c))

# Cross validation
cvn = 40
# Choose scaling
X0_ss0,scaler_ss,X_test_ss0 = util.model_scale(skp.StandardScaler(),
                                             X_train,train_index,X_test,test_index,pre_updrs_off.ravel())
# Feature selection
# sel = skf.SelectKBest(skf.f_regression,k=X0_ss0.shape[1])
X0_ss = X0_ss0# sel.fit_transform(X0_ss0,y_train)
X_test_ss = X_test_ss0 # (sel.transform(X_test_ss0.reshape(X_test_ss0.shape[0],X_test_ss0.shape[1]*X_test_ss0.shape[2]))).reshape((X_test_ss0.shape[0],1,-1))

In [11]:
scoring = 'r2'
print(y_test.mean())
print(y_train.mean())

0.7435897435897436
0.6484936161919129


In [12]:
alphas = np.logspace(-9,-2,10)

In [13]:
lr = slm.LinearRegression()
est_lr = lr.fit(X0_ss,y_train)
results_lr = est_lr.predict(X_test_ss.reshape([X_test_ss.shape[0],
                                           X_test_ss.shape[1]*X_test_ss.shape[2]]))
print(results_lr)

[[0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64688286]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64688286]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]
 [0.64690861]]


In [14]:
br_grid = {'alpha_1': alphas[-5:-4], 'alpha_2': alphas[-5:-4]}

best_params = util.gridsearch_pickparams(slm.BayesianRidge(),cvn,
                                         br_grid,X0_ss0,
                                         y_train.ravel(),scoring,8)
br = slm.BayesianRidge(alpha_1=best_params['alpha_1'],alpha_2=best_params['alpha_2'])
br.fit(X0_ss, y_train.ravel())
results_br = np.asarray(br.predict(X_test_ss.reshape([X_test_ss.shape[0],
                                           X_test_ss.shape[1]*X_test_ss.shape[2]]))).ravel()
print(results_br)

Fitting 40 folds for each of 1 candidates, totalling 40 fits
[0.64838181 0.64838181 0.64838181 0.64838181 0.64838181 0.64838181
 0.64838181 0.64838181 0.64838181 0.64838181 0.64838181 0.64838181
 0.64838181 0.64838181 0.64838181 0.64838181 0.64838181 0.64838181
 0.64838181 0.64838181 0.64838181 0.64838181 0.64838181 0.64838181
 0.64838181 0.64838181 0.64838181 0.64838181 0.64838181 0.64838181
 0.64838181 0.64838181 0.64838181 0.64838181 0.64838181 0.64838181
 0.64838181 0.64838181 0.64838181 0.64838181 0.64838181 0.64838181
 0.64838181 0.64838181 0.64838181 0.64838181 0.64838181 0.64838181
 0.64838181 0.64838181 0.64838181 0.64838181 0.64838181 0.64838181
 0.64838181 0.64838181 0.64838181 0.64838181 0.64838181 0.64838181
 0.64838181 0.64838181 0.64838181 0.64838181 0.64838181 0.64838181]


In [15]:
mlp_grid = {'hidden_layer_sizes': [(X_train.shape[1],X_train.shape[2])],
          'activation': ['relu'],
          'alpha': alphas,
          'epsilon': [1e0],
          'solver': ['adam'],
          'max_iter':[5000]}

best_params = util.gridsearch_pickparams(snn.MLPRegressor(),
                                         cvn,
                                         mlp_grid,X0_ss0,
                                         y_train.ravel(),scoring,8)

mlp = snn.MLPRegressor(hidden_layer_sizes=best_params["hidden_layer_sizes"], 
                        activation=best_params["activation"],
                        solver=best_params["solver"],
                        alpha=best_params['alpha'],
                        epsilon=best_params["epsilon"],
                        max_iter=5000, 
                        n_iter_no_change=500, 
                        verbose=True,
                        early_stopping=True,
                        random_state=1,
                        batch_size=len(X0_ss)//cvn)

mlp.fit(X0_ss,y_train)
results_mlp = mlp.predict(X_test_ss.reshape([X_test_ss.shape[0],
                                           X_test_ss.shape[1]*X_test_ss.shape[2]]))

print(results_mlp)

Fitting 40 folds for each of 10 candidates, totalling 400 fits


  y = column_or_1d(y, warn=True)


Iteration 1, loss = 0.30046592
Validation score: -9.743576
Iteration 2, loss = 0.28633622
Validation score: -9.201631
Iteration 3, loss = 0.26805752
Validation score: -8.567452
Iteration 4, loss = 0.24941071
Validation score: -7.886818
Iteration 5, loss = 0.23041189
Validation score: -7.188875
Iteration 6, loss = 0.21195614
Validation score: -6.501813
Iteration 7, loss = 0.19412675
Validation score: -5.844865
Iteration 8, loss = 0.17703685
Validation score: -5.213708
Iteration 9, loss = 0.16093987
Validation score: -4.626910
Iteration 10, loss = 0.14599240
Validation score: -4.087209
Iteration 11, loss = 0.13221076
Validation score: -3.587271
Iteration 12, loss = 0.11947398
Validation score: -3.131477
Iteration 13, loss = 0.10788476
Validation score: -2.716740
Iteration 14, loss = 0.09740695
Validation score: -2.349338
Iteration 15, loss = 0.08813822
Validation score: -2.024573
Iteration 16, loss = 0.07985875
Validation score: -1.734244
Iteration 17, loss = 0.07241299
Validation score:

In [16]:
lasso = slm.LassoCV(
    alphas=alphas,
    cv=cvn, 
    verbose=True,
    random_state=1,
    max_iter=100000,
    tol=1e-3,
    n_jobs=-1)

est_ls = lasso.fit(X0_ss,y_train)
results_ls = est_ls.predict(X_test_ss.reshape([X_test_ss.shape[0],
                                           X_test_ss.shape[1]*X_test_ss.shape[2]]))
print(results_ls)

  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
.............................................................................................................................................................................................................................................[Parallel(n_jobs=-1)]: Done  18 out of  40 | elapsed:  2.5min remaining:  3.0min
...................................................................................................................................................................[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  3.0min finished


[0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362
 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362
 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362
 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362
 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362
 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362
 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362
 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362
 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362
 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362
 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362]


In [17]:
ridge = slm.RidgeCV(
    alphas=[1e-3,1e-2,1e-1,1e0,1e1,1e2,1e3,1e4],
    scoring=scoring,
    cv=cvn)

est_rr = ridge.fit(X0_ss,y_train)
results_rr = est_rr.predict(X_test_ss.reshape([X_test_ss.shape[0],
                                           X_test_ss.shape[1]*X_test_ss.shape[2]]))
print(results_rr)

[[0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]
 [0.64798973]]


In [18]:
regr = ske.BaggingRegressor(base_estimator=svm.SVR(),n_estimators=10,random_state=1,max_samples=40)
regr.fit(X0_ss,y_train)
results_lars = regr.predict(X_test_ss.reshape([X_test_ss.shape[0],
                                           X_test_ss.shape[1]*X_test_ss.shape[2]]))

  return column_or_1d(y, warn=True)


In [19]:
lars = slm.LarsCV(
    cv=cvn, 
    max_iter=1000,
    max_n_alphas=10000,
    verbose=True,
    normalize=False,
    eps=np.finfo(float).eps,
    n_jobs=-1)

est_lars = lars.fit(X0_ss,y_train)
results_lars = est_lars.predict(X_test_ss.reshape([X_test_ss.shape[0],
                                           X_test_ss.shape[1]*X_test_ss.shape[2]]))
print(results_lars)

  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  40 | elapsed: 46.6min remaining: 56.9min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed: 48.8min finished


[0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362
 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362
 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362
 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362
 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362
 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362
 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362
 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362
 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362
 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362
 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362]


In [20]:
krr_grid = {'kernel': ['linear','rbf'],
          'alpha': [alphas]}

best_params = util.gridsearch_pickparams(skr.KernelRidge(),
                                         cvn,
                                         krr_grid,X0_ss0,
                                         y_train.ravel(),scoring,8)
krr = skr.KernelRidge(kernel=best_params['kernel'],alpha=best_params['alpha'])
krr.fit(X0_ss, y_train)
results_krr = krr.predict(X_test_ss.reshape([X_test_ss.shape[0],
                                           X_test_ss.shape[1]*X_test_ss.shape[2]]))
print(results_krr)

Fitting 40 folds for each of 2 candidates, totalling 80 fits
[[0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64722443]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64722443]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]
 [0.64723206]]


In [21]:
gsc = slm.ElasticNetCV(
    alphas=alphas,
    cv=cvn, 
    max_iter=10000,
    verbose=True,
    n_jobs=-1)

est_en = gsc.fit(X0_ss,y_train)
results_en = est_en.predict(X_test_ss.reshape([X_test_ss.shape[0],
                                           X_test_ss.shape[1]*X_test_ss.shape[2]]))
print(results_en)

  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
.........................................................................................................................................................................................................................[Parallel(n_jobs=-1)]: Done  18 out of  40 | elapsed: 44.4min remaining: 54.2min
.......................................................................................................................................................................................[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed: 56.3min finished


[0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362
 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362
 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362
 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362
 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362
 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362
 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362
 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362
 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362
 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362
 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362 0.64849362]


In [22]:
# pls_grid = {'n_components': np.flip(np.arange(5,int(len(X_train)))),
#             'scale': [True,False]}

# best_params = util.gridsearch_pickparams(skd.PLSRegression(),cvn,
#                                          pls_grid,X0_ss0,
#                                          y_train.ravel(),scoring,8)
# pls = skd.PLSRegression(n_components=best_params['n_components'],scale=best_params['scale'],max_iter=10000)
# pls.fit(X0_ss, y_train)
# results_pls = (pls.predict(X_test_ss.reshape([X_test_ss.shape[0],
#                                            X_test_ss.shape[1]*X_test_ss.shape[2]]))).ravel()

# print(results_pls)

In [23]:
pcr = spl.make_pipeline(sdc.PCA(),slm.LinearRegression())
pcr.fit(X0_ss, y_train)
results_pcr = np.asarray(pcr.predict(X_test_ss.reshape([X_test_ss.shape[0],
                                           X_test_ss.shape[1]*X_test_ss.shape[2]]))).ravel()
print(results_pcr)

[0.64748719 0.64748719 0.64748719 0.64748719 0.64748719 0.64748719
 0.64748719 0.64748719 0.64748719 0.64748719 0.64748719 0.64748719
 0.64748719 0.64748719 0.64748719 0.64748719 0.64748719 0.64748719
 0.64748719 0.64748719 0.64748719 0.64748719 0.64748719 0.64748719
 0.64748719 0.64748719 0.64748719 0.64748719 0.64748719 0.64748719
 0.64748719 0.64748719 0.64748719 0.64748719 0.64748719 0.64748719
 0.64748719 0.64748719 0.64748719 0.64748719 0.64748719 0.64748719
 0.64748719 0.64748719 0.64748719 0.64748719 0.64748719 0.64748719
 0.64748719 0.64748719 0.64748719 0.64748719 0.64748719 0.64748719
 0.64748719 0.64748719 0.64748719 0.64748719 0.64748719 0.64748719
 0.64748719 0.64748719 0.64748719 0.64748719 0.64748719 0.64748719]


In [24]:
omp = slm.OrthogonalMatchingPursuitCV(normalize=True,cv=cvn,max_iter=len(X_train)//2,verbose=True)
omp.fit(X0_ss, y_train)
results_omp = np.asarray(omp.predict(X_test_ss.reshape([X_test_ss.shape[0],
                                           X_test_ss.shape[1]*X_test_ss.shape[2]]))).ravel()
print(results_omp)
results_omp = results_pcr

  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  X, y[:, k], n_nonzero_coefs, tol, copy_X=copy_X, return_path=return_path
  X, y[:, k], n_nonzero_coefs, tol, copy_X=copy_X, return_path=return_path
  X, y[:, k], n_nonzero_coefs, tol, copy_X=copy_X, return_path=return_path
  X, y[:, k], n_nonzero_coefs, tol, copy_X=copy_X, return_path=return_path
  X, y[:, k], n_nonzero_coefs, tol, copy_X=copy_X, return_path=return_path
  X, y[:, k], n_nonzero_coefs, tol, copy_X=copy_X, return_path=return_path
  X, y[:, k], n_nonzero_coefs, tol, copy_X=copy_X, return_path=return_path
  X, y[:, k], n_nonzero_coefs, tol, copy_X=copy_X, return_path=return_path
  X, y[:, k], n_nonzero_coefs, tol, copy_X=copy_X, return_path=return_path
  X, y[:, k], n_nonzero_coefs, tol, copy_X=copy_X, return_path=return_path
  X, y[:, k], n_nonzero_coefs, tol, copy_X=copy_X, return_path=return_path
  X, y[:, k], n_nonzero_coefs, tol, copy_X=copy_X, return_pat

[0.64856934 0.64856934 0.64856934 0.64856934 0.64856934 0.64856934
 0.64856934 0.64856934 0.64856934 0.64856934 0.64856934 0.64856934
 0.64856934 0.64856934 0.64856934 0.64856934 0.64856934 0.64856934
 0.64856934 0.64856934 0.64856934 0.64856934 0.64856934 0.64856934
 0.64856934 0.64856934 0.64856934 0.64856934 0.64856934 0.64856934
 0.64856934 0.64856934 0.64856934 0.64856934 0.64856934 0.64856934
 0.64856934 0.64856934 0.64856934 0.64856934 0.64856934 0.64856934
 0.64856934 0.64856934 0.64856934 0.64856934 0.64856934 0.64856934
 0.64856934 0.64856934 0.64856934 0.64856934 0.64856934 0.64856934
 0.64856934 0.64856934 0.64856934 0.64856934 0.64856934 0.64856934
 0.64856934 0.64856934 0.64856934 0.64856934 0.64856934 0.64856934]


In [25]:
rsr = slm.RANSACRegressor(random_state=1,min_samples=len(X0_ss)).fit(X0_ss, y_train)
results_rsr = rsr.predict(X_test_ss.reshape([X_test_ss.shape[0],
                                           X_test_ss.shape[1]*X_test_ss.shape[2]])).ravel()
print(results_rsr)

[0.64939958 0.64939958 0.64939958 0.64939958 0.64940482 0.64939958
 0.64939958 0.64939958 0.64939958 0.64940482 0.64939958 0.64939958
 0.64939958 0.64939958 0.64939958 0.64939958 0.64939958 0.64939958
 0.64939958 0.64939958 0.64939958 0.64939958 0.64939958 0.64939958
 0.64939958 0.64939958 0.64939958 0.64939958 0.64939958 0.64939958
 0.64939958 0.64939958 0.64939958 0.64939958 0.64939958 0.64939958
 0.64939958 0.64939958 0.64939958 0.64939958 0.64939958 0.64939958
 0.64939958 0.64939958 0.64939958 0.64939958 0.64939958 0.64939958
 0.64939958 0.64939958 0.64939958 0.64939958 0.64939958 0.64939958
 0.64939958 0.64939958 0.64939958 0.64939958 0.64939958 0.64939958
 0.64939958 0.64939958 0.64939958 0.64939958 0.64939958 0.64939958]


In [26]:
# Very slow on leave one out
# ard_grid = {'alpha_1': alphas[-5:-4], 'alpha_2': alphas[-5:-4], 'lambda_1': alphas[-5:-4], 'lambda_2': alphas[-5:-4]}
# best_params = util.gridsearch_pickparams(slm.ARDRegression(),cvn,
#                                          ard_grid,scaler_ss,X_train,
#                                          train_index,X_test,test_index,pre_updrs_off,y_train,scoring,8)
# ard = slm.ARDRegression(alpha_1=best_params['alpha_1'],alpha_2=best_params['alpha_2'],
#                        lambda_1=best_params['lambda_1'],lambda_2=best_params['lambda_2'])
# ard.fit(X0_ss,y_train)
# results_ard = np.asarray(ard.predict(X_test_ss.reshape([X_test_ss.shape[0],
#                                            X_test_ss.shape[1]*X_test_ss.shape[2]]))).ravel()

In [27]:
svr_grid = {'kernel': ['linear','rbf'],
          'epsilon': [1e-1,1.5e-1,2.5e-1],
          'C': [1e0,1e1,1e2]}
best_params = util.gridsearch_pickparams(svm.SVR(),
                                         cvn,
                                         svr_grid,X0_ss0,
                                         y_train.ravel(),scoring,8)
svr = svm.SVR(kernel=best_params['kernel'],epsilon=best_params['epsilon'])
svr.fit(X0_ss, y_train)
results_svr = np.asarray(svr.predict(X_test_ss.reshape([X_test_ss.shape[0],
                                           X_test_ss.shape[1]*X_test_ss.shape[2]]))).ravel()
print(results_svr)

Fitting 40 folds for each of 18 candidates, totalling 720 fits


ERROR:concurrent.futures:exception calling callback for <Future at 0x7f93c67b4dd0 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
  File "/home/ali/anaconda3/envs/pdradenv/lib/python3.7/site-packages/joblib/externals/loky/_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "/home/ali/anaconda3/envs/pdradenv/lib/python3.7/site-packages/joblib/parallel.py", line 360, in __call__
    self.parallel.dispatch_next()
  File "/home/ali/anaconda3/envs/pdradenv/lib/python3.7/site-packages/joblib/parallel.py", line 797, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "/home/ali/anaconda3/envs/pdradenv/lib/python3.7/site-packages/joblib/parallel.py", line 864, in dispatch_one_batch
    self._dispatch(tasks)
  File "/home/ali/anaconda3/envs/pdradenv/lib/python3.7/site-packages/joblib/parallel.py", line 782, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/home/ali/anaconda3/envs/pdraden

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}

In [None]:
gbr_grid = {'max_depth':[3,6,9,12,15,20,100]}
best_params = util.gridsearch_pickparams(ske.GradientBoostingRegressor(random_state=1),cvn,
                                         gbr_grid,X0_ss0,
                                         y_train.ravel(),scoring,8)
gbr = ske.GradientBoostingRegressor(random_state=1,learning_rate=0.001,max_depth=best_params['max_depth'],n_estimators=100)
gbr.fit(X0_ss, y_train)
results_gbr = np.asarray(gbr.predict(X_test_ss.reshape([X_test_ss.shape[0],
                                           X_test_ss.shape[1]*X_test_ss.shape[2]]))).ravel()

In [None]:
results_lr.shape

In [None]:
r_max =util.eval_prediction(np.vstack((pre_imp[test_index].T,
                               results_lr.T,
                               results_mlp,
                               results_ls,
                               results_lars,
                               results_en,
                               results_rr.ravel(),
                               results_krr.ravel(),
                               results_pcr,
                               #results_pls,
                               results_omp,
                               results_br,
                               results_rsr,
                               results_svr,
                               results_gbr)),
                               y_test.T,
                               ['LCT','Regression','MLP','Lasso','BaggingRegressor',
                                'ElasticNet','Ridge','KernelRidge','PCR',
                                #'PLS',
                                'OMP','Bayesian','RANSAC','SVR','GBR'],(70,5))
plt.ylim([0,2])
plt.xlim([0,2])

In [None]:
density_train = stats.gaussian_kde(y_train)
density_test = stats.gaussian_kde(y_test)
n, x, _ = plt.hist(y_train, bins=np.linspace(-1,2,50), 
                   histtype=u'step', density=True,color='tab:blue',linewidth=0)  
plt.plot(x, density_train(x),color='tab:blue',label=r'$y_{train}$')
n, y, _ = plt.hist(y_test, bins=np.linspace(-1,2,50), 
                   histtype=u'step', density=True,color='tab:blue',linewidth=0)  
plt.plot(y, density_test(y),color='tab:green',label=r'$y_{test}$')
plt.legend(fontsize=24)
plt.xlim([-1,2])
plt.ylim([0,5])
plt.xlabel('DBS improvement',fontsize=24)
plt.ylabel('Frequency',fontsize=24)
plt.title('Dataset distributions',fontsize=24)
plt.show()

In [None]:
p,q = util.make_pdfs(X0_ss.ravel(),X_test_ss0.ravel(),1e3)
util.kl_divergence(p[abs(p)>1e-16],q[abs(p)>1e-16])

In [None]:
r_max