In [1]:
# Import libraries

import matplotlib.pyplot as plt
import pandas as pd
import warnings
from econml.sklearn_extensions.linear_model import WeightedLassoCV
import sklearn.model_selection as sms
import sklearn.linear_model as slm
import sklearn.preprocessing as skp
import sklearn.metrics as sme
import sklearn.feature_selection as skf
import sklearn.ensemble as ske
import sklearn.utils as sku
import sklearn.decomposition as skd
import sklearn.neural_network as skn
from celer import GroupLassoCV
from sklearnex import patch_sklearn, config_context
from sklearn.cluster import DBSCAN
import numpy as np
import scipy.stats as stats
from IPython.display import HTML
import util
from scipy.spatial import cKDTree
import nibabel as nib
import os
import pickle
from torch import nn
from vae_feats import VAE
from vae_feats import train_model
from pyearth import Earth
patch_sklearn()


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
HTML('''
<style>
.jupyter-matplotlib {
    background-color: #000;
}

.widget-label, .jupyter-matplotlib-header{
    color: #fff;
}

.jupyter-button {
    background-color: #333;
    color: #fff;
}
</style>
''')

In [3]:
# Maybe y should also be scaled? 
# Perhaps a transform would be more effective, or scaling implemented with consistent cross-validation
# Different scaling methods? 
#     This seems most important for noise-sensitive models like LARS. All other use StandardScaler()
# Transformers?
# Model-specific scaling methods?
#     Yes, see above
# Common cross-validation function ✓
#     Use built-in functions wherever possible and `utils.gridsearch_pickparams()` elsewhere
# Quantile loss
# RANSAC
# Data augmentation? (Mixup)
# Data generation? (SMOGN)
# Combine CHH dataset ✓
# Implement CV and test ✓
# Print selected features ✓
# Make magnitude templates
# Sample weights ✓
# Look at segmentations by error ✓ (Appears to have most difference in red nucleus, which includes surrounding (white?) matter for underperforming cases)
# Extract features from current (1:6) eroded ROIs
# Extract features from all ROIs
# Plot segmentation variance against error for each case across all ROIs ✓
# Why does excluding the subthalamic nucleus increase the correlation (r=0.5 -> r=0.6)?
# Best performance with all ROIs: cvn=6, k=1800
# Best performance with ROIs 0:4, excluding STN: cvn=6, k=1800
# Should the pre-operative UPDRS be appended once or to each ROI? ✓
# Plot histogram of features for successful and unsuccessful predictions

In [4]:
# Get case IDs
case_list = open('/home/ali/RadDBS-QSM/data/docs/cases_90','r')
lines = case_list.read()
lists = np.loadtxt(case_list.name,comments="#", delimiter=",",unpack=False,dtype=str)
case_id = []
for lines in lists:     
    case_id.append(lines[-9:-7])

# Load scores
file_dir = '/home/ali/RadDBS-QSM/data/docs/QSM anonymus- 6.22.2023-1528.csv'
motor_df = util.filter_scores(file_dir,'pre-dbs updrs','stim','CORNELL ID')
# Find cases with all required scores
subs,pre_imp,post_imp,pre_updrs_off = util.get_full_cases(motor_df,
                                                          'CORNELL ID',
                                                          'OFF (pre-dbs updrs)',
                                                          'ON (pre-dbs updrs)',
                                                          'OFF meds ON stim 6mo')
# Load extracted features
npy_dir = '/home/ali/RadDBS-QSM/data/npy/'
phi_dir = '/home/ali/RadDBS-QSM/data/phi/phi/'
roi_path = '/data/Ali/atlas/mcgill_pd_atlas/PD25-subcortical-labels.csv'
n_rois = 6
all_rois = False
Phi_all, X_all, R_all, K_all, ID_all = util.load_featstruct(phi_dir,npy_dir+'X/',npy_dir+'R/',npy_dir+'K/',n_rois,1595,all_rois)

ids = np.asarray(ID_all).astype(int)
# Find overlap between scored subjects and feature extraction cases
c_cases = np.intersect1d(np.asarray(case_id).astype(int),np.asarray(subs).astype(int))
# Complete case indices with respect to feature matrix
c_cases_idx = np.in1d(ids,c_cases)
X_all_c = X_all[c_cases_idx,:,:]
K_all_c = K_all[c_cases_idx,:,:]
R_all_c = R_all[c_cases_idx,:,:]
print(R_all_c)
# Re-index the scored subjects with respect to complete cases
s_cases_idx = np.in1d(subs,ids[c_cases_idx])
subsc = subs[s_cases_idx]
pre_imp = pre_imp[s_cases_idx]
post_imp = post_imp[s_cases_idx]
pre_updrs_off = pre_updrs_off[s_cases_idx]
per_change = post_imp
# Reshape keys and ROIs
if all_rois == True:
    K_all_cu = np.empty((K_all_c.shape[0],K_all_c.shape[1],K_all_c.shape[2]+1),dtype=object)
    K_all_cu[:,:,:-1] = K_all_c
    K_all_cu[:,:,-1] = 'pre_updrs'
    K = K_all_cu.reshape((K_all_cu.shape[0],K_all_cu.shape[1]*K_all_cu.shape[2]))[0]
    R = R_all_c.reshape((R_all_c.shape[0],R_all_c.shape[1]*R_all_c.shape[2]))
else:
    K = K_all_c.reshape((K_all_c.shape[0],K_all_c.shape[1]*K_all_c.shape[2]))[0]
    K = np.append(K,['pre_updrs'],0)
    R = R_all_c.reshape((R_all_c.shape[0],R_all_c.shape[1]*R_all_c.shape[2]))


Allocated arrays
Created feature matrix
Created ROI matrix
Created feature label matrix
[[['Left red nucleus' 'Left red nucleus' 'Left red nucleus' ...
   'Left red nucleus' 'Left red nucleus' 'Left red nucleus']
  ['Right red nucleus' 'Right red nucleus' 'Right red nucleus' ...
   'Right red nucleus' 'Right red nucleus' 'Right red nucleus']
  ['Left substantia nigra' 'Left substantia nigra'
   'Left substantia nigra' ... 'Left substantia nigra'
   'Left substantia nigra' 'Left substantia nigra']
  ['Right Substantia nigra' 'Right Substantia nigra'
   'Right Substantia nigra' ... 'Right Substantia nigra'
   'Right Substantia nigra' 'Right Substantia nigra']
  ['Left subthalamic nucleus' 'Left subthalamic nucleus'
   'Left subthalamic nucleus' ... 'Left subthalamic nucleus'
   'Left subthalamic nucleus' 'Left subthalamic nucleus']
  ['Right subthalamic nucleus' 'Right subthalamic nucleus'
   'Right subthalamic nucleus' ... 'Right subthalamic nucleus'
   'Right subthalamic nucleus' 'Righ

In [5]:
# # Augment with CHH data
# X0_gt = np.load('/home/ali/RadDBS-QSM/data/npy/old/X0_gt_chh_rois.npy')
# df = pd.read_csv('/home/ali/RadDBS-QSM/data/xlxs/updrs_iii_chh.csv')
# # Patient IDs
# subject_id = np.asarray(df[df.columns[0]])[1:]
# # Data
# s_directory = open('/home/ali/RadDBS-QSM/data/roi/roi_list','r').read().splitlines()
# # Load
# with open('/home/ali/RadDBS-QSM/data/pickles/segs_chh', "rb") as fp:  
#     segs = pickle.load(fp)
#     n_cases = len(segs)
# with open('/home/ali/RadDBS-QSM/data/pickles/qsms_chh', "rb") as fp:  
#     qsms = pickle.load(fp)
# with open('/home/ali/RadDBS-QSM/data/phi/chh/Phi_mcl_gt_roi_chh', "rb") as fp:  
#         Phi_gt = pickle.load(fp)
# L = int(len(X0_gt)/n_cases)
# n_features = int(L/n_rois)
# # Only extract ROI if it is present in all cases
# seg_labels_all = segs[0]
# case_number = np.zeros_like(np.asarray(s_directory))
# for i in range(n_cases):
#     case_number[i] = float(s_directory[i][-2:])
# subject_id_corr = subject_id[np.in1d(subject_id,case_number)]
# for i in range(n_cases):
#     #try:
#         print('Found ROIs',str(np.unique(segs[i])),'at segmentation directory file',s_directory[i],'for case',str(subject_id_corr[i]))
#     #except:
#         print('Case',subject_id[i],'quarantined')
# pre_updrs_iii_off =  np.asarray(df[df.columns[3]][np.hstack((False,np.in1d(subject_id,subject_id_corr)))])                                
# pre_updrs_iii_on =  np.asarray(df[df.columns[4]][np.hstack((False,np.in1d(subject_id,subject_id_corr)))])
# post_updrs_iii_off =  np.asarray(df[df.columns[6]][np.hstack((False,np.in1d(subject_id,subject_id_corr)))])

# per_change = np.hstack((per_change,(np.asarray(pre_updrs_iii_off).astype(float)-np.asarray(post_updrs_iii_off).astype(float))/(np.asarray(pre_updrs_iii_off).astype(float))))
# pre_updrs_off = np.hstack((pre_updrs_off, pre_updrs_iii_off))
# X0_gt = X0_gt.reshape((n_cases,n_rois,n_features))[:,0:4,:]
# X_all_c = np.vstack((X_all_c,X0_gt[:,:,:-1]))
# lct_change = (np.asarray(pre_updrs_iii_off).astype(float)-(np.asarray(pre_updrs_iii_on)).astype(float))/(np.asarray(pre_updrs_iii_off).astype(float))
# pre_imp = np.hstack((pre_imp,lct_change))
# subject_id_corr=subject_id_corr+100
# subsc = np.hstack((subsc,subject_id_corr))

In [6]:
# nii_dir = '/home/ali/RadDBS-QSM/data/nii'
# qsms = []
# segs = []
# for j in np.arange(len(subsc)):
#     if subsc[j] < 10:
#         qsms.append(nii_dir+'/qsm/QSM_e10_imaginary_0'+str(int(subsc[j]))+'.nii.gz')
#         segs.append(nii_dir+'/seg/labels_2iMag0'+str(int(subsc[j]))+'.nii.gz')
#     else:
#         qsms.append(nii_dir+'/qsm/QSM_e10_imaginary_'+str(int(subsc[j]))+'.nii.gz')
#         segs.append(nii_dir+'/seg/labels_2iMag'+str(int(subsc[j]))+'.nii.gz')

# V, M, subs_err = util.roi_var(qsms,segs,[1,2,3,4,5,6])
# np.save('V.npy',V)
# np.save('U.npy',M)
# V = np.load('V.npy')
# M = np.load('U.npy')

In [7]:
scoring = 'r2'
results_bls = np.zeros_like(per_change)
results_ls = np.zeros_like(per_change)
results_gls = np.zeros_like(per_change)
gerror = np.zeros_like(per_change)
alphas = np.logspace(-4,4,100)
Ks = []
Kstg = []
w = []
wg = []

In [None]:
for j in np.arange(len(subsc)):
    test_id = subsc[j]
    test_index = subsc == test_id
    train_index = subsc != test_id
    X_train = X_all_c[train_index,:,:]
    X_test = X_all_c[test_index,:,:]
    y_train = per_change[train_index]
    y_test = per_change[test_index]
    cvn = 6
    X0_ss0,scaler_ss,X_test_ss0 = util.model_scale(skp.StandardScaler(),
                                                X_train,train_index,X_test,test_index,pre_updrs_off,False)
    with np.errstate(divide='ignore', invalid='ignore'):
      # Feature selection
      sel = skf.SelectKBest(skf.r_regression,k=1800)
      model = Earth(max_degree=3,use_fast=True,smooth=True)
      model.fit(X0_ss0,y_train)
      X0m_ss = model.transform(X0_ss0)
      print(model.predict(X_test_ss0))
      X0_ss = sel.fit_transform(np.append(X0_ss0,X0m_ss,axis=1),y_train)
      X_test_ss = sel.transform(np.append(X_test_ss0,model.transform(X_test_ss0),axis=1))

      # for k in np.arange(X0m_ss.shape[1]):
      #    lr = stats.linregress(X0m_ss[:,k],y_train)
      #    if lr.rvalue>0.4:
      #       print(lr.rvalue)
      #X0_ss = np.append(X0_ss,X0m_ss,axis=1)
      #X_test_ss = np.append(X_test_ss,model.transform(X_test_ss),axis=1)
      #Ks.append(sel.transform(K.reshape(1, -1)))
      y_n = cKDTree(X0_ss).query(X_test_ss, k=2)[1]

    # LASSO
    lasso = slm.LassoCV(max_iter=1e4,cv=cvn,n_jobs=-1)
    est_ls = lasso.fit(X0_ss,y_train)
    results_ls[j] = est_ls.predict(X_test_ss)
    w.append(est_ls.coef_)
    print('Lasso predicts',str(np.round(results_ls[j],2)),
          'for case with',str(np.round(per_change[j],2)),
          'with regularization',str(np.round(est_ls.alpha_,4)),
          'and neighbor ',str(y_train[y_n]))
 

[0.6012479]
Lasso predicts 0.44 for case with 0.48 with regularization 0.0001 and neighbor  [[0.08571429 0.14705882]]
[0.53739469]
Lasso predicts 0.54 for case with 0.97 with regularization 0.0001 and neighbor  [[0.87878788 0.52272727]]
[0.30615687]
Lasso predicts 0.57 for case with 0.75 with regularization 0.0001 and neighbor  [[0.1875     0.14705882]]
[0.42403041]
Lasso predicts 0.67 for case with 0.66 with regularization 0.0001 and neighbor  [[0.83928571 0.93939394]]
[-0.57187594]
Lasso predicts 0.83 for case with 0.74 with regularization 0.0046 and neighbor  [[0.88888889 0.68656716]]
[0.54934421]
Lasso predicts 0.45 for case with 0.15 with regularization 0.0001 and neighbor  [[0.67241379 0.55555556]]
[0.1000872]
Lasso predicts 0.85 for case with 0.85 with regularization 0.0001 and neighbor  [[0.69135802 0.51724138]]
[0.60331039]
Lasso predicts 0.38 for case with 0.53 with regularization 0.0001 and neighbor  [[0.42857143 0.30232558]]
[0.53084202]
Lasso predicts 0.63 for case with 0.

In [None]:
util.eval_prediction(np.vstack((pre_imp,
                               results_ls,
                               )),
                               per_change,
                               ['LCT',
                                'Lasso',
                                ],(30,5))
plt.ylim([0,2])
plt.xlim([0,2])
plt.style.use('default')