In [None]:
# Import libraries
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
import nibabel as nib
from sklearn.svm import SVR
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import r2_score
import SimpleITK as sitk
import six
from radiomics import featureextractor 
import numpy as np
import os
import pickle
import pandas as pd
import logging
from scipy.stats import linregress
import smogn
import pandas
from collections import Counter
from multiprocessing import Pool
from notebook import notebookapp
from numpy import matlib
import random
from scipy import ndimage
from util import pyvis
from util import extract
from loader import data_loader
from IPython.display import HTML
import util
import smogn
from smogn.phi import phi
from smogn.phi_ctrl_pts import phi_ctrl_pts
import warnings

In [None]:
# CSS overrides for the notebook front end: black background for
# `.jupyter-matplotlib` containers, white text for widget labels and the
# matplotlib header, dark grey buttons with white text.
# (presumably targets the interactive matplotlib widget canvas - confirm)
widget_dark_css = '''
<style>
.jupyter-matplotlib {
    background-color: #000;
}

.widget-label, .jupyter-matplotlib-header{
    color: #fff;
}

.jupyter-button {
    background-color: #333;
    color: #fff;
}
</style>
'''
# Last expression of the cell, so the notebook renders (injects) the style block
HTML(widget_dark_css)

In [None]:
# Load data
reload = 0
suffix = '90'
segs, qsms, n_cases, case_list = data_loader('/media/mts_dbs/dbs/all/nii/qsm/',
                                             '/media/mts_dbs/dbs/all/nii/seg/',
                                             reload,suffix,'QSM_e10_imaginary_')

# Get case IDs: the case list is re-read by file name and every entry is sliced
# at [-9:-7] to pull out the ID. np.atleast_1d keeps a single-entry list iterable
# (np.loadtxt returns a 0-d array in that case).
lists = np.atleast_1d(np.loadtxt(case_list.name,comments="#", delimiter=",",unpack=False,dtype=str))
case_id = [entry[-9:-7] for entry in lists]

# Load scores
file_dir = '/data/Ali/RadDBS-QSM/src/csv/QSM anonymus- 6.22.2023-1528.csv'
motor_df = util.filter_scores(file_dir,'pre-dbs updrs','stim','CORNELL ID')
# Find cases with all required scores
subs,pre_imp,post_imp,pre_updrs_off = util.get_full_cases(motor_df,
                                                          'CORNELL ID',
                                                          'OFF (pre-dbs updrs)',
                                                          'ON (pre-dbs updrs)',
                                                          'OFF meds ON stim 6mo')
# Load extracted features
npy_dir = '/media/mts_dbs/dbs/all/npy/'
phi_dir = '/media/mts_dbs/dbs/all/phi/'
roi_path = '/data/Ali/atlas/mcgill_pd_atlas/PD25-subcortical-labels.csv'
n_rois = 6
Phi_all, X_all, R_all, K_all, ID_all = util.load_featstruct(phi_dir,npy_dir+'X/',npy_dir+'R/',npy_dir+'K/',n_rois,1595)
ids = np.asarray(ID_all).astype(int)
# Integer view of the scored subject IDs, used for every ID comparison below so
# that both membership tests compare like with like
subs_int = np.asarray(subs).astype(int)
# Find overlap between scored subjects and feature extraction cases
c_cases = np.intersect1d(np.asarray(case_id).astype(int),subs_int)
# Complete case indices with respect to feature matrix
# (np.isin replaces the deprecated np.in1d; identical result for 1-D input)
c_cases_idx = np.isin(ids,c_cases)
X_all_c = X_all[c_cases_idx,:,:]
K_all_c = K_all[c_cases_idx,:,:]
R_all_c = R_all[c_cases_idx,:,:]
# Re-index the scored subjects with respect to complete cases
s_cases_idx = np.isin(subs_int,ids[c_cases_idx])
# NOTE(review): feature rows keep the order of `ids` while the scores keep the
# order of `subs`; row-wise pairing downstream assumes both orders agree - verify.
pre_imp = pre_imp[s_cases_idx]
post_imp = post_imp[s_cases_idx]
pre_updrs_off = pre_updrs_off[s_cases_idx]
# Outcome used by the following cells
per_change = post_imp

The Unified Parkinson's Disease Rating Scale (UPDRS) is divided into four parts. Part 3 (UPDRS-III) assesses motor symptoms, and its score ranges over $[0, 132]$ [1]. Starting from the real data, we construct a set of simulated data spanning this range. The radiomic feature most correlated with the outcome will be used to simulate a range of inputs.

In [None]:
# Ignore pandas performance warnings and runtime warnings from here on
for ignored_category in (pd.errors.PerformanceWarning, RuntimeWarning):
    warnings.simplefilter(action='ignore', category=ignored_category)

# Build the feature matrix from the complete-case features and `pre_updrs_off`;
# the helper also returns a second object, kept as `oscaler`
# (presumably the scaler it fitted - confirm in util.make_feature_matrix)
X_t, oscaler = util.make_feature_matrix(X_all_c, pre_updrs_off)
# Regression target used by the following cells
y = per_change



In [None]:
# Screen every column of the feature matrix for a linear relationship with the
# outcome and collect the column indices whose Pearson r exceeds 0.4 in `Js`.
#
# A figure is created only for a selected feature, and only when `show_plots`
# is enabled. Creating one figure per column (as before) leaves thousands of
# empty, never-closed figures open.
Js = []
# Set to True to draw the scatter + fitted line for each selected feature
show_plots = False
# Loop-invariant, so set once; also applies to figures created by later cells
plt.rcParams["figure.figsize"] = (25,5)
for j in np.arange(X_t.shape[1]):
    lr_prepost = linregress(X_t[:,j],per_change)

    # NOTE(review): only positive correlations pass this test; use
    # abs(lr_prepost.rvalue) if strong negative correlations should count too.
    if lr_prepost.rvalue>0.4:
        Js.append(j)
        print('Feature',str(j),K_all_c[1,1,j],'in ROI',R_all_c[1,1,j],'has correlation',str(lr_prepost.rvalue))
        if show_plots:
            fig, ax = plt.subplots()
            # Annotation with the fitted line, r and p (built before it is drawn)
            text = (f"$y={lr_prepost.slope:0.3f}\\;x{lr_prepost.intercept:+0.3f}$\n"
                    f"$r = {lr_prepost.rvalue:0.3f}$\n"
                    f"$p = {lr_prepost.pvalue:0.3f}$")
            ax.scatter(X_t[:,j],per_change)
            ax.plot(X_t[:,j],X_t[:,j]*lr_prepost.slope+lr_prepost.intercept,'-r')
            ax.text(0.05, 0.95, text, transform=ax.transAxes,
                    fontsize=14, verticalalignment='top')
            ax.set_title(f"Feature {j}")
            ax.set_ylabel("DBS improvement")
            ax.set_xlabel("Feature")
            plt.show()


In [None]:
# Relevance control points: the smallest observed outcome is given relevance 1
# and the mean outcome relevance 0.
yo = np.min(per_change)
yu = np.mean(per_change)
Rmo = 1
Rmu = 0
# Create data frame for SMOGN generation: feature columns first, target last
n_cases = len(y)
D = pd.DataFrame(np.hstack((X_t,(np.asarray(y).reshape(n_cases,1)))))
# The target is handed to smoter by its string column name, so label every
# column with the string form of its position
D.columns = D.columns.astype(str)
# Specify phi relevance values
# Each row is [target value, relevance, 0]
# (third entry presumably the slope at the control point - confirm in smogn docs)
Rm = [[yo,  Rmo,    0],
        [yu,  Rmu,    0]]
d = len(D.columns)
yi = pd.DataFrame(D[str(d-1)])
# Pre-index targets
idx = pd.Index((yi.values).ravel())
# Get sorted indices: idx[0] holds the sorted targets, idx[1] the sorting indexer
idx = idx.sort_values(return_indexer=True)
# Sort targets in ascending order
y_sort = yi.sort_values(by=str(d-1))
y_sort = y_sort[str(d-1)]
# Generate relevance function
phi_params = phi_ctrl_pts(y = y_sort,
    method = 'manual',
    ctrl_pts = Rm
)
y_phi = phi(y = y_sort,
ctrl_pts = phi_params
)
# Verify sample size reduction using default threshold
t = 0.5
N_us = np.sum(np.asarray(y_phi)>t)
# NOTE(review): (idx[1]+1) is always positive, so this mask is simply
# (y_phi <= t) in sorted-target order, not in original row order. It is only
# referenced by the disabled check below.
idx_kept = (np.asarray(y_phi)<=t)*(idx[1]+1) > 0
# Conduct SMOGN
# NOTE(review): the resampling is presumably stochastic and no seed is set in
# this cell, so results may differ between runs.
print('Prior to SMOGN sampling, mean is',X_t.mean(),'standard deviation is',X_t.std())
X_smogn = smogn.smoter(data = D, y = str(D.columns[-1]),rel_method='manual',rel_ctrl_pts_rg = Rm,pert=0.02)
X_smogn = np.asarray(X_smogn)
# Disabled sanity checks, kept for reference:
# if np.sum(np.sqrt((X_t[idx_kept,:]-X_smogn[:,:-1]))**2) < 1e-16:
#     print('Synthetic data and input data are identical')
# if X_t.shape[0]-X_smogn.shape[0] == N_us:
#     print('New dataset size verified')
# At this point the last column of X_smogn is still the target, hence [:,:-1]
print('After SMOGN sampling, mean is',X_smogn[:,:-1].mean(),'standard deviation is',X_smogn[:,:-1].std())
y_smogn = X_smogn[:,-1]
sscaler = StandardScaler()
X_smogn = sscaler.fit_transform(X_smogn[:,:-1])
# X_smogn now holds features only (the target column was split off above), so
# summarise every column; slicing [:,:-1] here would drop the last feature.
print('After rescaling, SMOGN mean is',X_smogn.mean(),'standard deviation is',X_smogn.std())

[1] R. Balestrino, “Applications of the European Parkinson’s Disease Association sponsored Parkinson’s Disease Composite Scale (PDCS)”, npj Parkinson's Disease, vol. 5, no. 1, 2019, doi: 10.1038/s41531-019-0097-1.