In [1]:
# Import libraries
import numpy as np
import util
import copy
import pandas as pd
import cv2
import os
from string import digits
from sdv.lite import SingleTablePreset
from sdv.metadata import SingleTableMetadata
from sdv.evaluation.single_table import get_column_plot

### Synthetic Data Vault workflow
<p align="center">
<img src="./svd_wf.PNG" width=1000 height=400/>
</p>

In [2]:
# Get case IDs
# np.loadtxt opens and closes the file itself, so no file handle is left open.
case_list_path = '/home/ali/RadDBS-QSM/data/docs/cases_90'
# ndmin=1 keeps the result iterable even when the list holds a single entry
lists = np.loadtxt(case_list_path,comments="#",delimiter=",",unpack=False,dtype=str,ndmin=1)
# Characters [-9:-7] of every entry are taken as the case ID
case_id = [entry[-9:-7] for entry in lists]

# Load scores
file_dir = '/home/ali/RadDBS-QSM/data/docs/QSM anonymus- 6.22.2023-1528_wldd.csv'
motor_df = util.filter_scores(file_dir,'pre-dbs updrs','stim','pre op levadopa equivalent dose (mg)','CORNELL ID')
# Find cases with all required scores
subs,pre_imp,post_imp,pre_updrs_off,ledd = util.get_full_cases(motor_df,
                                                          'CORNELL ID',
                                                          'OFF (pre-dbs updrs)',
                                                          'ON (pre-dbs updrs)',
                                                          'OFF meds ON stim 6mo',
                                                          'pre op levadopa equivalent dose (mg)')

# Load extracted features
npy_dir = '/home/ali/RadDBS-QSM/data/npy/'
phi_dir = '/home/ali/RadDBS-QSM/data/phi/phi/'
roi_path = '/data/Ali/atlas/mcgill_pd_atlas/PD25-subcortical-labels.csv'
n_rois = 6
all_rois = False
# NOTE(review): 1595 is presumably the number of features per ROI — confirm against util.load_featstruct
Phi_all, X_all, R_all, K_all, ID_all = util.load_featstruct(phi_dir,npy_dir+'X/',npy_dir+'R/',npy_dir+'K/',n_rois,1595,all_rois)
ids = np.asarray(ID_all).astype(int)
# Find overlap between scored subjects and feature extraction cases
c_cases = np.intersect1d(np.asarray(case_id).astype(int),np.asarray(subs).astype(int))
# Complete case indices with respect to feature matrix
# (np.isin replaces the deprecated np.in1d; ravel() keeps the mask 1-D as in1d did)
c_cases_idx = np.isin(ids,c_cases).ravel()

# Append a random integer in [0, 1e6) to every feature key.
# A seeded generator makes the resulting key names identical on every run.
KEY_SUFFIX_SEED = 0
key_rng = np.random.default_rng(KEY_SUFFIX_SEED)
key_suffix = key_rng.integers(low=0,high=10**6,size=K_all.shape)
K_all = np.char.add(K_all,key_suffix.astype(str))
X_all_c, K, R, subsc, pre_imp, pre_updrs_off, per_change, ledd = util.re_index(X_all,K_all,R_all,c_cases_idx,subs,ids,all_rois,pre_imp,pre_updrs_off,post_imp,ledd)


### Generative model
For a table $T \ [=] \ m \times n$ with $m$ patients and $n$ features, we want to fit the model $\Phi_{\Sigma}[\Phi^{-1}(F_i(X_i))]$, in which each column $X_i$, $i \ \in \ [1,...,n]$, of table $T$ is modeled by a Gaussian copula $\Phi^{-1}(F_i(X_i))$ \
At each column $i$, a probability distribution over the feature can be modeled with covariance $\Sigma$ describing the effect of feature $i$ on feature $i'$. \
The shape of the distribution of each feature is described by the cumulative distribution function (CDF) $F_i$. Possible distributions are: 
* Gaussian $\mathcal{N}(\mu,\sigma^2)$
* Truncated Gaussian $\mathcal{N}(\mu,\sigma^2,g_{min},g_{max})$
* Uniform $\mathcal{U}(g_{min},g_{max})$
* Beta $\mathcal{B}(\alpha,\beta)$
* Exponential $\mathcal{E}(\lambda)$

The Kolmogorov-Smirnov test can be used to select which candidate distribution best fits each column. \
Since the covariance of the raw columns is biased by the shape of each column's distribution, a multivariate Gaussian copula is used to determine inter-feature effects

### Database modeling
<p align="center">
<img src="./sdv_cpa.PNG" width=1000 height=300/>
</p>

### Gaussian copula
The probability integral transform enables any sample of random variables drawn from a continuous distribution to be expressed in terms of random variables with a standard uniform distribution, $\mathcal{U}(0,1)$. A Gaussian copula arises from applying this transform to a multivariate normal distribution over $\mathbb{R}^d$, which gives a distribution over the unit hypercube $[0,1]^d$. For any correlation matrix $R \ \in \ [-1,1]^{d \times d}$, the Gaussian copula $C$ is

$$C_{R}(u) = \Phi_R(\Phi^{-1}(u_1),...,\Phi^{-1}(u_d))$$

Where $\Phi_R$ is the joint cumulative distribution function of the multivariate normal $\mathcal{N}(0,R)$ and $\Phi^{-1}$ is the inverse cumulative distribution function of the standard (univariate) normal. The corresponding copula density is: 

$$c_{R}(u)=\frac{1}{\sqrt{\mathrm{det}(R)}}\exp\left(-\frac{1}{2}\,\Phi^{-1}(\vec{u})^T\,(R^{-1}-I)\,\Phi^{-1}(\vec{u})\right)$$

From the <i>Generative model</i> section, let $Y_i = \Phi^{-1}(F_i(X_i))$ for each column $i \ \in \ [1,...,n]$, so that 

$$Y = [\Phi^{-1}(F_1(X_1)) \ ... \ \Phi^{-1}(F_n(X_n))]$$

Then covariance $\Sigma$ can be computed over the transformed values in the table

### Sampling
For a table $T$ with CDFs $F$ and covariance $\Sigma$:
* Find the Cholesky decomposition $LL^T=\Sigma$ 
* Sample $v \sim \mathcal{N}(0, I_n)$, with $v \ [=] \ n \times 1$
* Apply transformation $u=Lv$ in copula space
* Return to table space $x = [F^{-1}_1(\Phi(u_1)) \ ... \ F^{-1}_n(\Phi(u_n))]$

In [19]:
# Assemble the real-data table: one row per patient, laid out as
# [id | flattened features | pre_updrs_off | ledd | per_change].
features_flat = X_all_c.reshape(X_all_c.shape[0],-1)
T = np.concatenate([features_flat,
                    pre_updrs_off.reshape(-1,1),
                    ledd.reshape(-1,1),
                    per_change.reshape(-1,1)],axis=1)
# np.insert casts subsc to T's dtype, so the ID column is numeric like the rest
T = np.insert(T,0,subsc,axis=1)

# Column names: 'id' first, the keys in K, then 'per_change' last
Kt = np.insert(np.append(K,['per_change']),0,['id'])

# Report zero-variance columns by name
for j in range(T.shape[1]):
    if np.var(T[:,j]) == 0:
        print('Constant entries at '+Kt[j])

T = pd.DataFrame(data=T,columns=Kt)
# astype returns a new frame; assign it so T really stores integer IDs
# and the dtypes displayed below describe T itself.
T = T.astype({'id': 'int32'})
T.dtypes

Constant entries at exponential_firstorder_Uniformit981112
Constant entries at exponential_glcm_Autocorrelation721117
Constant entries at exponential_glcm_ClusterProminen711668
Constant entries at exponential_glcm_ClusterShade550751
Constant entries at exponential_glcm_ClusterTendency411026
Constant entries at exponential_glcm_Contrast685026
Constant entries at exponential_glcm_Correlation468298
Constant entries at exponential_glcm_DifferenceAvera502337
Constant entries at exponential_glcm_DifferenceEntro400902
Constant entries at exponential_glcm_DifferenceVaria842049
Constant entries at exponential_glcm_Id699131
Constant entries at exponential_glcm_Idm529654
Constant entries at exponential_glcm_Idmn704958
Constant entries at exponential_glcm_Idn108588
Constant entries at exponential_glcm_Imc1350709
Constant entries at exponential_glcm_Imc2476138
Constant entries at exponential_glcm_InverseVariance811856
Constant entries at exponential_glcm_JointAverage916089
Constant entries at expon

id                                        int32
original_shape_Elongation42762          float64
original_shape_Flatness823231           float64
original_shape_LeastAxisLength810740    float64
original_shape_MajorAxisLength631170    float64
                                         ...   
wavelet-LLL_ngtdm_Contrast549248        float64
wavelet-LLL_ngtdm_Strength479254        float64
pre_updrs                               float64
ledd                                    float64
per_change                              float64
Length: 9574, dtype: object

In [22]:
def _describe_table(table):
    """Build validated single-table metadata for `table`.

    Column types are auto-detected from the dataframe, then the 'id' column
    is marked with sdtype 'id' and set as the primary key.

    Parameters
    ----------
    table : pandas.DataFrame
        Table to describe; must contain an 'id' column.

    Returns
    -------
    SingleTableMetadata
        Metadata that has passed `validate()`.
    """
    table_metadata = SingleTableMetadata()
    table_metadata.detect_from_dataframe(table)
    table_metadata.update_column(
        column_name='id',
        sdtype='id')
    table_metadata.set_primary_key(
        column_name='id')
    table_metadata.validate()
    return table_metadata


# resample = 1: fit the synthesizer on T, sample, and refresh the on-disk cache
# resample = 0: reuse the cached synthetic table
resample = 0
N_SYNTHETIC_ROWS = 47  # number of synthetic rows to draw
if resample == 1:
    # The metadata has to describe T before the synthesizer is built from it
    metadata = _describe_table(T)
    synthesizer = SingleTablePreset(metadata, name='FAST_ML')
    synthesizer.fit(data=T)
    Ts = synthesizer.sample(N_SYNTHETIC_ROWS)
    Ts.to_pickle('Ts_df') 
    Xs = Ts.to_numpy()
    np.save('Xs.npy',Xs)
else:
    Xs = np.load('Xs.npy')
    # NOTE: unpickling can execute arbitrary code; only load a cache written by this notebook
    Ts = pd.read_pickle('Ts_df')
    # Give the real table the cached column names so both frames can be
    # addressed with the same column label
    T.columns = Ts.columns
    metadata = _describe_table(Ts)

In [23]:
# Preview the first rows of the synthetic table instead of rendering the whole frame
Ts.head(10)

Unnamed: 0,id,original_shape_Elongation851329,original_shape_Flatness101531,original_shape_LeastAxisLength360161,original_shape_MajorAxisLength200615,original_shape_Maximum2DDiameter443053,original_shape_Maximum2DDiameter328842,original_shape_Maximum2DDiameter990709,original_shape_Maximum3DDiameter626442,original_shape_MeshVolume466256,...,wavelet-LLL_glszm_ZonePercentage996115,wavelet-LLL_glszm_ZoneVariance877251,wavelet-LLL_ngtdm_Busyness398725,wavelet-LLL_ngtdm_Coarseness88272,wavelet-LLL_ngtdm_Complexity574527,wavelet-LLL_ngtdm_Contrast911559,wavelet-LLL_ngtdm_Strength669181,pre_updrs,ledd,per_change
0,0.0,0.767864,0.685151,6.067052,8.767121,8.698159,8.621633,10.638923,10.844383,250.654435,...,0.285675,238.430693,0.029076,0.017675,872.471144,0.127636,11.165496,57.330922,1241.712778,0.841413
1,1.0,0.880267,0.687223,5.25384,7.680813,8.833452,8.077747,9.187254,9.513149,176.219013,...,0.348451,241.636983,0.244806,0.023918,1193.289156,0.166743,17.564662,42.989462,605.215158,0.707475
2,2.0,0.767467,0.634389,6.270128,9.769491,9.341287,10.302373,12.240797,11.732644,311.583653,...,0.251573,1.16191,0.029076,0.026952,694.797398,0.170084,8.558438,20.979911,1323.213063,0.191737
3,3.0,0.817668,0.647136,5.653218,8.602779,9.58212,8.888731,10.248737,10.821839,241.787538,...,0.258378,615.76281,0.251322,0.020232,768.222729,0.163478,10.920884,35.488737,1496.626763,0.1677
4,4.0,0.789471,0.599951,5.137487,8.680911,8.684956,8.749415,9.848921,10.684362,198.088876,...,0.173445,522.582531,0.230548,0.021787,371.466708,0.109503,9.352052,40.334784,1113.433088,0.742425
5,5.0,0.745132,0.698034,6.603383,9.374389,8.705614,10.524868,10.727974,11.377603,304.078155,...,0.097888,499.081593,0.370325,0.020703,78.689209,0.083671,3.784343,49.799171,341.353246,0.717646
6,6.0,0.849649,0.737956,6.086823,8.039902,8.633612,8.714795,9.641888,10.062488,237.210473,...,0.224262,589.604023,0.334624,0.021497,462.132895,0.106052,6.592694,44.655246,736.946536,0.298405
7,7.0,0.759709,0.580571,5.570261,9.593971,9.480401,8.981882,11.138728,11.665436,263.607468,...,0.329322,1.16191,0.13101,0.027378,1410.804916,0.180434,16.08924,58.114106,930.42722,0.617245
8,8.0,0.825934,0.693463,5.852427,8.344645,9.53766,8.569047,9.530458,10.155201,233.142178,...,0.304429,9.404133,0.36245,0.022239,1072.24875,0.241202,12.441647,50.343533,878.193397,0.967742
9,9.0,0.691493,0.622254,5.767194,9.507308,7.925378,9.207196,11.038344,11.620432,236.668342,...,0.4095,1.16191,0.264207,0.02757,1869.126654,0.178979,21.525095,79.126895,276.264309,0.752995


In [24]:
# rewrite = 1 regenerates one real-vs-synthetic distribution plot per column
# (column 0, the id, is skipped) and saves each as ./distributions/fig<j>.jpeg
rewrite = 0
if rewrite == 1:
    # Create the output folder up front so a missing directory is not
    # reported as a per-feature plotting failure
    os.makedirs('./distributions',exist_ok=True)
    for j in np.arange(1,Kt.shape[0]):
        try:
            fig = get_column_plot(
                real_data=T,
                synthetic_data=Ts,
                column_name=Ts.columns[j],
                metadata=metadata,
            )
            fig.write_image("./distributions/fig"+str(j)+".jpeg",width=1000, height=500)
        except np.linalg.LinAlgError:
            print('Singular covariance matrix at feature',Kt[j])
        except Exception as err:
            # Best-effort export: keep going, but report what actually failed.
            # KeyboardInterrupt is not caught, so the loop can still be stopped.
            print('Could not plot feature',Kt[j],'-',repr(err))

In [25]:
# Show the real-vs-synthetic distribution for a single column (position 10 in Ts)
preview_column = Ts.columns[10]
fig = get_column_plot(real_data=T, synthetic_data=Ts, metadata=metadata, column_name=preview_column)
fig.show()

In [26]:
image_folder = 'distributions'
video_name = 'dists.avi'


def _frame_index(filename):
    """Return the integer formed by the digits in `filename` ('fig12.jpeg' -> 12), or -1 if it has none."""
    number = ''.join(ch for ch in filename if ch.isdigit())
    return int(number) if number else -1


# os.listdir returns names in arbitrary order; sort numerically so that
# fig2 comes before fig10 and the frames follow the column order.
images = sorted((img for img in os.listdir(image_folder) if img.endswith(".jpeg")),
                key=lambda name: (_frame_index(name), name))
if not images:
    raise FileNotFoundError('No .jpeg frames found in '+image_folder)

# The first frame fixes the video dimensions
frame = cv2.imread(os.path.join(image_folder, images[0]))
if frame is None:
    raise ValueError('Could not read '+os.path.join(image_folder, images[0]))
height, width, layers = frame.shape
video = cv2.VideoWriter(video_name, 0, 10, (width,height))

try:
    for image in images:
        next_frame = cv2.imread(os.path.join(image_folder, image))
        # cv2.imread returns None instead of raising when a file cannot be decoded
        if next_frame is None:
            print('Skipping unreadable image',image)
            continue
        video.write(next_frame)
finally:
    # Release the writer even if a frame fails, so the file is closed
    video.release()

cv2.destroyAllWindows()