In [1]:
# Import libraries
import numpy as np
import util
import copy
import pandas as pd

### Synthetic Data Vault workflow
<p align="center">
<img src="./svd_wf.PNG" width=1000 height=400/>
</p>

In [2]:
# Get case IDs
# Read the case list inside a context manager so the file handle is closed as
# soon as parsing finishes (the handle was previously left open for the whole
# kernel session). `case_list` stays bound afterwards, so `case_list.name`
# remains available.
with open('/home/ali/RadDBS-QSM/data/docs/cases_90', 'r') as case_list:
    # ndmin=1 keeps `lists` iterable even when the file holds a single entry
    lists = np.loadtxt(case_list, comments="#", delimiter=",", unpack=False, dtype=str, ndmin=1)
# The case ID is the two-character slice [-9:-7] of each entry
case_id = [entry[-9:-7] for entry in lists]

# Load scores
file_dir = '/home/ali/RadDBS-QSM/data/docs/QSM anonymus- 6.22.2023-1528_wldd.csv'
# Column labels that are passed to more than one helper call below
id_column = 'CORNELL ID'
ledd_column = 'pre op levadopa equivalent dose (mg)'
motor_df = util.filter_scores(
    file_dir,
    'pre-dbs updrs',
    'stim',
    ledd_column,
    id_column,
)
# Find cases with all required scores
full_cases = util.get_full_cases(
    motor_df,
    id_column,
    'OFF (pre-dbs updrs)',
    'ON (pre-dbs updrs)',
    ledd_column,
    'OFF meds ON stim 6mo',
)
subs, pre_imp, post_imp, pre_updrs_off, ledd = full_cases
# Load extracted features
npy_dir = '/home/ali/RadDBS-QSM/data/npy/'
phi_dir = '/home/ali/RadDBS-QSM/data/phi/phi/'
roi_path = '/data/Ali/atlas/mcgill_pd_atlas/PD25-subcortical-labels.csv'
n_rois = 6
all_rois = False
# Sub-directories of npy_dir handed to the loader, in X, R, K order
x_dir, r_dir, k_dir = [npy_dir + sub_dir for sub_dir in ('X/', 'R/', 'K/')]
# NOTE(review): the meaning of the literal 1595 is not visible here — confirm
# against util.load_featstruct and consider naming it
Phi_all, X_all, R_all, K_all, ID_all = util.load_featstruct(
    phi_dir,
    x_dir,
    r_dir,
    k_dir,
    n_rois,
    1595,
    all_rois,
)
# Integer copy of the loader's IDs, used for matching against the scored subjects
id_array = np.asarray(ID_all)
ids = id_array.astype(int)
# Find overlap between scored subjects and feature extraction cases
feature_case_ids = np.asarray(case_id).astype(int)
scored_case_ids = np.asarray(subs).astype(int)
c_cases = np.intersect1d(feature_case_ids, scored_case_ids)
# Complete case indices with respect to feature matrix.
# np.isin replaces np.in1d, which is deprecated as of NumPy 2.0; .ravel()
# reproduces the flat boolean mask that np.in1d returned.
c_cases_idx = np.isin(ids, c_cases).ravel()

# Re-index features and scores to the complete cases; note that pre_imp,
# pre_updrs_off and ledd are rebound to the values returned by util.re_index
X_all_c, K, R, subsc, pre_imp, pre_updrs_off, per_change, ledd = util.re_index(
    X_all, K_all, R_all, c_cases_idx, subs, ids, all_rois,
    pre_imp, pre_updrs_off, post_imp, ledd)
# T is the name used for the table in the markdown sections that follow
T = X_all_c

### Generative model
For a table $T \ [=] \ m \times n$ with $m$ patients and $n$ features, we want to uncover the model $\Phi_{\Sigma}[\Phi^{-1}(F_i(X_i))]$, in which each column $X_i$, $i \ \in \ [1,...,n]$, of table $T$ is modeled through the Gaussian copula transform $\Phi^{-1}(F_i(X_i))$. \
At each column $i$, a probability distribution over the feature can be modeled with covariance $\Sigma$ describing the effect of feature $i$ on feature $i'$. \
The shape of the distribution of each feature is described by the cumulative distribution function (CDF) $F_i$. Possible distributions are: 
* Gaussian $\mathcal{N}(\mu,\sigma^2)$
* Truncated Gaussian $\mathcal{N}(\mu,\sigma^2,g_{min},g_{max})$
* Uniform $\mathcal{U}(g_{min},g_{max})$
* Beta $\mathcal{B}(\alpha,\beta)$
* Exponential $\mathcal{E}(\lambda)$

The Kolmogorov-Smirnov test can be used to assess how well each candidate distribution fits a feature. \
Since the covariance is biased by the choice of distribution, multivariate Gaussian copulas are used to determine inter-feature effects.

### Database modeling
<p align="center">
<img src="./sdv_cpa.PNG" width=1000 height=300/>
</p>

### Gaussian copula
The probability integral transform enables any sample of random variables drawn from a continuous distribution to be expressed in terms of random variables with a standard uniform distribution, $\mathcal{U}(0,1)$. A Gaussian copula arises from this transform as a distribution over the unit hypercube $[0,1]^d$ constructed from a multivariate normal distribution over $\mathbb{R}^d$. For any correlation matrix $R \ \in \ [-1,1]^{d \times d}$, the Gaussian copula $C$ is

$$C_{R}(u) = \Phi_R(\Phi^{-1}(u_1),...,\Phi^{-1}(u_d))$$

where $\Phi_R$ is the joint cumulative distribution function of the multivariate normal $\mathcal{N}(0,R)$ and $\Phi^{-1}$ is the inverse cumulative distribution function of the univariate standard normal $\mathcal{N}(0,1)$. The corresponding copula density is:

$$c_{R}(u)=\frac{1}{\sqrt{\mathrm{det}(R)}}e^{-\frac{1}{2}\left(\Phi^{-1}(\vec{u})^T\right)\cdot(R^{-1}-I)\left(\Phi^{-1}(\vec{u})\right)}$$

From the <i>Generative model</i> section, let $Y_i = \Phi^{-1}(F_i(X_i))$ so that 

$$Y = [\Phi^{-1}(F_1(X_1)) \ ... \ \Phi^{-1}(F_n(X_n))]$$

The covariance $\Sigma$ can then be computed over the transformed values $Y$ in the table.

### Sampling
For a table $T$ with CDFs $F$ and covariance $\Sigma$:
* Find the Cholesky decomposition $LL^T=\Sigma$ 
* Sample a vector of independent standard normal values $v \sim \mathcal{N}(0, I_n)$, with $v \ [=] \ n \times 1$
* Apply transformation $u=Lv$ in copula space
* Return to table space $x = [F^{-1}_1(\Phi(u_1)) \ ... \ F^{-1}_n(\Phi(u_n))]$

In [None]:
# Build a Synthetic Data Vault single-table synthesizer with the 'FAST_ML'
# preset and fit it to the real data.
# NOTE(review): `metadata` and `real_data` are not defined in any cell visible
# above, so on a fresh kernel this cell raises NameError unless they are
# created elsewhere. Presumably `real_data` is meant to be a DataFrame built
# from table `T` and `metadata` its SDV metadata — confirm and define them
# before this cell.
# NOTE(review): this import sits apart from the import cell at the top of the
# notebook; consider moving it there.
from sdv.lite import SingleTablePreset

synthesizer = SingleTablePreset(metadata, name='FAST_ML')
synthesizer.fit(data=real_data)