# Synthetic data generation

In [None]:
import os

import matplotlib.pyplot as plt
# Render inline figures at 'retina' resolution.
%config InlineBackend.figure_format = 'retina'
# Typeset all figure text with LaTeX (needs a LaTeX installation on the
# machine running the notebook); amsmath is loaded for the \text{...}
# commands used in the figure labels further down.
plt.rcParams.update({'text.usetex': True,
                     'text.latex.preamble': r'\usepackage{amsmath}',
                     'font.family': 'serif'})
import numpy as np
# Seed NumPy's global random number generator.
# NOTE(review): presumably this is meant to make the synthetic-data sampling
# repeatable -- confirm that the SDV model actually draws from this generator.
np.random.seed(0)
import pandas as pd
from sdv.constraints import Inequality, ScalarRange
from sdv.tabular import GaussianCopula, CTGAN
from copulas.univariate import UniformUnivariate, GaussianKDE
import seaborn as sns
# Seaborn theme: tick marks on the axes and a serif font to match matplotlib.
sns.set(style='ticks', font='serif')

In [None]:
# Folder with the processed data sets: one level above the current working
# directory, under data/processed.
PATH_PROC_DATA = os.path.join(os.pardir, 'data', 'processed')

# Load the cleaned data set into a DataFrame.
clean_csv = os.path.join(PATH_PROC_DATA, 'pDeltaT_clean.csv')
clean_data = pd.read_csv(clean_csv)

In [None]:
# Columns described to the synthesizer. All of them are declared as numerical
# floats and all of them are modelled with the same marginal distribution, so
# both mappings below are derived from this single list (order is preserved).
float_columns = [
    'd [mm]',
    'f [GHz]',
    'pPDn [W/m2]',
    'pPDtot [W/m2]',
    'psPDn_1 [W/m2]',
    'psPDtot_1 [W/m2]',
    'psPDn_4 [W/m2]',
    'psPDtot_4 [W/m2]',
    'pDeltaT * 100 [°C]',
]

# Inequality constraints: within each pair, the 'psPDn' column is the low
# column and the matching 'psPDtot' column is the high column.
n_tot_rel_1 = Inequality(
    low_column_name='psPDn_1 [W/m2]',
    high_column_name='psPDtot_1 [W/m2]',
)
n_tot_rel_4 = Inequality(
    low_column_name='psPDn_4 [W/m2]',
    high_column_name='psPDtot_4 [W/m2]',
)

# Table description: one independent field-spec dict per column, plus the
# two constraints defined above.
table_metadata = {
    'fields': {column: {'type': 'numerical', 'subtype': 'float'}
               for column in float_columns},
    'constraints': [n_tot_rel_1, n_tot_rel_4],
}

# Marginal distribution class to use for every column.
field_distributions = dict.fromkeys(float_columns, GaussianKDE)

In [None]:
# Reuse a previously generated synthetic data set when it is cached on disk;
# otherwise fit the copula on the clean data, sample from it and cache the
# result for the next run.
syn_data_path = os.path.join(PATH_PROC_DATA, 'pDeltaT_synthetic.csv')
try:
    syn_data = pd.read_csv(syn_data_path)
except FileNotFoundError as e:
    # Only a missing cache file triggers regeneration. Any other failure
    # (unreadable or malformed CSV, permission problems, ...) is raised so
    # that an existing file is never silently replaced.
    print(e, '\nGenerating synthetic data...')
    copula = GaussianCopula(table_metadata=table_metadata,
                            field_distributions=field_distributions,
                            learn_rounding_scheme=False)
    copula.fit(clean_data)
    syn_data = copula.sample(num_rows=10_000)
    syn_data.to_csv(syn_data_path, index=False)

## Quality report

In [None]:
from sdmetrics.reports.single_table import QualityReport

In [None]:
# Build the quality report comparing real and synthetic data. The
# 'pDeltaT [°C]' column is removed from the real data first; it is not one
# of the fields listed in `table_metadata`.
report = QualityReport()
real_data = clean_data.drop(columns='pDeltaT [°C]')
report.generate(
    real_data=real_data,
    synthetic_data=syn_data,
    metadata=table_metadata,
)

### Column shapes

The similarity of a real column vs. a synthetic column in terms of the column shapes - the marginal distribution of that column - is measured by using the Kolmogorov-Smirnov (KS) test statistic. To compute this statistic, the marginal distributions of the real and the synthetic data are converted into the corresponding cumulative distribution functions. The KS statistic quantifies the distance between the two empirical distribution functions. Here, the quality score is reported as 1 - KS statistic so that a higher score means higher quality.

In [None]:
# Detailed results of the quality report for the 'Column Shapes' property.
col_shapes = report.get_details(property_name='Column Shapes')
display(col_shapes)

In [None]:
# Two colours: cs[0] for the real data, cs[1] for the synthetic data.
cs = sns.color_palette('rocket', 2)

# Columns to compare and the LaTeX title of each panel, both listed in the
# row-major order of the 2x2 grid: (0, 0), (0, 1), (1, 0), (1, 1).
xs = ['psPDn_4 [W/m2]',
      'psPDtot_4 [W/m2]',
      'psPDn_1 [W/m2]',
      'psPDtot_1 [W/m2]']
ls = [r'$psPD_{\text{n}, 4}$',
      r'$psPD_{\text{tot}, 4}$',
      r'$psPD_{\text{n}, 1}$',
      r'$psPD_{\text{tot}, 1}$']

# (data set, colour, label, KDE line style) of the two overlaid histograms.
overlays = [(clean_data, cs[0], 'true', '-'),
            (syn_data, cs[1], 'synthetic', '--')]

fig, axs = plt.subplots(2, 2, sharex=False, sharey=False, figsize=(4.5, 4))
# axs.flat walks the grid row by row, matching the order of xs and ls.
for ax, x, title in zip(axs.flat, xs, ls):
    for data, color, label, linestyle in overlays:
        sns.histplot(data=data, x=x,
                     color=color,
                     stat='density', kde=True,
                     label=label, ax=ax,
                     line_kws={'ls': linestyle, 'lw': 2})
    # Per-panel axis labels are blanked; the figure-level labels below are
    # shared by all four panels.
    ax.set(title=title, xlabel='', ylabel='')

fig.supxlabel(r'incident power density [W/m$^2$]')
fig.supylabel(r'probability density')
fig.suptitle('$\\mathbf{(a)}$', x=0.08, y=0.92)
fig.tight_layout()
sns.despine()

In [None]:
# Two colours: cs[0] for the real data, cs[1] for the synthetic data.
cs = sns.color_palette('rocket', 2)

fig, axs = plt.subplots(2, 1, sharex=True, figsize=(4.5, 4))

# Top panel: density histograms with KDE overlays. The synthetic KDE is drawn
# dashed so that its line style agrees with the CDF panel below.
sns.histplot(data=clean_data, x='pDeltaT * 100 [°C]',
             color=cs[0], stat='density', kde=True, ax=axs[0],
             line_kws={'ls': '-', 'lw': 2})
sns.histplot(data=syn_data, x='pDeltaT * 100 [°C]',
             color=cs[1], stat='density', kde=True, ax=axs[0],
             line_kws={'ls': '--', 'lw': 2})
# The plotted column holds values scaled by 100 (see its name), so the tick
# positions 0/60/120 are labelled 0/0.6/1.2 to read in the unit of the
# shared x label.
axs[0].set(title='probability density function',
           xlabel='', ylabel='',
           xticks=[0, 60, 120],
           xticklabels=[0, 0.6, 1.2],
           xlim=[0, 120],
           yticks=[0, 0.025, 0.05],
           yticklabels=[0, 0.025, 0.05],
           ylim=[0, 0.05])

# Bottom panel: cumulative KDE curves; only these carry legend labels.
sns.kdeplot(data=clean_data, x='pDeltaT * 100 [°C]',
            cumulative=True, color=cs[0], ls='-', lw=2,
            label='true', ax=axs[1])
sns.kdeplot(data=syn_data, x='pDeltaT * 100 [°C]',
            cumulative=True, color=cs[1], ls='--', lw=2,
            label='synthetic', ax=axs[1])
axs[1].set(title='cumulative distribution function',
           xlabel='', ylabel='',
           xticks=[0, 60, 120],
           xticklabels=[0, 0.6, 1.2],
           xlim=[0, 120],
           yticks=[0, 0.5, 1],
           yticklabels=[0, 0.5, 1],
           ylim=[0, 1])

fig.supxlabel(r'$\Delta T_\text{max}$ [°C]')
fig.supylabel(r'probability density')
fig.suptitle('$\\mathbf{(b)}$', x=0.08, y=0.92)
# Take the legend entries from the labelled curves of the bottom panel
# explicitly instead of relying on whichever axes is "current".
handles, labels = axs[1].get_legend_handles_labels()
# Keying by label keeps a single handle per distinct label.
by_label = dict(zip(labels, handles))
fig.legend(by_label.values(), by_label.keys(),
           title='data',
           bbox_to_anchor=(1.25, 0.615))
fig.tight_layout()
sns.despine()

### Column pair trends

For a pair of columns, the column pair trend is computed as a Pearson correlation coefficient (assuming linear dependence) separately on real and synthetic data.
The correlation similarity reported as quality score is computed as the normalized relative difference between two separate correlation coefficients.

In [None]:
# Detailed results of the quality report for the 'Column Pair Trends'
# property: one row per pair of columns.
col_pair_trends = report.get_details(property_name='Column Pair Trends')

# Keep every pair that involves the target column. The target can be listed
# as either member of a pair, so both 'Column 1' and 'Column 2' are checked;
# testing 'Column 1' alone would drop the pairs in which it comes second.
target_column = 'pDeltaT * 100 [°C]'
involves_target = ((col_pair_trends['Column 1'] == target_column)
                   | (col_pair_trends['Column 2'] == target_column))
pDeltaT_pair_trend = col_pair_trends[involves_target]
display(pDeltaT_pair_trend)

In [None]:
# Show the report's built-in visualization of the 'Column Pair Trends' property.
report.get_visualization(property_name='Column Pair Trends')

## Regression tasks

Regression metrics calculate the success of using synthetic data to perform an ML regression task by using linear regression and a multi-layer perceptron neural network. First, the ML algorithm is trained on the training data (usually synthetic data). The output is an ML model that can predict the value of a given target column. Then, the ML model is tested by making predictions on the testing data (usually real data), which are compared against the actual values. Finally, the Train-Synthetic-Test-Real (TSTR) score is returned by means of the coefficient of determination ($R^2$).

In [None]:
from sdmetrics.single_table import LinearRegression

In [None]:
# Train on the synthetic sample and test on the real data (with the
# 'pDeltaT [°C]' column removed); the target is the scaled temperature column.
tstr_inputs = dict(
    test_data=clean_data.drop(columns='pDeltaT [°C]'),
    train_data=syn_data,
    target='pDeltaT * 100 [°C]',
    metadata=table_metadata,
)
# Left as the last expression so that the notebook displays the result.
LinearRegression.compute(**tstr_inputs)

## Diagnostic report

A few basic checks on the synthetic data to give a general sense of the strengths and weaknesses of the synthetic data generation model.

In [None]:
from sdmetrics.reports.single_table import DiagnosticReport

In [None]:
# Build the diagnostic report on the same inputs as the quality report.
# Note that `report` is rebound here: from this cell on it refers to the
# diagnostic report, no longer to the quality report created earlier.
report = DiagnosticReport()
real_data = clean_data.drop(columns='pDeltaT [°C]')
report.generate(
    real_data=real_data,
    synthetic_data=syn_data,
    metadata=table_metadata,
)

In [None]:
# Overview of the properties reported by the diagnostic report.
report.get_properties()

In [None]:
# Coverage: does the synthetic data cover the range of possible values?
report.get_details(property_name='Coverage')

In [None]:
# Synthesis: is the synthetic data unique, or does it copy rows of the real data?
report.get_details(property_name='Synthesis')

In [None]:
# Boundaries: does the synthetic data respect the boundaries set by the real data?
report.get_details(property_name='Boundaries')  # the score is the percentage of points within the boundaries