# Synthetic data generation in Python

### Required libraries

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
import random
import numpy as np


## Load data

In [6]:
datos = pd.read_csv('./data/datos_originales.csv')

## Generating synthetic data from real data

In [1]:
from sdv import __version__ as sdv_version
sdv_version

'1.6.0'

In [2]:
from sdv.single_table.ctgan import CTGANSynthesizer
from sdv.single_table.copulagan import CopulaGANSynthesizer
from sdv.single_table.copulas import GaussianCopulaSynthesizer


In [75]:
from sdv.metadata import SingleTableMetadata

In [76]:
metadata = SingleTableMetadata()

In [77]:
metadata.detect_from_dataframe(data=datos)

In [None]:
metadata.update_column(
    column_name='has_rewards',
    sdtype='boolean'
)

In [78]:
metadata

{
    "columns": {
        "gender": {
            "sdtype": "categorical"
        },
        "birth_date": {
            "sdtype": "datetime",
            "datetime_format": "%Y-%m-%d"
        },
        "age": {
            "sdtype": "numerical"
        },
        "name": {
            "sdtype": "unknown",
            "pii": true
        },
        "height": {
            "sdtype": "numerical"
        },
        "weight": {
            "sdtype": "numerical"
        },
        "bmi": {
            "sdtype": "numerical"
        },
        "tobacco": {
            "sdtype": "categorical"
        },
        "hypertension": {
            "sdtype": "categorical"
        },
        "diabetes": {
            "sdtype": "categorical"
        },
        "infarct": {
            "sdtype": "categorical"
        }
    },
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1"
}

In [116]:
metadata.to_dict()

{'columns': {'gender': {'sdtype': 'categorical'},
  'birth_date': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'},
  'age': {'sdtype': 'numerical'},
  'name': {'sdtype': 'unknown', 'pii': True},
  'height': {'sdtype': 'numerical'},
  'weight': {'sdtype': 'numerical'},
  'bmi': {'sdtype': 'numerical'},
  'tobacco': {'sdtype': 'categorical'},
  'hypertension': {'sdtype': 'categorical'},
  'diabetes': {'sdtype': 'categorical'},
  'infarct': {'sdtype': 'categorical'}},
 'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1'}

![](img/ctgan.png)

In [79]:
model = CTGANSynthesizer(metadata)


In [80]:
%%time
model.fit(datos)

CPU times: user 59.6 s, sys: 6.45 s, total: 1min 6s
Wall time: 17.3 s


In [86]:
synth_data = model.sample(10000)

In [82]:
synth_data

Unnamed: 0,gender,birth_date,age,name,height,weight,bmi,tobacco,hypertension,diabetes,infarct
0,M,1955-02-11,52,sdv-pii-inuqs,189.994085,74.209039,6.049697,0,1,0,0
1,M,1967-05-06,58,sdv-pii-vatdi,160.647212,84.129277,21.291555,0,0,0,1
2,F,1963-08-17,63,sdv-pii-6pw1k,196.121922,64.553854,17.074343,1,0,0,0
3,F,1972-07-30,54,sdv-pii-h6fyk,165.566769,60.713768,26.227775,0,0,0,1
4,F,1952-09-15,30,sdv-pii-sh58h,140.759413,101.181993,38.024653,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
95,M,1955-09-20,60,sdv-pii-0lmu2,148.719158,73.228213,17.437547,0,1,0,0
96,F,1984-07-30,30,sdv-pii-ro9tz,186.061275,55.230197,35.294049,1,0,0,0
97,M,1978-07-03,52,sdv-pii-td6d3,196.003628,69.419186,25.832188,1,0,0,1
98,F,1980-04-17,43,sdv-pii-9zmmo,191.083033,77.689653,29.271255,0,0,0,1


In [87]:
X = synth_data[['age','bmi', 'diabetes', 'tobacco', 'hypertension']].copy()
X['gender'] = LabelEncoder().fit_transform(synth_data['gender'])
y = synth_data['infarct']

## Evaluation

![](img/evaluation.png)

### Fidelity

In [102]:
from sdmetrics.reports.single_table import QualityReport

In [103]:
report = QualityReport()

In [105]:

report.generate(datos, synth_data, metadata.to_dict())

Generating report ...
(1/2) Evaluating Column Shapes: : 100%|██████████| 11/11 [00:00<00:00, 114.08it/s]
(2/2) Evaluating Column Pair Trends: : 100%|██████████| 55/55 [00:00<00:00, 105.13it/s]

Overall Quality Score: 86.15%

Properties:
- Column Shapes: 89.19%
- Column Pair Trends: 83.12%


In [107]:
report.get_details(property_name='Column Shapes')

Unnamed: 0,Column,Metric,Score
0,gender,TVComplement,0.9844
1,birth_date,KSComplement,0.6391
2,age,KSComplement,0.8994
3,height,KSComplement,0.833
4,weight,KSComplement,0.8356
5,bmi,KSComplement,0.9113
6,tobacco,TVComplement,0.9546
7,hypertension,TVComplement,0.9745
8,diabetes,TVComplement,0.9338
9,infarct,TVComplement,0.953


In [108]:
report.get_visualization(property_name='Column Shapes')

In [109]:
from sdmetrics.visualization import get_column_plot

In [110]:
get_column_plot(
    real_data=datos,
    synthetic_data=synth_data,
    column_name='age'
)

In [111]:
get_column_plot(
    real_data=datos,
    synthetic_data=synth_data,
    column_name='height'
)

In [114]:
get_column_plot(
    real_data=datos,
    synthetic_data=synth_data,
    column_name='gender'
)

In [121]:
get_column_plot(
    real_data=datos,
    synthetic_data=synth_data,
    column_name='infarct',
    plot_type='bar'
)

In [115]:
report.get_visualization(property_name='Column Pair Trends')

### Utility

In [91]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an XGBoost classifier
clf = xgb.XGBClassifier()

# clf = LogisticRegression(max_iter=1000)

# Train the classifier
clf.fit(X_train, y_train)

In [92]:
X_eval = datos[['age','bmi', 'diabetes', 'tobacco', 'hypertension']].copy()
X_eval['gender'] = LabelEncoder().fit_transform(datos['gender'])
y_eval = datos['infarct']

In [93]:
# Predict on the validation set
y_pred = clf.predict(X_eval)

print(classification_report(y_eval, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.96      0.95       598
           1       0.94      0.92      0.93       402

    accuracy                           0.94      1000
   macro avg       0.94      0.94      0.94      1000
weighted avg       0.94      0.94      0.94      1000



In [95]:
print(confusion_matrix(y_eval, y_pred))

[[574  24]
 [ 34 368]]
