# Step 5: Measure Population Fidelity (PF)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from math import sqrt
import sys

sys.path.append('../src')
from PF_metrics import *
from utils import *

In [2]:
# Load the experiment configuration, then every pickled settings object found
# in the configured settings directory, and show them for reference.
config = getExperimentConfig()
settings_dir = config['folders']['settings_dir']
settings = getPicklesFromDir(settings_dir)
display(settings)

[{'meta': {'name': 'Diabetes',
   'id': 'D0',
   'filename': 'diabetes.csv',
   'target': 'Outcome',
   'ordinal_features': None,
   'numeric_features': ['DiabetesPedigreeFunction',
    'BMI',
    'Insulin',
    'Glucose',
    'Age',
    'SkinThickness',
    'BloodPressure',
    'Pregnancies'],
   'text_features': None,
   'categorical_features': None,
   'sd_meta_list': [{'id': 'SD0Q1_0',
     'path': '../data/synthetic/SD0Q1_0.csv',
     'sdg_params': {'epochs': 10, 'batch_size': 6, 'pac': 2, 'verbose': True}},
    {'id': 'SD0Q2_0',
     'path': '../data/synthetic/SD0Q2_0.csv',
     'sdg_params': {'epochs': 300,
      'batch_size': 50,
      'pac': 10,
      'verbose': True}}]},
  'setup_param': {'target': 'Outcome',
   'train_size': 0.8,
   'fold_strategy': 'stratifiedkfold',
   'fold': 10,
   'ordinal_features': None,
   'numeric_features': ['DiabetesPedigreeFunction',
    'BMI',
    'Insulin',
    'Glucose',
    'Age',
    'SkinThickness',
    'BloodPressure',
    'Pregnancies'],


In [3]:
# Read the real dataset and the two synthetic datasets (in that order) and
# show both synthetic frames.
original_data, sd0q1, sd0q2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/real/diabetes.csv",
        "../data/synthetic/SD0Q1_0.csv",
        "../data/synthetic/SD0Q2_0.csv",
    )
)
display(sd0q1, sd0q2)

Unnamed: 0.1,Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0,0,149,81,37,317,35.8,1.226,21,0
1,1,0,141,96,57,146,50.1,1.065,49,0
2,2,0,107,82,54,28,37.2,1.108,43,0
3,3,0,156,110,24,22,23.1,1.948,81,1
4,4,16,177,91,26,23,27.5,0.880,28,1
...,...,...,...,...,...,...,...,...,...,...
763,763,0,86,81,23,27,31.3,1.133,45,0
764,764,0,152,95,22,16,34.8,1.002,55,1
765,765,3,171,64,23,397,36.0,1.085,21,1
766,766,0,91,47,59,161,19.1,1.150,25,0


Unnamed: 0.1,Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0,5,166,6,19,202,16.0,0.293,26,1
1,1,5,122,63,20,0,36.4,0.334,35,0
2,2,12,121,89,21,0,42.5,0.503,30,1
3,3,2,101,73,1,150,18.5,0.117,30,0
4,4,1,115,38,49,0,38.1,0.659,31,0
...,...,...,...,...,...,...,...,...,...,...
763,763,3,119,66,6,137,26.7,0.078,21,0
764,764,13,154,101,0,11,33.7,0.109,46,1
765,765,7,149,66,12,98,36.7,0.339,22,1
766,766,3,151,83,5,133,37.2,0.617,43,0


In [4]:
# Remove the 'Unnamed: 0' column from both synthetic frames so that only the
# feature and target columns remain.
#
# The drop is written as a re-assignment with errors='ignore' instead of
# inplace=True: re-running this cell then simply finds no such column and
# leaves the frame unchanged, rather than raising KeyError on the second run.
sd0q1 = sd0q1.drop(columns='Unnamed: 0', errors='ignore')
display(sd0q1.head())
sd0q2 = sd0q2.drop(columns='Unnamed: 0', errors='ignore')
display(sd0q2.head())

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0,149,81,37,317,35.8,1.226,21,0
1,0,141,96,57,146,50.1,1.065,49,0
2,0,107,82,54,28,37.2,1.108,43,0
3,0,156,110,24,22,23.1,1.948,81,1
4,16,177,91,26,23,27.5,0.88,28,1


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,5,166,6,19,202,16.0,0.293,26,1
1,5,122,63,20,0,36.4,0.334,35,0
2,12,121,89,21,0,42.5,0.503,30,1
3,2,101,73,1,150,18.5,0.117,30,0
4,1,115,38,49,0,38.1,0.659,31,0


In [5]:
# Score each synthetic dataset against the real data with S_pMSE
# (provided by the PF_metrics wildcard import).
# NOTE(review): the recorded output shows the same value for both synthetic
# sets - confirm that S_pMSE actually depends on its second argument.
spmse1, spmse2 = (S_pMSE(original_data, synthetic) for synthetic in (sd0q1, sd0q2))
print(f"S_pMSE: SD0Q1: {spmse1}, SD0Q2: {spmse2}" )

S_pMSE: SD0Q1: 228.49999999999997, SD0Q2: 228.49999999999997


In [7]:
# Score each synthetic dataset against the real data with pMSE
# (provided by the PF_metrics wildcard import).
# NOTE(review): the recorded output shows the same value for both synthetic
# sets - confirm that pMSE actually depends on its second argument.
r1, r2 = (pMSE(original_data, synthetic) for synthetic in (sd0q1, sd0q2))
print(f"pMSE: SD0Q1: {r1}, SD0Q2: {r2}" )

pMSE: SD0Q1: 0.07503255208333333, SD0Q2: 0.07503255208333333


In [19]:
from math import log

# Parameters shared by both cluster-analysis runs.
n_clusters = 25
categorical_indecies = [8]  # positional index of the column treated as categorical

# Run the cluster analysis metric once per synthetic dataset, SD0Q1 first.
c1, c2 = (
    cluster_analysis_metric(
        original_data=original_data,
        synthetic_data=synthetic,
        num_clusters=n_clusters,
        categorical_columns=categorical_indecies,
    )
    for synthetic in (sd0q1, sd0q2)
)

# Report the raw values and their natural logarithms.
print(f"Cluster analysis metric: SD0Q1: {c1}, SD0Q2: {c2}" )
print(f"Log cluster analysis metric: SD0Q1: {log(c1)}, SD0Q2: {log(c2)}" )

Cluster analysis metric: SD0Q1: 0.0008555175811653243, SD0Q2: 0.0003068494580357751
Log cluster analysis metric: SD0Q1: -7.06380391409471, SD0Q2: -8.089153295349483


In [9]:
from sdmetrics.reports.single_table import QualityReport, DiagnosticReport

# One quality report and one diagnostic report per synthetic dataset.
q1Report, q2Report = QualityReport(), QualityReport()
d1Report, d2Report = DiagnosticReport(), DiagnosticReport()

# Field types come from the first settings entry.
# NOTE(review): 'sdg_param' is not visible in the truncated settings dump
# shown earlier in the notebook - presumably it is present; verify.
fields = settings[0]['sdg_param']['field_types']
metadata = { 'fields': fields}

# For each synthetic dataset: show its label, then generate the diagnostic
# report followed by the quality report against the real data.
for sd_label, sd_frame, diag_report, qual_report in (
    ("SD0Q1", sd0q1, d1Report, q1Report),
    ("SD0Q2", sd0q2, d2Report, q2Report),
):
    display(sd_label)
    diag_report.generate(original_data, sd_frame, metadata)
    qual_report.generate(original_data, sd_frame, metadata)


'SD0Q1'

Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.40s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the categories present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the numerical ranges present in the real data


Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 32.52it/s]



Overall Quality Score: 75.09%

Properties:
Column Shapes: 65.47%
Column Pair Trends: 84.71%


'SD0Q2'

Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.32s/it]



DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the categories present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the numerical ranges present in the real data


Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 46.31it/s]



Overall Quality Score: 87.98%

Properties:
Column Shapes: 84.97%
Column Pair Trends: 90.99%


In [11]:
# SD0Q1: build the 'Coverage' figure from the diagnostic report and the
# 'Column Shapes' figure from the quality report, then render them in that order.
display("SD0Q1")
d1fig, q1fig = (
    d1Report.get_visualization(property_name='Coverage'),
    q1Report.get_visualization(property_name='Column Shapes'),
)
d1fig.show()
q1fig.show()

'SD0Q1'

In [12]:
# SD0Q2: build the 'Coverage' figure from the diagnostic report and the
# 'Column Shapes' figure from the quality report, then render them in that order.
display("SD0Q2")
d2fig, q2fig = (
    d2Report.get_visualization(property_name='Coverage'),
    q2Report.get_visualization(property_name='Column Shapes'),
)
d2fig.show()
q2fig.show()

'SD0Q2'

In [13]:
# Note: this binds the name `utils` to sdmetrics' report utilities; the
# project's own utils module is only star-imported at the top of the notebook.
from sdmetrics.reports import utils

# Real vs. SD0Q1 comparison plot for the Glucose/Age column pair.
pair_plot_kwargs = dict(
    real_data=original_data,
    synthetic_data=sd0q1,
    column_names=['Glucose', 'Age'],
    metadata=metadata,
)
fig = utils.get_column_pair_plot(**pair_plot_kwargs)

fig.show()

In [14]:
from sdmetrics.single_table import BNLikelihood, BNLogLikelihood, GMLogLikelihood

# Compute GMLogLikelihood for each synthetic dataset against the real data.
# The variables keep their historical `bn*` names even though the metric
# called here is GMLogLikelihood, not one of the BN* metrics imported above.
bn1 = GMLogLikelihood.compute(real_data=original_data, synthetic_data=sd0q1)
bn2 = GMLogLikelihood.compute(real_data=original_data, synthetic_data=sd0q2)

# The second label previously read "bn3" although the value shown is bn2.
display(f"bn1: {bn1}, bn2: {bn2}")


'bn1: -31.91811711806896, bn3: -25.412182677011668'