# Step 5: Measure Population Fidelity (PF)

In [1]:
import time
start_time = time.perf_counter()

In [2]:
import pandas as pd 
import sys
import os

sys.path.append('../src')
from PF_metrics import compute_all_pf_measures
from utils import (getExperimentConfig, 
                   getPicklesFromDir, 
                   get_synthetic_filepaths_from_original_data_id)

config = getExperimentConfig()
folders = config['folders']

settings = getPicklesFromDir(folders['settings_dir'])

In [3]:
pf_measure_columns = [
    'DatasetName',
    'pMSE',
    'pMSE_time',
    's_pMSE',
    's_pMSE_time',
    'Cluster_1',   # num of clusters = 1% of dataset_size
    'Cluster_1_time',   # num of clusters = 1% of dataset_size
    #'Cluster_5',     # num of clusters = 5% of dataset_size
    #'Cluster_10',    # num of clusters = 10% of dataset_size
    'BNLogLikelihood',
    'BNLogLikelihood_time',
    'GMLogLikelihood',
    'GMLogLikelihood_time',
    'ContinousKLDivergence',
    'ContinousKLDivergence_time',
    'DiscreteKLDivergence',
    'DiscreteKLDivergence_time',
    'KSComplement',
    'KSComplement_time',
    'CSTest',
    'CSTest_time',
    'CrossClassification', #Cross-classification 
    'CrossClassification_time', #Cross-classification 
]


result_df = pd.DataFrame(columns=pf_measure_columns)

for dataset_setting in settings:
    original_data = pd.read_csv(folders['real_dir']+dataset_setting['meta']['filename'])
    
    metadata = dataset_setting['meta']['meta_data']
    original_data_id = dataset_setting['meta']['id']
    
    synthetic_datasets = get_synthetic_filepaths_from_original_data_id(original_data_id)
    
    
    for sd_filename in synthetic_datasets:
        
        sd_id = os.path.splitext(sd_filename)[0]
        sd_path = folders['sd_dir']+sd_filename

        synthetic_data = pd.read_csv(sd_path)
        
        pf_measures = compute_all_pf_measures(original_data=original_data,
                                              synthetic_data=synthetic_data,
                                              metadata=metadata,
                                              SD_id=sd_id)
        
        result_df = pd.concat([result_df, pf_measures], axis=0, ignore_index=True)


# save the results
result_df.to_csv('../data/pf_measures.csv', index=False)
result_df.to_html('../data/pf_measures.html')

num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klus

In [4]:
display(result_df)

Unnamed: 0,DatasetName,pMSE,pMSE_time,s_pMSE,s_pMSE_time,Cluster_1,Cluster_1_time,BNLogLikelihood,BNLogLikelihood_time,GMLogLikelihood,...,ContinousKLDivergence,ContinousKLDivergence_time,DiscreteKLDivergence,DiscreteKLDivergence_time,KSComplement,KSComplement_time,CSTest,CSTest_time,CrossClassification,CrossClassification_time
0,SD0Q1_0,0.249927,0.028653,225.776221,0.013962,25.6,10.75115,-0.736932,0.502553,-62.8464,...,0.184296,0.09691,,0.001946,0.136393,0.006427,0.761715,0.001195,0.0,0.087358
1,SD0Q1_1,0.249549,0.019096,221.866817,0.016795,25.6,13.936637,-0.769412,0.024554,-60.419391,...,0.184229,0.099715,,0.000342,0.136393,0.007066,0.679973,0.000903,0.403698,0.068578
2,SD0Q1_2,0.249457,0.016157,226.841694,0.015787,25.6,15.605834,-0.747488,0.024622,-63.34854,...,0.184526,0.095951,,0.000354,0.136556,0.007219,0.734804,0.000886,0.286533,0.075431
3,SD0Q1_3,0.249801,0.016735,253.190281,0.01635,25.6,13.515383,-0.7686,0.024662,-65.448432,...,0.184633,0.100639,,0.000281,0.136393,0.005073,0.681976,0.000703,0.47619,0.056105
4,SD0Q1_4,0.249818,0.019098,224.994039,0.01798,25.6,13.039091,-0.753984,0.02452,-61.353128,...,0.184222,0.093245,,0.000386,0.136393,0.007095,0.718401,0.000921,0.120419,0.069686
5,SD0Q1_5,0.249597,0.017661,235.161657,0.013822,25.6,13.670153,-0.755608,0.024559,-63.308264,...,0.184494,0.095318,,0.000293,0.13623,0.005058,0.71432,0.000699,0.512124,0.051652
6,SD0Q1_6,0.24942,0.019098,232.00275,0.016541,25.6,10.715663,-0.749112,0.024655,-62.089629,...,0.184214,0.098317,,0.000355,0.136556,0.00606,0.730692,0.000833,0.201342,0.072865
7,SD0Q1_7,0.249693,0.031276,222.012304,0.028304,25.6,13.985319,-0.745864,0.02468,-62.808828,...,0.184301,0.136854,,0.000726,0.13623,0.014229,0.738924,0.001738,0.398818,0.062978
8,SD0Q1_8,0.24995,0.035348,235.063663,0.025656,24.742857,14.238543,-0.746676,0.026588,-62.81431,...,0.184577,0.144553,,0.000731,0.13623,0.015674,0.736863,0.001772,0.437613,0.107785
9,SD0Q1_9,0.249929,0.031425,233.06384,0.027286,25.6,11.808695,-0.75642,0.035674,-63.064345,...,0.184566,0.149593,,0.000612,0.136393,0.011668,0.712283,0.001838,0.509849,0.131089


In [5]:
end_time = time.perf_counter()

print(f"Notebook took {int(round(end_time-start_time), 0)} seconds.")
print(f"Notebook took {int(round( (end_time-start_time)/60, 1))} minutes.")

TypeError: int() can't convert non-string with explicit base

#### TODO:
Need to fix some error with applying higher number of groups of clusters to the cluster analysis metric

In [None]:
#from PF_metrics import *
#clus = cluster_metric(pd.read_csv('../data/real/diabetes.csv'), 
#                      synthetic_data, 
#                      num_clusters=69, 
#                      metadata=settings[0]['meta']['meta_data'])

In [None]:
#display(clus)