# Step 5: Measure Population Fidelity (PF)

In [None]:
import time
# Start a wall-clock timer for the whole notebook. The final timing cell
# subtracts this value from its own perf_counter() reading, so this cell has
# to be executed first; otherwise that cell fails with
# "NameError: name 'start_time' is not defined".
start_time = time.perf_counter()

In [1]:
import pandas as pd 
import sys
import os

sys.path.append('../src')
from PF_metrics import compute_all_pf_measures
from utils import (getExperimentConfig, 
                   getPicklesFromDir, 
                   get_synthetic_filepaths_from_original_data_id)

# Load the experiment configuration. Only its 'folders' entry is used in this
# notebook; the keys read from it are 'settings_dir', 'real_dir' and 'sd_dir'.
config = getExperimentConfig()
folders = config['folders']

# One entry per original dataset -- presumably unpickled from the files found
# in settings_dir (TODO confirm against utils.getPicklesFromDir). Each entry is
# accessed below through entry['meta'] with keys 'filename', 'meta_data', 'id'.
settings = getPicklesFromDir(folders['settings_dir'])

In [2]:
# Columns of the results table, in the order they should appear in the output.
# Every measure column is followed by a matching '<measure>_time' column
# (presumably the runtime of that measure -- confirm in PF_metrics).
pf_measure_columns = [
    'DatasetName',
    'pMSE',
    'pMSE_time',
    's_pMSE',
    's_pMSE_time',
    'Cluster_1',        # num of clusters = 1% of dataset_size
    'Cluster_1_time',
    # 'Cluster_5' (5% of dataset_size) and 'Cluster_10' (10% of dataset_size)
    # are disabled; see the TODO at the end of the notebook.
    'BNLogLikelihood',
    'BNLogLikelihood_time',
    'GMLogLikelihood',
    'GMLogLikelihood_time',
    'ContinousKLDivergence',
    'ContinousKLDivergence_time',
    'DiscreteKLDivergence',
    'DiscreteKLDivergence_time',
    'KSComplement',
    'KSComplement_time',
    'CSTest',
    'CSTest_time',
    'CrossClassification',        # Cross-classification
    'CrossClassification_time',
]

# Collect one result frame per synthetic dataset and concatenate once after
# the loop. Calling pd.concat inside the loop re-copies every previously
# collected row on each iteration and concatenates against an empty frame,
# whose dtype handling is deprecated in recent pandas versions.
pf_measure_frames = []

for dataset_setting in settings:
    dataset_meta = dataset_setting['meta']

    # Real (original) dataset that the synthetic datasets are compared against.
    original_data = pd.read_csv(folders['real_dir'] + dataset_meta['filename'])
    metadata = dataset_meta['meta_data']
    original_data_id = dataset_meta['id']

    # File names (relative to folders['sd_dir']) of the synthetic datasets
    # associated with this original dataset id.
    synthetic_datasets = get_synthetic_filepaths_from_original_data_id(original_data_id)

    for sd_filename in synthetic_datasets:
        # The synthetic dataset id is the file name without its extension.
        sd_id = os.path.splitext(sd_filename)[0]
        sd_path = folders['sd_dir'] + sd_filename

        synthetic_data = pd.read_csv(sd_path)

        pf_measures = compute_all_pf_measures(original_data=original_data,
                                              synthetic_data=synthetic_data,
                                              metadata=metadata,
                                              SD_id=sd_id)
        pf_measure_frames.append(pf_measures)

if pf_measure_frames:
    combined = pd.concat(pf_measure_frames, axis=0, ignore_index=True)
    # Keep the declared column order first (declared columns missing from the
    # results become NaN), followed by any additional columns the metric
    # function returned, in order of first appearance.
    extra_columns = [col for col in combined.columns if col not in pf_measure_columns]
    result_df = combined.reindex(columns=pf_measure_columns + extra_columns)
else:
    # No settings / no synthetic datasets: still produce an empty table with
    # the expected header so the saved files are well-formed.
    result_df = pd.DataFrame(columns=pf_measure_columns)

# save the results
result_df.to_csv('../data/pf_measures.csv', index=False)
result_df.to_html('../data/pf_measures.html')

num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>


In [3]:
# Render the combined PF measures table inline (the same frame that was just
# written to ../data/pf_measures.csv and ../data/pf_measures.html).
display(result_df)

Unnamed: 0,DatasetName,pMSE,pMSE_time,s_pMSE,s_pMSE_time,Cluster_1,Cluster_1_time,BNLogLikelihood,BNLogLikelihood_time,GMLogLikelihood,...,ContinousKLDivergence,ContinousKLDivergence_time,DiscreteKLDivergence,DiscreteKLDivergence_time,KSComplement,KSComplement_time,CSTest,CSTest_time,CrossClassification,CrossClassification_time
0,SD0Q1_0,0.233848,0.019624,216.951314,0.019515,22.348661,10.580234,-0.751548,0.544071,-30.322911,...,0.377548,0.099772,,0.00231,0.534993,0.006584,0.724537,0.00152,0.449519,0.094954
1,SD0Q1_1,0.229169,0.021942,204.970242,0.018412,21.393094,11.583557,-0.730436,0.033245,-31.519928,...,0.321741,0.094214,,0.00035,0.522461,0.006774,0.778423,0.000959,0.472469,0.090028
2,SD0Q1_2,0.22515,0.020491,225.208145,0.019281,22.427782,11.315736,-0.712572,0.026845,-30.401685,...,0.330569,0.093321,,0.000358,0.515625,0.005439,0.824878,0.000762,0.488506,0.086342
3,SD0Q2_0,0.029484,0.020964,32.623956,0.017239,2.116039,10.718561,-0.757232,0.025507,-30.359353,...,0.62758,0.092039,,0.000288,0.849447,0.004909,0.710248,0.000797,0.68175,0.089628
4,SD0Q3_0,0.024256,0.018657,21.031652,0.020525,1.674979,11.409437,-0.751548,0.027211,-27.923858,...,0.732588,0.097273,,0.00042,0.879069,0.006368,0.724537,0.000945,0.588694,0.097403
5,SD0Q4_0,0.017889,0.027624,17.219398,0.021002,1.071249,13.401431,-0.75236,0.026402,-31.621285,...,0.676394,0.098552,,0.000332,0.881348,0.004861,0.72249,0.000827,0.673469,0.094189


In [6]:
# Report the total wall-clock runtime of the notebook. Requires the first cell
# (which sets start_time) to have been executed in this kernel session.
end_time = time.perf_counter()
elapsed_seconds = end_time - start_time

# round(x) with no ndigits already returns an int. The previous
# int(round(x), 0) passed 0 as the *base* argument of int(), which raises
# TypeError for non-string input.
print(f"Notebook took {round(elapsed_seconds)} seconds.")
# Keep one decimal for minutes; wrapping this in int() discarded the fraction.
print(f"Notebook took {round(elapsed_seconds / 60, 1)} minutes.")

NameError: name 'start_time' is not defined

#### TODO:
Fix the error that occurs when the cluster analysis metric is run with a higher number of clusters (e.g., 5% or 10% of the dataset size instead of 1%).

In [None]:
# Scratch cell for the TODO above: call the cluster analysis metric directly
# with a larger number of clusters than the main loop uses.
# Import only the name that is needed instead of `from PF_metrics import *`,
# so it is clear where cluster_metric comes from and nothing else is shadowed.
from PF_metrics import cluster_metric

# NOTE(review): `synthetic_data` is whatever the last iteration of the PF
# measure loop left in the kernel, so this cell only works after that cell has
# run, and it silently compares against the last synthetic dataset loaded.
# The real-data path is hard-coded and paired with settings[0]; confirm that
# settings[0] actually describes diabetes.csv.
clus = cluster_metric(pd.read_csv('../data/real/diabetes.csv'),
                      synthetic_data,
                      num_clusters=69,  # experimental value used to reproduce the error
                      metadata=settings[0]['meta']['meta_data'])

In [None]:
# Show the value returned by the experimental cluster_metric call.
display(clus)