# Step 5: Measure Population Fidelity (PF)

In [1]:
import pandas as pd 
import sys
import os
import re

from sdv.metadata import SingleTableMetadata

sys.path.append('../src')
from PF_metrics import compute_all_pf_measures
from utils import (getExperimentConfig, 
                   getPicklesFromDir, 
                   get_synthetic_filepaths_from_original_data_id)

# Load the experiment configuration and pull out its 'folders' entry, which
# is indexed by directory keys (e.g. 'settings_dir') further down.
config = getExperimentConfig()
folders = config['folders']

# Load the dataset settings from the configured settings directory.
# NOTE(review): presumably one pickled settings object per original dataset — confirm in utils.getPicklesFromDir.
settings = getPicklesFromDir(folders['settings_dir'])

In [2]:
# Leading columns of the result table. Every measure column is paired with a
# `<measure>_time` column.
# NOTE(review): the `_time` columns look like per-measure compute times — confirm in PF_metrics.
pf_measure_columns = [
    'DatasetName',
    'pMSE',
    'pMSE_time',
    's_pMSE',
    's_pMSE_time',
    'Cluster_1',        # num of clusters = 1% of dataset_size
    'Cluster_1_time',
    #'Cluster_5',       # num of clusters = 5% of dataset_size
    #'Cluster_10',      # num of clusters = 10% of dataset_size
    'BNLogLikelihood',
    'BNLogLikelihood_time',
    'GMLogLikelihood',
    'GMLogLikelihood_time',
    'ContinousKLDivergence',
    'ContinousKLDivergence_time',
    'DiscreteKLDivergence',
    'DiscreteKLDivergence_time',
    'KSComplement',
    'KSComplement_time',
    'CSTest',
    'CSTest_time',
    'CrossClassification',       # Cross-classification
    'CrossClassification_time',
]

# Empty frame that fixes the leading column order of the result table.
result_df = pd.DataFrame(columns=pf_measure_columns)

# Optional filter: when not None, only datasets whose id is contained in
# `run_dataset` are processed.
run_dataset = config['run_dataset']

# Raw string avoids the invalid "\d" escape-sequence warning; compiled once
# because it is reused for every synthetic dataset file.
quality_pattern = re.compile(r'Q\d+')

for dataset_setting in settings:

    original_data_id = dataset_setting['meta']['id']

    # Bug fix: the filter must look at the id of the *current* setting, not
    # index into the whole `settings` collection.
    if run_dataset is not None and original_data_id not in run_dataset:
        continue

    original_data = pd.read_csv(folders['real_dir'] + dataset_setting['meta']['filename'])

    metadata = SingleTableMetadata().load_from_json(dataset_setting['meta']['meta_filepath']).to_dict()

    synthetic_datasets = get_synthetic_filepaths_from_original_data_id(original_data_id)

    for sd_filename in synthetic_datasets:

        # The synthetic dataset id is the filename without its extension.
        sd_id = os.path.splitext(sd_filename)[0]

        # The quality tag is the first "Q<digits>" occurrence in the id.
        quality_match = quality_pattern.search(sd_id)
        if quality_match is None:
            raise ValueError(
                f"Cannot extract a quality tag ('Q<number>') from synthetic dataset file {sd_filename!r}"
            )
        quality = quality_match.group(0)

        sd_path = folders['sd_dir'] + sd_filename
        synthetic_data = pd.read_csv(sd_path)

        pf_measures = compute_all_pf_measures(original_data=original_data,
                                              synthetic_data=synthetic_data,
                                              metadata=metadata,
                                              SD_id=sd_id)
        pf_measures['Quality'] = quality
        pf_measures['Original dataset'] = original_data_id

        # Appended immediately (rather than collected and concatenated once)
        # so that `result_df` keeps the partial results if this long-running
        # cell is interrupted.
        result_df = pd.concat([result_df, pf_measures], axis=0, ignore_index=True)


# save the results
result_df.to_csv(f"{folders['result_dir']}pf_measures.csv", index=False)
result_df.to_html(f"{folders['result_dir']}pf_measures.html")

num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klus

KeyboardInterrupt: 

In [5]:
# Show the collected population-fidelity measures.
display(result_df)

Unnamed: 0,DatasetName,pMSE,pMSE_time,s_pMSE,s_pMSE_time,Cluster_1,Cluster_1_time,BNLogLikelihood,BNLogLikelihood_time,GMLogLikelihood,...,DiscreteKLDivergence,DiscreteKLDivergence_time,KSComplement,KSComplement_time,CSTest,CSTest_time,CrossClassification,CrossClassification_time,SDG,SDG quality
0,SD0Q1_0,0.248273,0.056337,232.866877,0.054615,25.406122,15.027726,-0.758044,0.492286,-61.707098,...,,0.001409,0.250163,0.00573,0.708214,0.001575,0.098765,0.108728,SD0Q1,Q1
1,SD0Q1_1,0.246914,0.033768,234.404602,0.035985,25.468817,12.421351,-0.758856,0.031313,-57.726724,...,,0.00054,0.25179,0.006045,0.706183,0.000811,0.326733,0.115966,SD0Q1,Q1
2,SD0Q1_2,0.247955,0.030296,212.381672,0.028375,25.536842,10.228261,-0.751548,0.043582,-57.788814,...,,0.000435,0.252441,0.008856,0.724537,0.001301,0.432348,0.162061,SD0Q1,Q1
3,SD0Q1_3,0.248768,0.047661,226.776383,0.047764,25.467875,15.754973,-0.758856,0.046215,-61.715645,...,,0.000633,0.252604,0.011648,0.706183,0.002533,0.42042,0.180195,SD0Q1,Q1
4,SD0Q1_4,0.247236,0.038869,226.93311,0.058751,25.6,13.441719,-0.741804,0.035264,-62.994314,...,,0.00028,0.254232,0.00535,0.749257,0.00078,0.517874,0.126238,SD0Q1,Q1
5,SD0Q1_5,0.247514,0.031184,213.265524,0.03195,25.6,14.406709,-0.75236,0.041805,-59.776162,...,,0.000304,0.254557,0.006128,0.72249,0.000885,0.382536,0.104,SD0Q1,Q1
6,SD0Q1_6,0.249083,0.032825,235.569252,0.031535,25.6,11.891671,-0.747488,0.027628,-59.139625,...,,0.001591,0.252767,0.021692,0.734804,0.001901,0.435268,0.234832,SD0Q1,Q1
7,SD0Q1_7,0.248858,0.064466,234.438745,0.049726,25.533875,17.275857,-0.758044,0.065079,-65.120477,...,,0.000692,0.255371,0.006094,0.708214,0.004542,0.306905,0.140082,SD0Q1,Q1
8,SD0Q1_8,0.249747,0.045347,234.600989,0.035879,25.6,11.724662,-0.73612,0.031494,-57.535924,...,,0.000525,0.257487,0.010056,0.763798,0.000872,0.427273,0.113367,SD0Q1,Q1
9,SD0Q1_9,0.249079,0.043127,229.512475,0.037425,25.405979,11.245853,-0.736932,0.029837,-60.555896,...,,0.000304,0.250488,0.006301,0.761715,0.000828,0.431008,0.100314,SD0Q1,Q1


#### TODO:
Fix the error raised by the cluster analysis metric when it is run with a larger number of clusters (e.g. the currently disabled `Cluster_5` / `Cluster_10` settings).

In [None]:
#from PF_metrics import *
#clus = cluster_metric(pd.read_csv('../data/real/diabetes.csv'), 
#                      synthetic_data, 
#                      num_clusters=69, 
#                      metadata=settings[0]['meta']['meta_data'])

In [None]:
#display(clus)