# Step 5: Measure Population Fidelity (PF)

In [2]:
import pandas as pd 
import sys
import os
import re

sys.path.append('../src')
from PF_metrics import compute_all_pf_measures
from utils import (getExperimentConfig, 
                   getPicklesFromDir, 
                   get_synthetic_filepaths_from_original_data_id)

# Load the experiment configuration; later cells read config['run_dataset']
# and the folder entries 'settings_dir', 'real_dir' and 'sd_dir' from it.
config = getExperimentConfig()
folders = config['folders']

# Load the pickled dataset settings found in the settings directory.
# NOTE(review): presumably one settings object per original dataset, each with
# a 'meta' entry (id, filename, meta_data) -- confirm against utils.getPicklesFromDir.
settings = getPicklesFromDir(folders['settings_dir'])

In [3]:
# Expected columns of the results table. Declaring them up front fixes the
# column order of the exported files. Every measure has a companion
# "<measure>_time" column (presumably the runtime of that measure -- confirm
# in PF_metrics.compute_all_pf_measures).
pf_measure_columns = [
    'DatasetName',
    'pMSE',
    'pMSE_time',
    's_pMSE',
    's_pMSE_time',
    'Cluster_1',   # num of clusters = 1% of dataset_size
    'Cluster_1_time',   # num of clusters = 1% of dataset_size
    #'Cluster_5',     # num of clusters = 5% of dataset_size
    #'Cluster_10',    # num of clusters = 10% of dataset_size
    'BNLogLikelihood',
    'BNLogLikelihood_time',
    'GMLogLikelihood',
    'GMLogLikelihood_time',
    'ContinousKLDivergence',
    'ContinousKLDivergence_time',
    'DiscreteKLDivergence',
    'DiscreteKLDivergence_time',
    'KSComplement',
    'KSComplement_time',
    'CSTest',
    'CSTest_time',
    'CrossClassification', #Cross-classification 
    'CrossClassification_time', #Cross-classification 
]

# Empty frame that only carries the expected schema. It is placed first in the
# final concat so the declared column order is kept, and it is the result when
# no dataset is processed.
result_df = pd.DataFrame(columns=pf_measure_columns)

# Optional allow-list of original dataset ids; None means "process everything".
run_dataset = config['run_dataset']

# One frame of PF measures per synthetic dataset. They are concatenated once
# after the loop instead of re-copying a growing frame on every iteration.
pf_measure_frames = []

for dataset_setting in settings:
    original_data_id = dataset_setting['meta']['id']

    # Filter on the id of the dataset currently being iterated. The previous
    # code indexed the whole `settings` collection here, which fails as soon
    # as `run_dataset` is set.
    if run_dataset is not None and original_data_id not in run_dataset:
        continue

    original_data = pd.read_csv(folders['real_dir']+dataset_setting['meta']['filename'])
    metadata = dataset_setting['meta']['meta_data']

    synthetic_datasets = get_synthetic_filepaths_from_original_data_id(original_data_id)

    for sd_filename in synthetic_datasets:

        # The synthetic dataset id is the file name without its extension.
        sd_id = os.path.splitext(sd_filename)[0]

        # The quality level is the first "Q<digits>" tag in the id. A raw
        # string avoids the invalid "\d" escape of a plain string literal.
        quality_match = re.search(r'Q\d+', sd_id)
        if quality_match is None:
            raise ValueError(
                f"Synthetic dataset id {sd_id!r} has no 'Q<number>' quality tag"
            )
        quality = quality_match.group(0)

        sd_path = folders['sd_dir']+sd_filename
        synthetic_data = pd.read_csv(sd_path)

        pf_measures = compute_all_pf_measures(original_data=original_data,
                                              synthetic_data=synthetic_data,
                                              metadata=metadata,
                                              SD_id=sd_id)
        pf_measures['SDG quality'] = quality

        pf_measure_frames.append(pf_measures)

# Build the final table in a single concat, schema frame first.
result_df = pd.concat([result_df, *pf_measure_frames], axis=0, ignore_index=True)

# save the results
result_df.to_csv('../data/pf_measures.csv', index=False)
result_df.to_html('../data/pf_measures.html')

num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klus

In [4]:
# Show the collected PF measures table in the notebook output.
display(result_df)

Unnamed: 0,DatasetName,pMSE,pMSE_time,s_pMSE,s_pMSE_time,Cluster_1,Cluster_1_time,BNLogLikelihood,BNLogLikelihood_time,GMLogLikelihood,...,ContinousKLDivergence,ContinousKLDivergence_time,DiscreteKLDivergence,DiscreteKLDivergence_time,KSComplement,KSComplement_time,CSTest,CSTest_time,CrossClassification,CrossClassification_time
0,SD0Q1_0,0.248734,0.026598,243.512742,0.016964,25.156807,11.130222,-0.825441,0.524548,-24.292423,...,0.137117,0.098341,,0.001627,0.138835,0.007075,0.547844,0.001224,0.293963,0.087989
1,SD0Q1_1,0.248977,0.018835,226.93418,0.017356,25.535484,12.019972,-0.815697,0.032367,-28.818502,...,0.136296,0.175011,,0.000585,0.135417,0.009772,0.569891,0.002111,0.075342,0.159874
2,SD0Q1_2,0.249149,0.02854,232.64878,0.027276,25.264,13.080206,-0.809201,0.026819,-27.971312,...,0.137018,0.171415,,0.000677,0.137044,0.013482,0.58482,0.001729,0.176039,0.152133
3,SD0Q1_3,0.249526,0.027186,228.850157,0.027291,25.181215,11.642716,-0.817321,0.035421,-28.160542,...,0.136637,0.156167,,0.000629,0.136393,0.00987,0.566187,0.00126,0.409594,0.121296
4,SD0Q1_4,0.248885,0.022893,243.76553,0.02416,25.533476,11.306775,-0.810013,0.025695,-28.117269,...,0.136496,0.152076,,0.000702,0.135742,0.012691,0.582944,0.00119,0.240896,0.125813
5,SD0Q1_5,0.244674,0.038171,223.138911,0.024147,25.245977,14.562383,-0.839245,0.028434,-24.154412,...,0.136474,0.138778,,0.000559,0.135905,0.010243,0.517351,0.001484,0.438413,0.148592
6,SD0Q1_6,0.248286,0.030794,227.986516,0.026802,25.281818,12.936501,-0.813261,0.026019,-25.065969,...,0.136678,0.169127,,0.000437,0.136556,0.011772,0.575468,0.00137,0.395522,0.143262
7,SD0Q1_7,0.249845,0.035323,233.859223,0.028056,25.6,14.277716,-0.809201,0.028998,-24.152676,...,0.136678,0.122955,,0.000705,0.136393,0.007829,0.58482,0.000969,0.409091,0.09877
8,SD0Q1_8,0.24642,0.023289,229.308991,0.026347,25.16774,12.677046,-0.818945,0.027724,-34.385644,...,0.137237,0.142066,,0.000537,0.138672,0.012313,0.562495,0.001772,0.490452,0.145868
9,SD0Q1_9,0.249463,0.027842,241.487273,0.024687,25.224074,11.707662,-0.836809,0.032138,-24.571895,...,0.136896,0.18351,,0.000687,0.137533,0.014299,0.522667,0.002405,0.204334,0.142155


#### TODO:
Fix the error that occurs when the cluster analysis metric is run with a larger number of clusters (for example the currently disabled `Cluster_5` and `Cluster_10` measures).

In [None]:
#from PF_metrics import *
#clus = cluster_metric(pd.read_csv('../data/real/diabetes.csv'), 
#                      synthetic_data, 
#                      num_clusters=69, 
#                      metadata=settings[0]['meta']['meta_data'])

In [None]:
#display(clus)