# Step 5: Measure Population Fidelity (PF)

In [1]:
import pandas as pd 
import sys
import os
import re

from sdv.metadata import SingleTableMetadata

sys.path.append('../src')
from PF_metrics import compute_all_pf_measures
from utils import (getExperimentConfig, 
                   getPicklesFromDir, 
                   get_synthetic_filepaths_from_original_data_id)

# Load the experiment configuration via the project helper; only its
# 'folders' entry is used in this notebook section.
config = getExperimentConfig()
# Mapping of path settings (e.g. 'settings_dir', 'real_dir', 'sd_dir',
# 'result_dir', 'pf_measures_filepath' are looked up from it below).
folders = config['folders']

# Per-dataset settings loaded from the settings directory. Each entry is
# later indexed with ['meta'] — presumably one pickled dict per original
# dataset; verify against getPicklesFromDir.
settings = getPicklesFromDir(folders['settings_dir'])

In [2]:
# Compute the population-fidelity (PF) measures for every synthetic dataset
# belonging to each original dataset, and append them to a results table.
#
# The results CSV doubles as a checkpoint: if it already exists it is loaded,
# and synthetic datasets whose id is already present in its 'DatasetName'
# column are skipped, so an interrupted run can be resumed.
pf_measures_filepath = folders['pf_measures_filepath']

if os.path.exists(pf_measures_filepath):
    result_df = pd.read_csv(pf_measures_filepath)
else:
    result_df = pd.DataFrame(columns=['DatasetName'])

# Raw string avoids the invalid "\d" escape sequence; compiled once because
# the pattern is loop-invariant.
quality_pattern = re.compile(r'Q\d+')

for dataset_setting in settings:
    dataset_meta = dataset_setting['meta']

    # os.path.join works whether or not the configured directory ends with a
    # path separator.
    original_data = pd.read_csv(os.path.join(folders['real_dir'], dataset_meta['filename']),
                                dtype=dataset_meta['cols_dtype'])
    metadata = SingleTableMetadata().load_from_json(dataset_meta['meta_filepath']).to_dict()
    original_data_id = dataset_meta['id']
    synthetic_datasets = get_synthetic_filepaths_from_original_data_id(original_data_id)

    print(f"Data id: {original_data_id}")
    for sd_filename in synthetic_datasets:
        # The synthetic dataset id is the file name without its extension.
        sd_id = os.path.splitext(sd_filename)[0]

        # The id is expected to contain a quality tag of the form "Q<digits>";
        # the first occurrence is used.
        quality_match = quality_pattern.search(sd_id)
        if quality_match is None:
            raise ValueError(
                f"No quality tag (Q<number>) found in synthetic dataset id: {sd_id!r}"
            )
        quality = quality_match.group(0)

        # Skip datasets that have already been evaluated.
        row_exists = (result_df['DatasetName'] == sd_id).any()
        if row_exists:
            continue

        print(f"Computing PF on: {sd_id}")
        sd_path = os.path.join(folders['sd_dir'], sd_filename)
        # NOTE(review): the synthetic data is read with inferred dtypes, while
        # the original data uses dataset_meta['cols_dtype'] — confirm that this
        # asymmetry is intended.
        synthetic_data = pd.read_csv(sd_path)

        pf_measures = compute_all_pf_measures(original_data=original_data,
                                              synthetic_data=synthetic_data,
                                              dataset_meta=dataset_meta,
                                              metadata=metadata,
                                              SD_id=sd_id)
        pf_measures['Quality'] = quality
        pf_measures['Original dataset'] = original_data_id
        result_df = pd.concat([result_df, pf_measures], axis=0, ignore_index=True)

        # Checkpoint after every newly evaluated dataset so that an
        # interrupted run loses at most the dataset in progress. Nothing is
        # written when the dataset was skipped, since the table is unchanged.
        result_df.to_csv(pf_measures_filepath, index=False)

# Also export the full results table as HTML for viewing.
result_df.to_html(os.path.join(folders['result_dir'], "pf_measures.html"))

num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klusters:15
(1536, 9)
<class 'pandas.core.frame.DataFrame'>
num samples data: 1536, num_klus

KeyboardInterrupt: 