# Step 5: Measure Population Fidelity (PF)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from math import sqrt
import sys

sys.path.append('../src')

from PF_metrics import *
from utils import *

In [2]:
config = getExperimentConfig()
settings = getPicklesFromDir(config['folders']['settings_dir'])
display(settings)

[{'meta': {'name': 'Diabetes',
   'id': 'D0',
   'filename': 'diabetes.csv',
   'target': 'Outcome',
   'ordinal_features': None,
   'numeric_features': ['DiabetesPedigreeFunction',
    'BMI',
    'Insulin',
    'Glucose',
    'Age',
    'SkinThickness',
    'BloodPressure',
    'Pregnancies'],
   'text_features': None,
   'categorical_features': None,
   'meta_data': {'fields': {'Pregnancies': {'type': 'numerical',
      'subtype': 'integer',
      'transformer': 'FloatFormatter'},
     'Glucose': {'type': 'numerical',
      'subtype': 'integer',
      'transformer': 'FloatFormatter'},
     'BloodPressure': {'type': 'numerical',
      'subtype': 'integer',
      'transformer': 'FloatFormatter'},
     'SkinThickness': {'type': 'numerical',
      'subtype': 'integer',
      'transformer': 'FloatFormatter'},
     'Insulin': {'type': 'numerical',
      'subtype': 'integer',
      'transformer': 'FloatFormatter'},
     'BMI': {'type': 'numerical',
      'subtype': 'float',
      'transform

In [3]:
original_data = pd.read_csv("../data/real/diabetes.csv")
sd0q1 = pd.read_csv("../data/synthetic/SD0Q1_0.csv")
sd0q2 = pd.read_csv("../data/synthetic/SD0Q2_0.csv")
#display(sd0q1)
#display(sd0q2)

In [4]:
sd0q1.drop(columns='Unnamed: 0', inplace=True)
#display(sd0q1.head())
sd0q2.drop(columns='Unnamed: 0', inplace=True)
#display(sd0q2.head())

In [5]:
from math import log
n_clusters = 70
categorical_indecies = [8]
metadata = settings[0]['meta']['meta_data']

c1 = cluster_metric(original_data=original_data, 
                             synthetic_data=sd0q1, 
                             num_clusters=n_clusters, 
                             metadata=metadata)

print(f"Cluster analysis metric, SD0Q1: {c1}, Log: {log(c1)}" )

num samples data: 1536, num_klusters:70


ValueError: Clustering algorithm could not initialize. Consider assigning the initial clusters manually.

In [None]:
pf_measures = {
    'DatasetName',
    'pMSE',
    'SpMSE',
    'Cluster_2.5',   # num of clusters = 2.5% of dataset_size
    'Cluster_5',     # num of clusters = 5% of dataset_size
    'Cluster_10',    # num of clusters = 10% of dataset_size
    'BNLikelihood',
    'BNLogLikelihood',
    'GMLogLikelihood',
    'KLDivergence',
    'ContinousKLDivergence',
    'DiscreteKLDivergence',
    'KSComplement',
    'CSTest',
    'CrCl', #Cross-classification 
}  
    


def compute_all_pf_measures(original_data:pd.DataFrame, synthetic_data:pd.DataFrame, metadata:dict, SD_id:str) -> pd.DataFrame:
    
    # get number of clusters, using the combined number of samples in the synthetic & original data, 
    # and round it to an integer
    one_percent = 0.01
    five_percent = 0.05
    ten_percent = 0.1
    
    k_1  = round( (original_data.shape[0] + synthetic_data.shape[0]) * one_percent)
    k_5  = round( (original_data.shape[0] + synthetic_data.shape[0]) * five_percent)
    k_10 = round( (original_data.shape[0] + synthetic_data.shape[0]) * ten_percent)

    
    measures = {
        'DatasetName': SD_id,
        
        'pMSE': pmse(original_data=original_data, synthetic_data=synthetic_data),
        
        'SpMSE': s_pmse(original_data=original_data, synthetic_data=synthetic_data),
        
        'Cluster_1': cluster_metric(original_data=original_data, 
                                    synthetic_data=synthetic_data, 
                                    num_clusters=k_1, 
                                    metadata=metadata),   
        
        'Cluster_5': cluster_metric(original_data=original_data, 
                                    synthetic_data=synthetic_data, 
                                    num_clusters=k_5, 
                                    metadata=metadata),  
        
        'Cluster_10': cluster_metric(original_data=original_data, 
                                    synthetic_data=synthetic_data, 
                                    num_clusters=k_10, 
                                    metadata=metadata), 
        
        'BNLikelihood': BNLikelihood_metric(original_data=original_data, 
                                            synthetic_data=synthetic_data, 
                                            metadata=metadata),
        
        'BNLogLikelihood': BNLogLikelihood_metric(original_data=original_data, 
                                                  synthetic_data=synthetic_data, 
                                                  metadata=metadata),
        
        'GMLogLikelihood': GmLogLikelihood_metric(original_data=original_data, 
                                                  synthetic_data=synthetic_data, 
                                                  metadata=metadata),

        'ContinuousKLDivergence': ContinousKLDivergence_metric(original_data, synthetic_data, metadata),
        'DiscreteKLDivergence': DiscreteKLDivergence_metric(original_data, synthetic_data, metadata),
        'KSComplement': KSComplement_metric(original_data, synthetic_data, metadata),
        'CSTest': CSTest_metric(original_data, synthetic_data, metadata),
        'CrossClassification': CrossClassification_metric(original_data, synthetic_data, metadata)
    }
    
    results_df = pd.DataFrame(data=measures, index=0)
    return results_df

    

In [None]:
meta = settings[0]['meta']['meta_data']
pf_measures = compute_all_pf_measures(original_data=original_data,
                                      synthetic_data=sd0q1,
                                      metadata=meta,
                                      SD_id='SD0Q1')

In [None]:
display(pf_measures)

In [None]:
spmse1 = s_pmse(original_data, sd0q1)
spmse2 = s_pmse(original_data, sd0q2)
print(f"S_pMSE: SD0Q1: {spmse1}, SD0Q2: {spmse2}" )

In [None]:
#display(sd0q1.head())
r1 = pmse(original_data, sd0q1)
r2 = pmse(original_data, sd0q2)
print(f"pMSE: SD0Q1: {r1}, SD0Q2: {r2}" )

In [None]:
from sdmetrics.reports.single_table import QualityReport, DiagnosticReport
q1Report = QualityReport()
q2Report = QualityReport()

d1Report = DiagnosticReport()
d2Report = DiagnosticReport()
meta = settings[0]['meta']['meta_data']

display("SD0Q1")
# d1Report.generate(original_data, sd0q1, metadata)
q1Report.generate(original_data, sd0q1, meta)
display(q1Report.get_details(property_name='Column Shapes'))

display("SD0Q2")
# d2Report.generate(original_data, sd0q2, metadata)
q2Report.generate(original_data, sd0q2, meta)
display(q2Report.get_details(property_name='Column Shapes'))


In [None]:
# display("SD0Q1")
# d1fig = d1Report.get_visualization(property_name='Coverage')
# d1fig.show()
# q1fig = q1Report.get_visualization(property_name='Column Shapes')
# q1fig.show()

In [None]:
# display("SD0Q2")
# d2fig = d2Report.get_visualization(property_name='Coverage')
# d2fig.show()
# q2fig = q2Report.get_visualization(property_name='Column Shapes')
# q2fig.show()

In [None]:
from sdmetrics.reports import utils

fig = utils.get_column_pair_plot(
    real_data=original_data,
    synthetic_data=sd0q1,
    column_names= ['Glucose', 'Age'],
    metadata=metadata
    
)

fig.show()

Method to find optimal number of clusters for the clustering model using the silhouette analysis method.

In [None]:
#from kmodes.kprototypes import KPrototypes
#from sklearn.metrics import silhouette_score
#import matplotlib.pyplot as plt
#
#n_clusters = 20
#
#o_data = original_data.copy()
##o_data['S'] = 0
#
#s_data = sd0q1.copy()
##s_data['S'] = 1
#
#
#combined_data = pd.concat([o_data, s_data], axis=0)
#scaled_combined_data = standardize_select_columns(combined_data, [8])
#
#silhouette_scores = []
#
#k_range = range(2, 50)
#
#for k in k_range:
#
#    kproto = KPrototypes(n_clusters=k, init='Cao').fit(scaled_combined_data, categorical=[8])
#    silhouette_scores.append(silhouette_score(scaled_combined_data, kproto.labels_))
#
#fig, ax = plt.subplots()
#ax.plot(k_range, silhouette_scores, 'bx-')
#ax.set_title('Silhouette Score Method')
#ax.set_xlabel('Number of clusters')
#ax.set_ylabel('Silhouette Scores')
#plt.xticks(k_range)
#plt.tight_layout()
#plt.show()

In [None]:
#import numpy as np
#from sklearn.preprocessing import StandardScaler
## check implementation of cluster analysis
#o_data = original_data.copy()
#o_data['S'] = 0
#
#s_data = sd0q1.copy()
#s_data['S'] = 1
#
#
#combined_data = pd.concat([o_data, s_data], axis=0)
#scaled_combined_data = standardize_select_columns(combined_data, [8])
#
#k=4
#kproto = KPrototypes(n_clusters=k, init='Cao').fit(scaled_combined_data, categorical=[8])
#
#cluster_labels = kproto.labels_
#
#original_data_count = o_data.shape[0]    # number of samples in original data
#synthetic_data_count = s_data.shape[0]  # number of samples in synthetic data
#total_data_count = original_data_count + synthetic_data_count
#
#constant_c = original_data_count / (original_data_count + synthetic_data_count)
#
#display(cluster_labels)
#
#
#scaler = StandardScaler()
#column_indices = np.arange(combined_data.shape[1])
#
#columns_to_standardize = np.setdiff1d(column_indices, [8,9])
#
#combined_data.iloc[:, columns_to_standardize] = scaler.fit_transform(combined_data.iloc[:, columns_to_standardize])
#
##for cluster_id in range(k):
##
##    # TODO: add column, and identify dataset sample from the cluster
##    original_cluster_data_count = np.sum(cluster_labels[:original_data_count] == cluster_id)
##    synthetic_cluster_data_count = np.sum(cluster_labels[original_data_count:] == cluster_id)
##
##    total_cluster_data_count = original_cluster_data_count + synthetic_cluster_data_count