In [5]:
import sdmetrics
import pandas as pd

# Read the real table first, then the synthetic table, from their CSV files.
real_data, synthetic_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/Users/minhkau/Documents/TUDelft/Year 3/RP/Code/tabular-gpt/be_greater/datasets/adult.csv",
        "/Users/minhkau/Documents/TUDelft/Year 3/RP/Code/tabular-gpt/be_greater/results/adult.structured.csv",
    )
)


def get_metadata(df):
    """Build a single-table metadata dict from the dtypes of ``df``.

    Each column is mapped to an ``sdtype`` based on its pandas dtype:

    * boolean dtypes  -> ``'boolean'``
    * datetime dtypes -> ``'datetime'`` with ``datetime_format`` ``'%Y-%m-%d'``
    * float dtypes    -> ``'numerical'`` with ``computer_representation`` ``'Float'``
    * integer dtypes  -> ``'numerical'`` with ``computer_representation`` ``'Int64'``
    * other numeric   -> ``'numerical'``
    * a string column named ``'student_id'`` -> ``'id'`` with a 30-digit
      ``regex_format``
    * everything else -> ``'categorical'``

    Args:
        df: The pandas DataFrame whose columns should be described.

    Returns:
        dict: ``{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1', 'columns': {...}}``
        where ``columns`` maps each column name to its metadata dict.
    """
    types = pd.api.types
    columns = {}

    for column_name, dtype in df.dtypes.items():
        # Booleans are tested first because pandas also reports them as numeric.
        if types.is_bool_dtype(dtype):
            col_metadata = {'sdtype': 'boolean'}
        elif types.is_datetime64_any_dtype(dtype):
            # The sdtype must be 'datetime' for the format below to apply;
            # leaving it as 'categorical' would mislabel the column.
            # NOTE(review): the format is fixed to a date-only pattern -
            # confirm it matches the data if datetime columns are ever used.
            col_metadata = {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'}
        elif types.is_float_dtype(dtype):
            col_metadata = {'sdtype': 'numerical', 'computer_representation': 'Float'}
        elif types.is_integer_dtype(dtype):
            col_metadata = {'sdtype': 'numerical', 'computer_representation': 'Int64'}
        elif types.is_numeric_dtype(dtype):
            # Numeric but neither float nor integer: no representation hint.
            col_metadata = {'sdtype': 'numerical'}
        elif types.is_string_dtype(dtype) and column_name == 'student_id':
            col_metadata = {'sdtype': 'id', 'regex_format': '\\d{30}'}
        else:
            # Plain categorical columns carry no extra keys (in particular no
            # 'regex_format': None placeholder).
            col_metadata = {'sdtype': 'categorical'}

        columns[column_name] = col_metadata

    return {
        'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
        'columns': columns,
    }

In [18]:
from sdmetrics.reports.single_table import DiagnosticReport


# Describe the real table's columns, then run the diagnostic report
# comparing the synthetic table against the real one.
diagnostic = DiagnosticReport()
metadata = get_metadata(real_data)
diagnostic.generate(
    real_data=real_data,
    synthetic_data=synthetic_data,
    metadata=metadata,
)


Generating report ...

(1/2) Evaluating Data Validity: |██████████| 15/15 [00:00<00:00, 384.66it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 756.41it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%


In [19]:
from sdmetrics.reports.single_table import QualityReport

# Score the synthetic table against the real one; the report prints the
# Column Shapes and Column Pair Trends scores as it runs.
quality_report = QualityReport()
quality_report.generate(
    real_data=real_data,
    synthetic_data=synthetic_data,
    metadata=metadata,
)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 15/15 [00:00<00:00, 165.44it/s]|
Column Shapes Score: 90.55%

(2/2) Evaluating Column Pair Trends: |██████████| 105/105 [00:01<00:00, 67.22it/s]|
Column Pair Trends Score: 68.44%

Overall Score (Average): 79.49%


In [20]:
from sdmetrics.visualization import get_column_plot

# Plot the 'age' column of the real data against the synthetic data.
fig = get_column_plot(real_data=real_data, synthetic_data=synthetic_data, column_name='age')
fig.show()

In [17]:
from sdmetrics.visualization import get_column_pair_plot

# Plot the ("age", "workclass") column pair for the real and synthetic data.
fig = get_column_pair_plot(real_data=real_data, synthetic_data=synthetic_data, column_names=["age", "workclass"])
fig.show()



In [21]:
# Display the report's visualization for the Column Pair Trends property.
quality_report.get_visualization(property_name='Column Pair Trends')


In [23]:
import os

# Property-level scores from the quality report. Each score is looked up by
# property name so the result does not depend on the row order of the table.
quality = quality_report.get_properties()
property_scores = quality.set_index('Property')['Score']
Shape = property_scores['Column Shapes']
Trend = property_scores['Column Pair Trends']

save_dir = "/Users/minhkau/Documents/TUDelft/Year 3/RP/Code/tabular-gpt/evaluation"
# Create the output directory up front so the writes below cannot fail on a
# missing folder.
os.makedirs(save_dir, exist_ok=True)

# quality.txt holds the two property scores, one per line (shape, then trend).
with open(f'{save_dir}/quality.txt', 'w') as f:
    f.write(f'{Shape}\n')
    f.write(f'{Trend}\n')

# Mean of the two property scores.
Quality = (Shape + Trend) / 2

# Detail tables behind each property score.
shapes = quality_report.get_details(property_name='Column Shapes')
trends = quality_report.get_details(property_name='Column Pair Trends')
# NOTE(review): the diagnostic run in this notebook reports only "Data Validity"
# and "Data Structure", so 'Coverage' may not be an available property -
# confirm before re-enabling the two commented-out lines.
# coverages = diagnostic.get_details('Coverage')


shapes.to_csv(f'{save_dir}/shape.csv')
trends.to_csv(f'{save_dir}/trend.csv')
# coverages.to_csv(f'{save_dir}/coverage.csv')