# Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sdv.metadata import SingleTableMetadata
from sdv.evaluation.single_table import evaluate_quality
from sdv.evaluation.single_table import get_column_plot

# Load real and synthetic data

In [None]:
# Read the real dataset and both synthetic datasets (comma-separated files),
# in the order: real, SDV-generated, GAN-generated.
combined_data, synthetic_data_sdv, synthetic_data_gan = (
    pd.read_csv(csv_path, sep=",")
    for csv_path in (
        "real_data.csv",
        "synthetic_data_sdv.csv",
        "synthetic_data_gan.csv",
    )
)

In [None]:
# Build the table metadata by auto-detecting it from the real data; this same
# object is handed to every evaluate_quality / get_column_plot call below.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=combined_data)

# Measure the statistical similarity between real and synthetic data

## SDV Generated Data

In [None]:
# Quality report comparing the real data with the SDV-generated data.
quality_report = evaluate_quality(
    real_data=combined_data,
    synthetic_data=synthetic_data_sdv,
    metadata=metadata,
)

In [None]:
# Plot the 'source' column of the real data against the SDV-generated data.
fig = get_column_plot(
    combined_data,
    synthetic_data_sdv,
    metadata=metadata,
    column_name='source',
)
fig.show()

In [None]:
# Plot the 'target' column of the real data against the SDV-generated data.
fig = get_column_plot(
    combined_data,
    synthetic_data_sdv,
    metadata=metadata,
    column_name='target',
)
fig.show()

## GAN Generated Data

In [None]:
# Quality report comparing the real data with the GAN-generated data.
quality_report2 = evaluate_quality(
    real_data=combined_data,
    synthetic_data=synthetic_data_gan,
    metadata=metadata,
)

In [None]:
# Plot the 'source' column of the real data against the GAN-generated data.
fig = get_column_plot(
    combined_data,
    synthetic_data_gan,
    metadata=metadata,
    column_name='source',
)
fig.show()

In [None]:
# Plot the 'target' column of the real data against the GAN-generated data.
fig = get_column_plot(
    combined_data,
    synthetic_data_gan,
    metadata=metadata,
    column_name='target',
)
fig.show()

# Link load comparison

To further extend the comparison between the real data and the synthetic data, we want to see how much the link load (demand_value) varies between the two.

To do so, we will follow this approach:
1. For each dataset (real and synthetic), we will create a new dataset that stores, for each unique source-destination pair, the total link load. This is computed by summing up the demand value for each unique pair.
2. Calculate the absolute and relative difference in link loads.

$$
AD = |y - \hat{y}|
$$

$$
RD = \left| \frac{y - \hat{y}}{y} \right|
$$

In [None]:
# Total link load per (source, target) pair: sum demand_value within each
# pair and keep the pair keys as ordinary columns.
real_clean = combined_data.groupby(
    ["source", "target"], as_index=False
)["demand_value"].sum()
synthetic_gan_clean = synthetic_data_gan.groupby(
    ["source", "target"], as_index=False
)["demand_value"].sum()
synthetic_sdv_clean = synthetic_data_sdv.groupby(
    ["source", "target"], as_index=False
)["demand_value"].sum()

In [None]:
# Drop self-loops (source equal to target) from both synthetic totals.
# The real totals are left untouched by this step.
synthetic_gan_clean = synthetic_gan_clean.loc[
    lambda loads: loads["source"] != loads["target"]
]
synthetic_sdv_clean = synthetic_sdv_clean.loc[
    lambda loads: loads["source"] != loads["target"]
]

In [None]:
# Outer-join real and GAN totals on the pair keys so that pairs present in
# only one dataset are kept; their missing load is filled with 0.
df_comparison_temp = real_clean.merge(
    synthetic_gan_clean,
    how="outer",
    on=["source", "target"],
    suffixes=("_real", "_gan"),
).fillna(0)

# Join the SDV totals the same way. Their column does not clash with the
# already-suffixed ones, so it arrives as plain "demand_value" and is
# relabelled to match the other two.
df_comparison = (
    df_comparison_temp
    .merge(synthetic_sdv_clean, how="outer", on=["source", "target"])
    .fillna(0)
    .rename(columns={"demand_value": "demand_value_sdv"})
)

In [None]:
# Calculate the absolute and relative differences.
#
# A pair that never occurs in the real data has demand_value_real == 0 after
# the fillna(0) of the merge step. Dividing by that 0 yields inf (or NaN for
# 0/0), and a single inf makes the mean relative difference inf as well.
# RD = |y - y_hat| / |y| is undefined for y == 0, so the denominator is masked
# to NaN there: those pairs get rel_diff = NaN, which pandas' mean() skips,
# while they still contribute to the absolute difference.
real_load = df_comparison["demand_value_real"]
nonzero_real_load = real_load.where(real_load != 0)

df_comparison["abs_diff_gan"] = (
    real_load - df_comparison["demand_value_gan"]).abs()
df_comparison["rel_diff_gan"] = df_comparison["abs_diff_gan"] / nonzero_real_load

df_comparison["abs_diff_sdv"] = (
    real_load - df_comparison["demand_value_sdv"]).abs()
df_comparison["rel_diff_sdv"] = df_comparison["abs_diff_sdv"] / nonzero_real_load

In [None]:
# Display the merged per-pair link loads with their absolute/relative differences
df_comparison

In [None]:
# Average the per-pair differences into one absolute and one relative score
# per generator, then report GAN first and SDV second.
mad_gan = df_comparison["abs_diff_gan"].mean()
mrd_gan = df_comparison["rel_diff_gan"].mean()
mad_sdv = df_comparison["abs_diff_sdv"].mean()
mrd_sdv = df_comparison["rel_diff_sdv"].mean()

print(f"Mean Absolute Difference (GAN): {mad_gan}")
print(f"Mean Relative Difference (GAN): {mrd_gan}")
print()
print(f"Mean Absolute Difference (SDV): {mad_sdv}")
print(f"Mean Relative Difference (SDV): {mrd_sdv}")

In [None]:
# White-grid seaborn styling on a 12x6 inch canvas
sns.set(style="whitegrid")
plt.figure(figsize=(12, 6))

# One density curve of 'demand_value' per dataset, drawn in the order
# real -> SDV -> GAN so the legend entries keep that order.
for frame, curve_color, curve_label in (
    (combined_data, 'blue', 'Real Data'),
    (synthetic_data_sdv, 'green', 'Synthetic Data (SDV)'),
    (synthetic_data_gan, 'red', 'Synthetic Data (GAN)'),
):
    sns.kdeplot(frame['demand_value'], color=curve_color, label=curve_label, linewidth=2)

# Restrict the x-axis to [0, 200], then add title, axis labels and legend
plt.xlim(0, 200)
plt.title('Density of Demand Values', fontsize=16)
plt.xlabel('Demand Value', fontsize=14)
plt.ylabel('Density', fontsize=14)
plt.legend()
plt.show()

# Lowest link load comparison

In [None]:
def _rows_with_min_demand(frame):
    """Return, for every (source, target) pair, the full row of ``frame``
    that holds the pair's smallest 'demand_value'."""
    row_labels = frame.groupby(['source', 'target'])['demand_value'].idxmin()
    return frame.loc[row_labels]


# Lowest-demand row per (source, target) pair in each dataset
min_demand_real = _rows_with_min_demand(combined_data)
min_demand_gan = _rows_with_min_demand(synthetic_data_gan)
min_demand_sdv = _rows_with_min_demand(synthetic_data_sdv)

In [None]:
# Drop self-loops (source equal to target) from both synthetic minima.
# The real minima are left untouched by this step.
min_demand_gan = min_demand_gan.loc[
    lambda rows: rows["source"] != rows["target"]
]
min_demand_sdv = min_demand_sdv.loc[
    lambda rows: rows["source"] != rows["target"]
]

In [None]:
# Outer-join the real and GAN minimum rows on the pair keys so that pairs
# present in only one dataset are kept; their missing values are filled with 0.
df_min_comparison_temp = min_demand_real.merge(
    min_demand_gan,
    how="outer",
    on=["source", "target"],
    suffixes=("_min_real", "_min_gan"),
).fillna(0)

# Join the SDV minimum rows the same way. Their demand column does not clash
# with the already-suffixed ones, so it arrives as plain "demand_value" and is
# relabelled to match the other two.
df_min_comparison = (
    df_min_comparison_temp
    .merge(min_demand_sdv, how="outer", on=["source", "target"])
    .fillna(0)
    .rename(columns={"demand_value": "demand_value_min_sdv"})
)

In [None]:
# Calculate the absolute and relative differences.
#
# demand_value_min_real can be 0, either because the pair is absent from the
# real data (filled by the fillna(0) of the merge step) or because the real
# minimum demand is itself 0. Dividing by that 0 yields inf (or NaN for 0/0),
# and a single inf makes the mean relative difference inf as well.
# RD = |y - y_hat| / |y| is undefined for y == 0, so the denominator is masked
# to NaN there: those pairs get rel_diff = NaN, which pandas' mean() skips,
# while they still contribute to the absolute difference.
min_real_load = df_min_comparison["demand_value_min_real"]
nonzero_min_real_load = min_real_load.where(min_real_load != 0)

df_min_comparison["abs_diff_gan"] = (
    min_real_load - df_min_comparison["demand_value_min_gan"]).abs()
df_min_comparison["rel_diff_gan"] = df_min_comparison["abs_diff_gan"] / nonzero_min_real_load

df_min_comparison["abs_diff_sdv"] = (
    min_real_load - df_min_comparison["demand_value_min_sdv"]).abs()
df_min_comparison["rel_diff_sdv"] = df_min_comparison["abs_diff_sdv"] / nonzero_min_real_load

In [None]:
# Display the merged per-pair minimum demands with their absolute/relative differences
df_min_comparison

In [None]:
# Average the per-pair minimum-demand differences into one absolute and one
# relative score per generator. NOTE: this rebinds the mad_*/mrd_* names that
# the total link load section also assigns.
mad_gan = df_min_comparison["abs_diff_gan"].mean()
mrd_gan = df_min_comparison["rel_diff_gan"].mean()
mad_sdv = df_min_comparison["abs_diff_sdv"].mean()
mrd_sdv = df_min_comparison["rel_diff_sdv"].mean()

print(f"Mean Absolute Difference (GAN): {mad_gan}")
print(f"Mean Relative Difference (GAN): {mrd_gan}")
print()
print(f"Mean Absolute Difference (SDV): {mad_sdv}")
print(f"Mean Relative Difference (SDV): {mrd_sdv}")