## Imports

In [1]:
# Check if running in Google Colab
try:
    import google.colab
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

# If in Google Colab, clone the repository
if IN_COLAB:
    !git clone https://github.com/alexgaarciia/SyntheticTrafficRouting.git
    import os
    os.chdir('SyntheticTrafficRouting/')
    print("Repository cloned!")
else:
    print("Not in Google Colab, skipping repository clone.")

Not in Google Colab, skipping repository clone.


In [2]:
import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer
from sdv.evaluation.single_table import run_diagnostic
from sdv.evaluation.single_table import evaluate_quality

## Network Traffic Data Processing

In [3]:
# Load the real data from a single CSV file and report how many rows it has.
# NOTE(review): `num_files` is not referenced in the visible cells — confirm
# whether anything else still needs it before removing.
num_files = 167
combined_data = pd.read_csv("real_data.csv", sep=",")
print(combined_data.shape[0])

20514


In [4]:
# Preview the first five rows of the real data.
combined_data.head(n=5)

Unnamed: 0,source,target,demand_value
0,ATLAM5,ATLAng,0.259509
1,ATLAM5,CHINng,0.58996
2,ATLAM5,DNVRng,0.156947
3,ATLAM5,HSTNng,0.208011
4,ATLAM5,IPLSng,0.408069


## GAN Definition

In [5]:
# Start from an empty single-table metadata object and let SDV detect the
# column information directly from the real dataframe.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=combined_data)

In [6]:
# Build a CTGAN-based SDV synthesizer and train it on the real data; the fitted
# object is what later cells sample synthetic rows from.
# `verbose=True` prints the per-epoch progress bar. The recorded run of 1000
# epochs took a little over an hour, so expect this cell to be slow.
synthesizer = CTGANSynthesizer(
    metadata=metadata,
    epochs=1000,
    verbose=True,
)
synthesizer.fit(data=combined_data)

Gen. (-0.33) | Discrim. (0.03): 100%|██████████| 1000/1000 [1:09:22<00:00,  4.16s/it]


In [7]:
# Retrieve the loss-values plot from the fitted synthesizer and render it.
# NOTE(review): presumably this shows generator/discriminator loss per epoch — confirm against the SDV docs.
fig = synthesizer.get_loss_values_plot()
fig.show()

## Generate Synthetic Data

In [8]:
# Sample as many synthetic rows as there are real rows, so both tables have
# the same size.
num_samples = combined_data.shape[0]
synthetic_data = synthesizer.sample(num_rows=num_samples)

# Save the synthetic table to CSV without the index column, then preview it.
synthetic_data.to_csv("synthetic_data_sdv.csv", sep=",", index=False)
synthetic_data.head(n=5)

Unnamed: 0,source,target,demand_value
0,SNVAng,IPLSng,0.812416
1,IPLSng,SNVAng,2.00611
2,WASHng,ATLAng,23.943307
3,ATLAng,SNVAng,2.916767
4,WASHng,KSCYng,2.99941


## Evaluate Synthetic Data

In [9]:
# Run SDV's diagnostic report on the synthetic table against the real one;
# it prints data validity and data structure scores.
diagnostic = run_diagnostic(
    metadata=metadata,
    real_data=combined_data,
    synthetic_data=synthetic_data,
)

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 3/3 [00:00<00:00, 117.03it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 132.40it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%



### Measure the statistical similarity between real and synthetic data

In [10]:
# Score how closely the synthetic table matches the real one; the report
# prints column shapes and column pair trends scores.
# Keyword arguments mirror the run_diagnostic call so the argument roles are explicit.
quality_report = evaluate_quality(
    real_data=combined_data,
    synthetic_data=synthetic_data,
    metadata=metadata,
)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 3/3 [00:00<00:00, 39.22it/s]|
Column Shapes Score: 92.43%

(2/2) Evaluating Column Pair Trends: |██████████| 3/3 [00:00<00:00, 28.23it/s]|
Column Pair Trends Score: 71.4%

Overall Score (Average): 81.91%

