## Imports

In [None]:
from functions import combine_first_n_datasets
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer
from sdv.evaluation.single_table import run_diagnostic
from sdv.evaluation.single_table import evaluate_quality
from sdv.evaluation.single_table import get_column_plot

## Network Traffic Data Processing

In [None]:
# Build one dataset by combining the first `num_files` datasets from the 'data' folder.
# NOTE(review): the earlier comment said there are 2000 files in an "input_data"
# folder, but the code reads from 'data' — confirm which location is intended.
num_files = 50
combined_data = combine_first_n_datasets(folder_path='data', n_files=num_files)

# Persist the real data for later reference. DataFrame.to_csv returns None when
# given a file path, so its result is deliberately not bound to a name.
combined_data.to_csv("real_data.csv", index=False)
print(len(combined_data))

In [None]:
# Preview the first rows of the combined dataset.
combined_data.head()

## GAN Definition

In [None]:
# Create a single-table metadata object and let it detect its contents from the
# combined dataframe. It is passed to the synthesizer and the evaluation calls below.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(combined_data)

In [None]:
# Build a CTGAN synthesizer from the detected metadata. The synthesizer is the
# SDV object that learns patterns from the real data and reproduces them when
# generating synthetic data.
synthesizer = CTGANSynthesizer(metadata=metadata)

# Train the synthesizer on the combined real dataset.
synthesizer.fit(data=combined_data)

## Generate Synthetic Data

In [None]:
# Generate as many synthetic rows as there are rows in the real dataset.
num_samples = len(combined_data)
synthetic_data = synthesizer.sample(num_rows=num_samples)

# Write the synthetic rows to disk as comma-separated values, then preview them.
synthetic_data.to_csv("synthetic_data.csv", index=False, sep=",")
synthetic_data.head()

## Evaluate Synthetic Data

In [None]:
# Run the SDV diagnostic on the synthetic data, using the real data and the
# detected metadata as the reference.
diagnostic = run_diagnostic(
    metadata=metadata,
    real_data=combined_data,
    synthetic_data=synthetic_data,
)

The diagnostic score is 100%, which means that the synthetic data is valid.

### Measure the statistical similarity between real and synthetic data

In [None]:
# Build the quality report that compares the synthetic data with the real data.
quality_report = evaluate_quality(
    real_data=combined_data,
    synthetic_data=synthetic_data,
    metadata=metadata,
)

According to the quality report, the synthetic data is about 82.2% statistically similar to the real data.

## Visualizing the data

In [None]:
# Plot the 'source' column of the real data against the synthetic data.
fig = get_column_plot(
    column_name='source',
    metadata=metadata,
    real_data=combined_data,
    synthetic_data=synthetic_data,
)
fig.show()

In [None]:
# Plot the 'target' column of the real data against the synthetic data.
fig = get_column_plot(
    column_name='target',
    metadata=metadata,
    real_data=combined_data,
    synthetic_data=synthetic_data,
)
fig.show()