# Imports

In [2]:
import xml.etree.ElementTree as ET
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Network Traffic Data Processing

In [3]:
def process_network_traffic_data(path):
    """Load the demand list of an SNDlib network XML file into a DataFrame.

    Every ``demand`` element found under ``demands`` (both in the
    ``http://sndlib.zib.de/network`` namespace) becomes one row.

    Parameters
    ----------
    path : str or file-like
        Handed unchanged to ``xml.etree.ElementTree.parse``.

    Returns
    -------
    pandas.DataFrame
        Columns ``source``, ``target`` (element text) and ``demand_value``
        (float), one row per demand. The three columns are always present,
        even when the file contains no demands, so callers can index them
        without first checking for an empty result.

    Raises
    ------
    AttributeError
        If a demand has no ``source``, ``target`` or ``demandValue`` child
        (``find`` returns ``None`` in that case).
    ValueError
        If a ``demandValue`` text cannot be parsed as a float.
    """
    columns = ['source', 'target', 'demand_value']

    # Ingest the XML data
    tree = ET.parse(path)
    root = tree.getroot()

    # All elements live in the SNDlib namespace
    ns = {'net': 'http://sndlib.zib.de/network'}

    # Extract demand information
    demands = []
    for demand in root.findall('net:demands/net:demand', ns):
        source = demand.find('net:source', ns).text
        target = demand.find('net:target', ns).text
        demand_value = float(demand.find('net:demandValue', ns).text)
        demands.append({'source': source, 'target': target, 'demand_value': demand_value})

    # Passing `columns` keeps the schema stable for an empty demand list
    return pd.DataFrame(demands, columns=columns)

In [4]:
# Load the demand matrix and turn it into purely numeric columns for the GAN:
# node names become integer codes and demand values are rescaled by MinMaxScaler.
data_path = 'input_data/demandMatrix-abilene-zhang-5min-20040910-2225.xml'
data = process_network_traffic_data(data_path)
print(data)

# One encoder per categorical column; both are kept so that generated codes
# can later be mapped back to node names.
le_source, le_target = LabelEncoder(), LabelEncoder()
for column, encoder in (('source', le_source), ('target', le_target)):
    data[column] = encoder.fit_transform(data[column])

# The fitted scaler is kept as well, to undo the scaling on generated demands.
scaler = MinMaxScaler()
data['demand_value'] = scaler.fit_transform(data[['demand_value']])

print(data)

     source  target  demand_value
0    ATLAM5  ATLAng      0.396835
1    ATLAM5  CHINng      3.139035
2    ATLAM5  DNVRng      0.475848
3    ATLAM5  HSTNng      0.026667
4    ATLAM5  IPLSng      1.406133
..      ...     ...           ...
126  WASHng  KSCYng     39.996867
127  WASHng  LOSAng    139.748747
128  WASHng  NYCMng    210.302072
129  WASHng  SNVAng     11.043411
130  WASHng  STTLng     65.051659

[131 rows x 3 columns]
     source  target  demand_value
0         0       1      0.001760
1         0       2      0.014801
2         0       3      0.002136
3         0       4      0.000000
4         0       5      0.006560
..      ...     ...           ...
126      11       6      0.190085
127      11       7      0.664472
128      11       8      1.000000
129      11       9      0.052392
130      11      10      0.309237

[131 rows x 3 columns]


# GAN Architecture Definition

In [None]:
# Dimensions shared by the generator and discriminator definitions below
input_dim = 10  # length of the random noise vector fed to the generator
output_dim = 3  # values per generated row: source, target, demand_value

In [None]:
# Define the generator
class Generator(nn.Module):
    """Map a noise vector to one synthetic (source, target, demand_value) row.

    A shared trunk of two Linear+ReLU layers (input_dim -> 128 -> 64) feeds
    two separate heads:

    * ``source_target_output``: a plain linear layer with ``output_dim - 1``
      unbounded outputs (the source and target columns);
    * ``demand_output``: a linear layer followed by Softplus, giving a single
      positive demand value.

    ``forward`` returns the two head outputs concatenated along dim 1, so the
    result has ``output_dim`` columns with the demand value last.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        trunk_layers = [
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
        ]
        self.model = nn.Sequential(*trunk_layers)
        # Head for the source and target columns
        self.source_target_output = nn.Linear(64, output_dim - 1)
        # Head for the demand column; Softplus keeps the value positive
        self.demand_output = nn.Sequential(nn.Linear(64, 1), nn.Softplus())

    def forward(self, x):
        features = self.model(x)  # 64 shared hidden features per row
        head_outputs = [
            self.source_target_output(features),
            self.demand_output(features),
        ]
        return torch.cat(head_outputs, dim=1)

In [None]:
# Define the discriminator
class Discriminator(nn.Module):
    """Score rows as real or generated.

    Three fully connected layers (output_dim -> 64 -> 64 -> 1) with
    LeakyReLU(0.2) between them. The final Sigmoid squashes the score into a
    single probability per row: close to 1 means "real", close to 0 means
    "fake".
    """

    def __init__(self, output_dim):
        super().__init__()
        layers = [
            nn.Linear(output_dim, 64),
            nn.LeakyReLU(0.2),
            nn.Linear(64, 64),
            nn.LeakyReLU(0.2),
            nn.Linear(64, 1),
            nn.Sigmoid(),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        probability = self.model(x)
        return probability

# Training

In [None]:
# Seed the random number generators before any weights are created so that a
# fresh "Restart & Run All" reproduces the same run.
SEED = 42
np.random.seed(SEED)     # NumPy's global RNG (pandas' DataFrame.sample falls back to it)
torch.manual_seed(SEED)  # weight initialization and torch.randn noise

# Initialize models
generator = Generator(input_dim, output_dim)
discriminator = Discriminator(output_dim)

In [None]:
# Training parameters
num_epochs = 2000
batch_size = 32
real_label = 1  # target value for rows taken from the real data
fake_label = 0  # target value for rows produced by the generator

# Loss function: binary cross-entropy between the discriminator's predicted
# probability and the real/fake target
criterion = nn.BCELoss()

# One Adam optimizer per network, both using the same learning rate
lr = 0.0002
optim_G, optim_D = (
    torch.optim.Adam(network.parameters(), lr=lr)
    for network in (generator, discriminator)
)

In [None]:
# ---- Loop-invariant setup -------------------------------------------------
# Convert the training frame to a tensor once instead of on every step.
real_rows = torch.tensor(data.values, dtype=torch.float32)
n_rows = real_rows.shape[0]
if n_rows == 0:
    raise ValueError("cannot train the GAN on an empty dataset")

# A dataset smaller than batch_size still gets one (smaller) batch per epoch,
# so the loss variables printed below are always defined.
step_batch_size = min(batch_size, n_rows)
steps_per_epoch = max(1, n_rows // batch_size)

# Ground-truth targets for the discriminator: all ones for real rows, all
# zeros for generated rows. They never change, so build them once.
label_real = torch.full((step_batch_size, 1), real_label, dtype=torch.float32)
label_fake = torch.full((step_batch_size, 1), fake_label, dtype=torch.float32)

for epoch in range(num_epochs):
    for _ in range(steps_per_epoch):
        # ---------------------
        # Train Discriminator
        # ---------------------
        # Real data: a random mini-batch drawn without replacement
        batch_idx = torch.randperm(n_rows)[:step_batch_size]
        real_data = real_rows[batch_idx]

        # Generate fake data
        noise = torch.randn(step_batch_size, input_dim)
        fake_data = generator(noise)  # the generator creates fake data

        # Train on real data
        optim_D.zero_grad()
        output_real = discriminator(real_data)
        loss_real = criterion(output_real, label_real)
        loss_real.backward()

        # Train on fake data; detach() keeps this backward pass from reaching
        # the generator's parameters
        output_fake = discriminator(fake_data.detach())
        loss_fake = criterion(output_fake, label_fake)
        loss_fake.backward()
        optim_D.step()

        # ---------------------
        # Train Generator
        # ---------------------
        optim_G.zero_grad()
        output = discriminator(fake_data)
        loss_G = criterion(output, label_real)  # aim to fool the discriminator
        loss_G.backward()
        optim_G.step()

    # Print the losses of the last step every 100 epochs
    if epoch % 100 == 0:
        loss_D = (loss_real + loss_fake).item()
        print(f"Epoch {epoch}: Loss_D: {loss_D:.4f}, Loss_G: {loss_G.item():.4f}")

# Generate Synthetic Data

In [None]:
# Number of synthetic samples to generate
num_samples = len(data)

# Draw all noise vectors at once and run the generator in a single forward
# pass; no_grad() skips the autograd bookkeeping that inference does not need.
with torch.no_grad():
    noise = torch.randn(num_samples, input_dim)
    generated = generator(noise).numpy()

# Columns 0 and 1 are continuous stand-ins for the encoded source and target:
# round them to the nearest integer code, then clamp into the range each
# label encoder knows about.
source_idx = np.clip(np.rint(generated[:, 0]), 0, len(le_source.classes_) - 1).astype(int)
target_idx = np.clip(np.rint(generated[:, 1]), 0, len(le_target.classes_) - 1).astype(int)

# Map indices back to original categories
synthetic_sources = le_source.inverse_transform(source_idx)
synthetic_targets = le_target.inverse_transform(target_idx)

# Undo the demand scaling; the scaler expects a 2-D (n_samples, 1) array
synthetic_demands = scaler.inverse_transform(generated[:, [2]])[:, 0]

# NOTE(review): nothing here prevents rows whose source equals their target;
# confirm whether such rows should be filtered out or re-drawn.
synthetic_data = [
    [source, target, demand]
    for source, target, demand in zip(synthetic_sources, synthetic_targets, synthetic_demands)
]

In [None]:
# Convert synthetic data to a DataFrame
synthetic_df = pd.DataFrame(synthetic_data, columns=['source', 'target', 'demand_value'])

# Write the CSV, creating the output directory first so that a fresh checkout
# without an existing output_data/ folder does not fail in to_csv
output_csv_path = 'output_data/synthetic_data.csv'
Path(output_csv_path).parent.mkdir(parents=True, exist_ok=True)
synthetic_df.to_csv(output_csv_path, index=False)

# Check whether any generated rows have the same source and target

In [14]:
# Read the exported synthetic rows back from disk
file_path = 'output_data/synthetic_data.csv'
synthetic_data = pd.read_csv(file_path)

# Keep only the rows whose source node is also their target node
same_endpoint_mask = synthetic_data['source'].eq(synthetic_data['target'])
repeated_rows = synthetic_data.loc[same_endpoint_mask]

# Show how many such rows exist, followed by the rows themselves
repeated_rows_count = (len(repeated_rows), repeated_rows)
repeated_rows_count

(12,
      source  target  demand_value
 3    STTLng  STTLng     45.007594
 5    DNVRng  DNVRng     70.265431
 6    IPLSng  IPLSng     54.982571
 9    WASHng  WASHng     53.779053
 19   ATLAM5  ATLAM5      1.542439
 33   HSTNng  HSTNng     20.217529
 35   WASHng  WASHng     48.909946
 62   LOSAng  LOSAng     65.500599
 87   HSTNng  HSTNng     10.028546
 88   HSTNng  HSTNng     75.168888
 113  LOSAng  LOSAng     53.296147
 122  WASHng  WASHng     50.659047)