In [65]:
import xml.etree.ElementTree as ET
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Network Traffic Data Processing

In [66]:
def process_network_traffic_data(path):
    """Parse the demands of an SNDlib-namespaced XML file into a DataFrame.

    Every ``demands/demand`` element under the document root (namespace
    ``http://sndlib.zib.de/network``) becomes one row.

    Args:
        path: File name or file object accepted by ``ET.parse``.

    Returns:
        pandas.DataFrame with columns ``source`` (str), ``target`` (str) and
        ``demand_value`` (float), in document order. The three columns are
        present even when the file contains no demands.

    Raises:
        ValueError: If a demand lacks a ``source``, ``target`` or
            ``demandValue`` child, or if its value is not a valid float.
    """
    ns = {'net': 'http://sndlib.zib.de/network'}
    columns = ['source', 'target', 'demand_value']

    root = ET.parse(path).getroot()

    demands = []
    for demand in root.findall('net:demands/net:demand', ns):
        # findtext returns None for a missing child instead of failing on `.text`
        source = demand.findtext('net:source', namespaces=ns)
        target = demand.findtext('net:target', namespaces=ns)
        raw_value = demand.findtext('net:demandValue', namespaces=ns)
        if source is None or target is None or raw_value is None:
            raise ValueError(
                f"Demand {demand.get('id')!r} is missing a source, target or demandValue element"
            )
        demands.append({'source': source, 'target': target, 'demand_value': float(raw_value)})

    # Explicit columns keep the schema stable when `demands` is empty
    return pd.DataFrame(demands, columns=columns)

In [None]:
# Load the demand matrix and turn it into an all-numeric table for the GAN:
# the endpoint columns are label-encoded and the demand column is min-max scaled.
data_path = 'input_data/demandMatrix-abilene-zhang-5min-20040910-2325.xml'
data = process_network_traffic_data(data_path)
print(data)

# One encoder per endpoint column, replacing node names with integer codes.
# The fitted encoders stay in scope so codes can be mapped back to names later.
le_source, le_target = LabelEncoder(), LabelEncoder()
data = data.assign(
    source=le_source.fit_transform(data['source']),
    target=le_target.fit_transform(data['target']),
)

# Min-max scale the demand values; the fitted scaler is reused to undo the scaling
scaler = MinMaxScaler()
data['demand_value'] = scaler.fit_transform(data[['demand_value']])
print(data)

# GAN Architecture Definition

In [68]:
# Define input and output dimensions
# input_dim is the length of the random noise vector fed to the generator.
# output_dim is the number of values in each generated row (one per data column)
# and is also the width of the rows the discriminator receives.
input_dim = 10  # noise dimension
output_dim = 3  # source, target, demand_value

In [69]:
# Define the generator
class Generator(nn.Module):
    """Fully connected network mapping a noise vector to one synthetic row.

    The network is a stack of ``Linear -> ReLU`` hidden layers followed by a
    final ``Linear`` layer with no activation, so outputs are unbounded.

    Args:
        input_dim: Length of the noise vector given to ``forward``.
        output_dim: Number of values produced per sample.
        hidden_dims: Widths of the hidden layers, in order. The default
            ``(128, 64)`` reproduces the original fixed architecture
            (same layer order and ``model`` indices).
    """

    def __init__(self, input_dim, output_dim, hidden_dims=(128, 64)):
        super().__init__()
        layers = []
        in_features = input_dim
        for width in hidden_dims:
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.ReLU())
            in_features = width
        # Output layer is left linear (no activation)
        layers.append(nn.Linear(in_features, output_dim))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return generated rows for a batch of noise vectors ``x`` (last dim ``input_dim``)."""
        return self.model(x)

In [70]:
# Define the discriminator
class Discriminator(nn.Module):
    """Fully connected binary classifier scoring rows as real or generated.

    The network is a stack of ``Linear -> LeakyReLU`` hidden layers followed by
    ``Linear(…, 1) -> Sigmoid``, so each input row yields one value in [0, 1].

    Args:
        output_dim: Number of values per input row (the generator's output size).
        hidden_dims: Widths of the hidden layers, in order. The default
            ``(64, 64)`` reproduces the original fixed architecture
            (same layer order and ``model`` indices).
        negative_slope: Slope of every LeakyReLU for negative inputs.
    """

    def __init__(self, output_dim, hidden_dims=(64, 64), negative_slope=0.2):
        super().__init__()
        layers = []
        in_features = output_dim
        for width in hidden_dims:
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.LeakyReLU(negative_slope))
            in_features = width
        # Single sigmoid unit: the output is a probability-like score
        layers.append(nn.Linear(in_features, 1))
        layers.append(nn.Sigmoid())
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return one score per row of ``x`` (last dim ``output_dim``), shape ``(..., 1)``."""
        return self.model(x)

# Training

In [71]:
# Initialize models
# Seed the random number generators before any weights are created so that
# weight initialisation and later random draws repeat on every
# Restart & Run All. torch.manual_seed covers PyTorch; np.random.seed covers
# NumPy's global RNG, which pandas `sample` uses when no random_state is given.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

generator = Generator(input_dim, output_dim)
discriminator = Discriminator(output_dim)

In [72]:
# Training hyperparameters
num_epochs = 2000
batch_size = 32
lr = 0.0002

# Target values the discriminator is trained towards for real and generated rows
real_label = 1
fake_label = 0

# Binary cross-entropy between discriminator scores and the labels above
criterion = nn.BCELoss()

# One Adam optimizer per network, both using the same learning rate
optim_G = torch.optim.Adam(generator.parameters(), lr=lr)
optim_D = torch.optim.Adam(discriminator.parameters(), lr=lr)

In [None]:
# Convert the prepared frame to a tensor once; doing it per batch is loop-invariant work
real_tensor = torch.tensor(data.values, dtype=torch.float32)
num_rows = real_tensor.shape[0]
if num_rows < batch_size:
    # Without this check the batch loop would never run and the loss print
    # below would fail with a NameError on the first epoch.
    raise ValueError(f"Need at least batch_size={batch_size} rows to train, got {num_rows}")
batches_per_epoch = num_rows // batch_size

# Target tensors are the same for every batch, so build them once
label_real = torch.full((batch_size, 1), real_label, dtype=torch.float32)
label_fake = torch.full((batch_size, 1), fake_label, dtype=torch.float32)

for epoch in range(num_epochs):
    # Shuffle once per epoch so a row is used at most once per epoch
    # (rows left over after the last full batch are skipped for that epoch).
    permutation = torch.randperm(num_rows)
    for batch_idx in range(batches_per_epoch):
        batch_rows = permutation[batch_idx * batch_size:(batch_idx + 1) * batch_size]

        # ---------------------
        # Train Discriminator
        # ---------------------
        real_data = real_tensor[batch_rows]

        # Generate fake data
        noise = torch.randn(batch_size, input_dim)
        fake_data = generator(noise)

        # Train on real data
        optim_D.zero_grad()
        output_real = discriminator(real_data)
        loss_real = criterion(output_real, label_real)
        loss_real.backward()

        # Train on fake data; detach so this step does not backpropagate into the generator
        output_fake = discriminator(fake_data.detach())
        loss_fake = criterion(output_fake, label_fake)
        loss_fake.backward()
        optim_D.step()

        # ---------------------
        # Train Generator
        # ---------------------
        optim_G.zero_grad()
        output = discriminator(fake_data)
        loss_G = criterion(output, label_real)  # Aim to fool the discriminator
        loss_G.backward()
        optim_G.step()

    # Print the losses of the last batch every 100 epochs
    if epoch % 100 == 0:
        print(f"Epoch {epoch}: Loss_D: {(loss_real + loss_fake).item():.4f}, Loss_G: {loss_G.item():.4f}")

# Generate Synthetic Data

In [74]:
# Number of synthetic samples to generate (one per row of the prepared data)
num_samples = len(data)

# Generate synthetic data as a list of [source, target, demand_value] rows
synthetic_data = []
if num_samples > 0:
    # One batched forward pass instead of one call per row; no_grad skips
    # autograd bookkeeping, which is not needed for inference.
    with torch.no_grad():
        noise = torch.randn(num_samples, input_dim)
        generated = generator(noise).numpy()

    # Round the first two outputs to the nearest integer and clamp them into
    # the index range known to each label encoder.
    max_source_idx = len(le_source.classes_) - 1
    max_target_idx = len(le_target.classes_) - 1
    source_idx = np.clip(np.rint(generated[:, 0]), 0, max_source_idx).astype(int)
    target_idx = np.clip(np.rint(generated[:, 1]), 0, max_target_idx).astype(int)

    # Map indices back to original categories
    synthetic_sources = le_source.inverse_transform(source_idx)
    synthetic_targets = le_target.inverse_transform(target_idx)

    # Clamp the scaled demand to the scaler's output range before undoing the
    # scaling. The generator output is unbounded, so without this the result
    # can fall outside the range seen in the data (e.g. negative demand).
    scaled_low, scaled_high = scaler.feature_range
    scaled_demand = np.clip(generated[:, [2]].astype(np.float64), scaled_low, scaled_high)
    synthetic_demand_values = scaler.inverse_transform(scaled_demand)[:, 0]

    synthetic_data = [
        [source, target, demand_value]
        for source, target, demand_value in zip(
            synthetic_sources, synthetic_targets, synthetic_demand_values
        )
    ]

In [75]:
# Convert synthetic data to a DataFrame
synthetic_df = pd.DataFrame(synthetic_data, columns=['source', 'target', 'demand_value'])

# Write the result as CSV, creating the output directory first so the export
# does not fail on a fresh checkout where it does not exist yet.
output_csv_path = 'output_data/synthetic_data.csv'
Path(output_csv_path).parent.mkdir(parents=True, exist_ok=True)
synthetic_df.to_csv(output_csv_path, index=False)