## Imports

In [None]:
import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from functions import combine_first_n_datasets

## Network Traffic Data Processing

In [28]:
# Create a dataset that combines n datasets (there are 2000 in the input_data folder)
# and save a copy of the combined, still un-encoded data to disk.
path = 'data'
num_files = 129 # Half a day
combined_data = combine_first_n_datasets(folder_path=path, n_files=num_files)
# DataFrame.to_csv returns None when it is given a path, so its result is not stored.
combined_data.to_csv("real_data.csv", index=False)
print(len(combined_data))

16546


In [18]:
# Turn the raw frame into purely numeric GAN input:
#   - 'source' and 'target' become integer codes (one LabelEncoder per column)
#   - 'demand_value' is rescaled with a MinMaxScaler
# The fitted encoders and scaler are kept so generated rows can be mapped back later.
le_source, le_target = LabelEncoder(), LabelEncoder()
for column, encoder in (('source', le_source), ('target', le_target)):
    combined_data[column] = encoder.fit_transform(combined_data[column])

# MinMaxScaler expects a 2-D input, hence the double brackets
scaler = MinMaxScaler()
scaled_demand = scaler.fit_transform(combined_data[['demand_value']])
combined_data['demand_value'] = scaled_demand

print(combined_data)

       source  target  demand_value
0           0       1      0.000146
1           0       2      0.001121
2           0       3      0.000068
3           0       4      0.000568
4           0       5      0.000280
...       ...     ...           ...
16541      11       6      0.013894
16542      11       7      0.074850
16543      11       8      0.149077
16544      11       9      0.000536
16545      11      10      0.041923

[16546 rows x 3 columns]


## GAN Architecture Definition

In [19]:
# Sizes shared by the generator and the discriminator:
#   input_dim  - length of the noise vector
#   output_dim - values per row (source, target, demand_value)
input_dim, output_dim = 10, 3

In [20]:
# Generator network
# A noise vector passes through a shared trunk of two fully connected layers with ReLU
# activations; two separate heads then produce the (source, target) values and the
# demand value.
class Generator(nn.Module):
    """Map noise vectors to rows of (source, target, demand_value)."""

    def __init__(self, input_dim, output_dim):
        """Create the shared trunk and the two output heads.

        Args:
            input_dim: Length of the noise vector.
            output_dim: Number of values per generated row. The last value is the
                demand; the other ``output_dim - 1`` are source and target.
        """
        super().__init__()
        trunk_layers = [
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
        ]
        self.model = nn.Sequential(*trunk_layers)
        # Plain linear head for source and target
        self.source_target_output = nn.Linear(64, output_dim - 1)
        # Softplus keeps the demand output strictly positive
        demand_layers = [nn.Linear(64, 1), nn.Softplus()]
        self.demand_output = nn.Sequential(*demand_layers)

    def forward(self, x):
        """Return a tensor whose columns are source, target and demand, in that order."""
        features = self.model(x)  # 64 features per row
        return torch.cat(
            [self.source_target_output(features), self.demand_output(features)],
            dim=1,
        )

In [21]:
# Discriminator network
# Scores rows as real or generated. It stacks fully connected layers with LeakyReLU
# activations, and the final sigmoid yields a single probability per row: close to 1
# means "real", close to 0 means "fake".
class Discriminator(nn.Module):
    """Binary classifier over rows of (source, target, demand_value)."""

    def __init__(self, output_dim):
        """Create the scoring stack.

        Args:
            output_dim: Number of values in each row the discriminator receives.
        """
        super().__init__()
        scoring_layers = [
            nn.Linear(output_dim, 64),
            nn.LeakyReLU(0.2),
            nn.Linear(64, 64),
            nn.LeakyReLU(0.2),
            nn.Linear(64, 1),
            nn.Sigmoid(),
        ]
        self.model = nn.Sequential(*scoring_layers)

    def forward(self, x):
        """Return one probability per input row, shaped (rows, 1)."""
        realness = self.model(x)
        return realness

## Training

In [22]:
# Initialize models
# Seed PyTorch's global RNG first so that weight initialisation and later torch.randn
# draws are repeatable when the notebook is re-run from a fresh kernel.
torch.manual_seed(42)
generator = Generator(input_dim, output_dim)
discriminator = Discriminator(output_dim)

In [23]:
# Training hyper-parameters
lr = 0.0002
num_epochs = 1000
batch_size = 32

# Target values the discriminator is trained towards
real_label = 1
fake_label = 0

# Binary cross-entropy between predicted probabilities and the real/fake targets
criterion = nn.BCELoss()

# One Adam optimizer per network; each updates only that network's weights
optim_G = torch.optim.Adam(generator.parameters(), lr=lr)
optim_D = torch.optim.Adam(discriminator.parameters(), lr=lr)

In [24]:
# Train the GAN, alternating one discriminator update and one generator update per batch.

# Convert the encoded/scaled frame to a tensor once instead of on every step.
real_tensor = torch.tensor(combined_data.values, dtype=torch.float32)
num_rows = real_tensor.shape[0]
steps_per_epoch = num_rows // batch_size
if steps_per_epoch == 0:
    # With no batches the losses printed below would never be defined.
    raise ValueError(
        f"Need at least batch_size={batch_size} rows to train, got {num_rows}"
    )

# The discriminator targets are identical for every batch, so build them once.
label_real = torch.full((batch_size, 1), real_label, dtype=torch.float32)  # ground truth for real rows (all 1)
label_fake = torch.full((batch_size, 1), fake_label, dtype=torch.float32)  # ground truth for generated rows (all 0)

for epoch in range(num_epochs):
    # Fresh shuffle per epoch: every batch holds distinct rows and an epoch visits each
    # row at most once (rows beyond the last full batch are skipped for that epoch).
    shuffled_rows = torch.randperm(num_rows)
    for step in range(steps_per_epoch):
        # ---------------------
        # Train Discriminator
        # ---------------------
        batch_rows = shuffled_rows[step * batch_size:(step + 1) * batch_size]
        real_data = real_tensor[batch_rows]

        # Generate fake data from fresh noise
        noise = torch.randn(batch_size, input_dim)
        fake_data = generator(noise)

        # Real batch: gradients accumulate until optim_D.step()
        optim_D.zero_grad()
        output_real = discriminator(real_data)
        loss_real = criterion(output_real, label_real)
        loss_real.backward()

        # Fake batch: detach so this backward pass does not reach the generator
        output_fake = discriminator(fake_data.detach())
        loss_fake = criterion(output_fake, label_fake)
        loss_fake.backward()
        optim_D.step()

        # ---------------------
        # Train Generator
        # ---------------------
        optim_G.zero_grad()
        output = discriminator(fake_data)
        loss_G = criterion(output, label_real)  # Aim to fool the discriminator
        loss_G.backward()
        optim_G.step()

    # Print the losses of the last batch every 100 epochs
    if epoch % 100 == 0:
        print(f"Epoch {epoch}: Loss_D: {(loss_real + loss_fake).item():.4f}, Loss_G: {loss_G.item():.4f}")

Epoch 0: Loss_D: 0.6863, Loss_G: 1.3160
Epoch 100: Loss_D: 1.3874, Loss_G: 0.7021
Epoch 200: Loss_D: 1.3727, Loss_G: 0.6939
Epoch 300: Loss_D: 1.3705, Loss_G: 0.7120
Epoch 400: Loss_D: 1.3804, Loss_G: 0.6750
Epoch 500: Loss_D: 1.3693, Loss_G: 0.7312
Epoch 600: Loss_D: 1.3475, Loss_G: 0.6859
Epoch 700: Loss_D: 1.3873, Loss_G: 0.6956
Epoch 800: Loss_D: 1.3491, Loss_G: 0.7079
Epoch 900: Loss_D: 1.3539, Loss_G: 0.7521


## Generate Synthetic Data

In [25]:
# Number of synthetic samples to generate
num_samples = len(combined_data)

# Upper bound on resampling rounds, so a generator that only ever produces rows with
# source == target raises an error instead of looping forever.
max_rounds = 1000

# Largest valid encoded index for each categorical column
max_source_idx = len(le_source.classes_) - 1
max_target_idx = len(le_target.classes_) - 1

# Batched rejection sampling: draw candidates, keep the rows whose source and target
# differ, and repeat until enough valid rows have been collected.
valid_batches = []
num_valid = 0
with torch.no_grad():  # inference only, no autograd graph needed
    for _ in range(max_rounds):
        if num_valid >= num_samples:
            break
        candidates = generator(torch.randn(num_samples, input_dim))

        # Round to the nearest index, then clamp into the range of the label encoders
        source_idx = candidates[:, 0].round().clamp(0, max_source_idx)
        target_idx = candidates[:, 1].round().clamp(0, max_target_idx)

        # Enforce the constraint that source and target should not be the same
        keep = source_idx != target_idx
        valid_batches.append(
            torch.stack((source_idx[keep], target_idx[keep], candidates[keep, 2]), dim=1)
        )
        num_valid += int(keep.sum())

if num_valid < num_samples:
    raise RuntimeError(
        f"Only {num_valid} of {num_samples} valid rows after {max_rounds} rounds; "
        "the generator almost always produces source == target"
    )

# Each row of synthetic_data is [source, target, demand_value] in original units
synthetic_data = []
if num_samples > 0:
    valid_rows = torch.cat(valid_batches)[:num_samples].numpy()

    # Map indices back to original categories
    synthetic_sources = le_source.inverse_transform(valid_rows[:, 0].astype(int))
    synthetic_targets = le_target.inverse_transform(valid_rows[:, 1].astype(int))

    # Inverse transform demand_value (the scaler expects a 2-D column)
    synthetic_demands = scaler.inverse_transform(valid_rows[:, 2:3])[:, 0]

    synthetic_data = [
        list(row) for row in zip(synthetic_sources, synthetic_targets, synthetic_demands)
    ]

In [26]:
# Put the generated rows into a DataFrame and write them to CSV
output_csv_path = 'synthetic_data.csv'
synthetic_df = pd.DataFrame(
    data=synthetic_data,
    columns=['source', 'target', 'demand_value'],
)
synthetic_df.to_csv(output_csv_path, index=False)

## Check that no generated row has the same source and target

In [27]:
# Read the saved synthetic dataset back from disk
file_path = 'synthetic_data.csv'
synthetic_data = pd.read_csv(file_path)

# Select the rows whose source equals their target
same_endpoint_mask = synthetic_data['source'].eq(synthetic_data['target'])
repeated_rows = synthetic_data[same_endpoint_mask]

# Show how many such rows exist, together with the rows themselves
repeated_rows_count = (len(repeated_rows), repeated_rows)
repeated_rows_count

(0,
 Empty DataFrame
 Columns: [source, target, demand_value]
 Index: [])