In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, LeakyReLU, BatchNormalization, Input
from tensorflow.keras.optimizers import Adam

import os

In [2]:
latent_dim = 100
generator_checkpoint_path = 'generator_checkpoint.h5'
discriminator_checkpoint_path = 'discriminator_checkpoint.h5'
epochs = 2000
batch_size = 64
checkpoint_interval = 1000

In [8]:
file_path = 'Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv'
df = pd.read_csv(file_path)

df.shape

(225745, 85)

In [23]:
file_path = 'Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv'
df = pd.read_csv(file_path)

columns_to_drop = ["Flow ID", " Source IP", " Source Port", " Destination IP", 
                   " Destination Port", " Protocol", " Timestamp", " Fwd Header Length", 
                   " Bwd Header Length", " Fwd Packet Length Mean", " Bwd Packet Length Mean", 
                   " Bwd Packet Length Std", "Fwd Packets/s", " Bwd Packets/s", " Fwd IAT Mean", 
                   " Fwd IAT Std", " Bwd IAT Mean", " Bwd IAT Std", "Fwd PSH Flags", " Bwd PSH Flags", 
                   " Fwd URG Flags", " Bwd URG Flags", "Fwd Packets/s", " Bwd Packets/s", 
                   "Bwd Packet Length Max", " Bwd Packet Length Std", "Fwd Avg Bytes/Bulk", 
                   " Fwd Avg Packets/Bulk", " Fwd Avg Bulk Rate", " Bwd Avg Bytes/Bulk", 
                   " Bwd Avg Packets/Bulk", "Bwd Avg Bulk Rate", "Subflow Fwd Packets", 
                   " Subflow Bwd Packets", "Init_Win_bytes_forward", " Init_Win_bytes_backward", 
                   " act_data_pkt_fwd", " min_seg_size_forward"]
df.drop(columns_to_drop, axis=1, inplace=True)

# categorical_columns = df.select_dtypes(include=['object']).columns
# label_encoders = {}
# for col in categorical_columns:
#     le = LabelEncoder()
#     df[col] = le.fit_transform(df[col])
#     label_encoders[col] = le

df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)

scaler = MinMaxScaler()
benign_data = scaler.fit_transform(df[df[' Label'] == 'BENIGN'].drop(columns=[' Label']))
ddos_data = scaler.fit_transform(df[df[' Label'] == 'DDoS'].drop(columns=[' Label']))

print("Benign data min and max after scaling:", np.min(benign_data), np.max(benign_data))
print("DDoS data min and max after scaling:", np.min(ddos_data), np.max(ddos_data))

print(df.shape)
df

Benign data min and max after scaling: 0.0 1.0
DDoS data min and max after scaling: 0.0 1.0000000000000002
(225711, 50)


In [26]:
df[' Label'].unique()

array(['BENIGN', 'DDoS'], dtype=object)

In [15]:
file_path = 'Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv'
df = pd.read_csv(file_path)
for feature in columns_to_drop:
    if(feature not in df.columns):
        print(feature, "feature doesnt exists")

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225745 entries, 0 to 225744
Data columns (total 50 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0    Flow Duration                225745 non-null  int64  
 1    Total Fwd Packets            225745 non-null  int64  
 2    Total Backward Packets       225745 non-null  int64  
 3   Total Length of Fwd Packets   225745 non-null  int64  
 4    Total Length of Bwd Packets  225745 non-null  int64  
 5    Fwd Packet Length Max        225745 non-null  int64  
 6    Fwd Packet Length Min        225745 non-null  int64  
 7    Fwd Packet Length Std        225745 non-null  float64
 8    Bwd Packet Length Min        225745 non-null  int64  
 9   Flow Bytes/s                  225741 non-null  float64
 10   Flow Packets/s               225745 non-null  float64
 11   Flow IAT Mean                225745 non-null  float64
 12   Flow IAT Std                 225745 non-nul

In [11]:
total_columns = [' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets',
       'Total Length of Fwd Packets', ' Total Length of Bwd Packets',
       ' Fwd Packet Length Max', ' Fwd Packet Length Min',
       ' Fwd Packet Length Std', ' Bwd Packet Length Min', 'Flow Bytes/s',
       ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max',
       ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Max', ' Fwd IAT Min',
       'Bwd IAT Total', ' Bwd IAT Max', ' Bwd IAT Min', ' Min Packet Length',
       ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std',
       ' Packet Length Variance', 'FIN Flag Count', ' SYN Flag Count',
       ' RST Flag Count', ' PSH Flag Count', ' ACK Flag Count',
       ' URG Flag Count', ' CWE Flag Count', ' ECE Flag Count',
       ' Down/Up Ratio', ' Average Packet Size', ' Avg Fwd Segment Size',
       ' Avg Bwd Segment Size', ' Fwd Header Length.1', ' Subflow Fwd Bytes',
       ' Subflow Bwd Bytes', 'Active Mean', ' Active Std', ' Active Max',
       ' Active Min', 'Idle Mean', ' Idle Std', ' Idle Max', ' Idle Min',
       ' Label']

print(len(total_columns))

50


In [4]:

def build_generator(input_dim):
    model = Sequential([
        Dense(128, input_dim=input_dim),
        LeakyReLU(alpha=0.2),
        BatchNormalization(momentum=0.8),
        Dense(256),
        LeakyReLU(alpha=0.2),
        BatchNormalization(momentum=0.8),
        Dense(512),
        LeakyReLU(alpha=0.2),
        BatchNormalization(momentum=0.8),
        Dense(benign_data.shape[1], activation='tanh')
    ])
    return model

In [5]:
def build_discriminator(input_dim):
    model = Sequential([
        Dense(512, input_dim=input_dim),
        LeakyReLU(alpha=0.2),
        Dense(256),
        LeakyReLU(alpha=0.2),
        Dense(1, activation='sigmoid')
    ])
    return model

In [6]:
def train_gan(benign_data, ddos_data, epochs, batch_size, checkpoint_interval):
    half_batch = batch_size // 2
    adam = Adam(learning_rate=0.0002, beta_1=0.5)

    generator = load_model(generator_checkpoint_path, compile=False) if os.path.exists(generator_checkpoint_path) else build_generator()
    discriminator = load_model(discriminator_checkpoint_path, compile=False) if os.path.exists(discriminator_checkpoint_path) else build_discriminator(benign_data.shape[1])
    
    discriminator.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

    z = Input(shape=(latent_dim,))
    generated_data = generator(z)
    discriminator.trainable = False
    validity = discriminator(generated_data)
    gan = Model(z, validity)
    gan.compile(loss='binary_crossentropy', optimizer=adam)

    for epoch in range(epochs):
        idx = np.random.randint(0, benign_data.shape[0], half_batch)
        real_data = benign_data[idx]
        noise = np.random.normal(0, 1, (half_batch, latent_dim))
        synthetic_data = generator.predict(noise)

        d_loss_real = discriminator.train_on_batch(real_data, np.ones((half_batch, 1)))
        d_loss_fake = discriminator.train_on_batch(synthetic_data, np.zeros((half_batch, 1)))
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

        noise = np.random.normal(0, 1, (batch_size, latent_dim))
        valid_y = np.ones((batch_size, 1))
        g_loss = gan.train_on_batch(noise, valid_y)

        print(f"Epoch {epoch+1}/{epochs} [D loss: {d_loss[0]}, acc.: {100*d_loss[1]:.2f}] [G loss: {g_loss[0]}]")

        if (epoch + 1) % checkpoint_interval == 0:
            generator.save(generator_checkpoint_path)
            discriminator.save(discriminator_checkpoint_path)
            print(f"Checkpoint saved at epoch {epoch+1}")

    noise = np.random.normal(0, 1, (benign_data.shape[0], latent_dim))
    synthetic_ddos_data = generator.predict(noise)
    synthetic_ddos_data = scaler.inverse_transform(synthetic_ddos_data)
    synthetic_ddos_df = pd.DataFrame(synthetic_ddos_data, columns=df.columns.drop(' Label'))
    synthetic_ddos_df[' Label'] = 'DDoS'

    benign_df = pd.DataFrame(scaler.inverse_transform(benign_data), columns=df.columns.drop(' Label'))
    benign_df[' Label'] = 'BENIGN'

    mixed_df = pd.concat([benign_df, synthetic_ddos_df], ignore_index=True)
    mixed_df.to_csv("mixed_benign_ddos_data.csv", index=False)
    print("Mixed synthetic benign and DDoS data saved to mixed_benign_ddos_data.csv.")

train_gan(benign_data, ddos_data, epochs=epochs, batch_size=batch_size, checkpoint_interval=checkpoint_interval)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 212ms/step




Epoch 1/2000 [D loss: 1.2602232694625854, acc.: 46.88] [G loss: [array(1.8427575, dtype=float32), array(1.8427575, dtype=float32), array(0.3125, dtype=float32)]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
Epoch 2/2000 [D loss: 1.6493029594421387, acc.: 36.46] [G loss: [array(1.8416446, dtype=float32), array(1.8416446, dtype=float32), array(0.3125, dtype=float32)]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
Epoch 3/2000 [D loss: 1.7245218753814697, acc.: 34.95] [G loss: [array(1.8396693, dtype=float32), array(1.8396693, dtype=float32), array(0.31770834, dtype=float32)]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Epoch 4/2000 [D loss: 1.7570526599884033, acc.: 33.48] [G loss: [array(1.839705, dtype=float32), array(1.839705, dtype=float32), array(0.3125, dtype=float32)]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Epoch 5/2000 [D loss: 1.7751789093017578, acc.: 34.31] [G loss



Epoch 1000/2000 [D loss: 1.8469305038452148, acc.: 31.72] [G loss: [array(1.8472244, dtype=float32), array(1.8472244, dtype=float32), array(0.317125, dtype=float32)]]
Checkpoint saved at epoch 1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Epoch 1001/2000 [D loss: 1.8469352722167969, acc.: 31.72] [G loss: [array(1.8472277, dtype=float32), array(1.8472277, dtype=float32), array(0.31710476, dtype=float32)]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Epoch 1002/2000 [D loss: 1.8469390869140625, acc.: 31.72] [G loss: [array(1.8472321, dtype=float32), array(1.8472321, dtype=float32), array(0.31711575, dtype=float32)]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Epoch 1003/2000 [D loss: 1.8469386100769043, acc.: 31.72] [G loss: [array(1.8472289, dtype=float32), array(1.8472289, dtype=float32), array(0.31714234, dtype=float32)]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Epoch 1



Epoch 2000/2000 [D loss: 1.8483422994613647, acc.: 31.69] [G loss: [array(1.8484895, dtype=float32), array(1.8484895, dtype=float32), array(0.31690624, dtype=float32)]]




Checkpoint saved at epoch 2000
[1m3053/3053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 9ms/step
Mixed synthetic benign and DDoS data saved to mixed_benign_ddos_data.csv.


## reading the generated output


In [27]:
syn_output = pd.read_csv('mixed_benign_ddos_data.csv')
syn_output.head()

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Min,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,4.46472,1.005179,0.0,6.004459,0.0,6.007192,0.024457,3.010861,0.0,0.0,...,20.384615,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
1,96.27981,1.0,0.002719,6.002229,0.013471,6.007192,0.024457,3.010861,0.0,6.0,...,20.384615,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
2,46.907545,1.0,0.002719,6.002229,0.013471,6.007192,0.024457,3.010861,0.0,6.0,...,20.384615,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
3,31.316303,1.0,0.002719,6.002229,0.013471,6.007192,0.024457,3.010861,0.0,6.0,...,20.384615,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
4,4.46472,1.005179,0.0,6.004459,0.0,6.007192,0.024457,3.010861,0.0,0.0,...,20.384615,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN


In [28]:
df.head()

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Min,Flow Bytes/s,...,Subflow Bwd Bytes,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,3,2,0,12,0,6,6,0.0,0,4000000.0,...,0,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,109,1,1,6,6,6,6,0.0,6,110091.7,...,6,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,52,1,1,6,6,6,6,0.0,6,230769.2,...,6,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,34,1,1,6,6,6,6,0.0,6,352941.2,...,6,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,3,2,0,12,0,6,6,0.0,0,4000000.0,...,0,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [30]:
for feature in df.columns:
    if(feature not in syn_output.columns):
        print(feature, "feature doesnt exists")

 Flow Duration feature doesnt exists


In [36]:
syn_output.rename(columns={'Flow Duration': ' Flow Duration'}, inplace=True)

## Dropping 21 columns

In [40]:
for feature in syn_output.columns:
    if(feature not in df.columns):
        print(feature, "feature doesnt exists")
        syn_output.drop(feature, inplace=True, axis=1)



 Fwd Packet Length Mean feature doesnt exists
 Bwd Packet Length Mean feature doesnt exists
 Bwd Packet Length Std feature doesnt exists
 Fwd IAT Mean feature doesnt exists
 Fwd IAT Std feature doesnt exists
 Bwd IAT Mean feature doesnt exists
 Bwd IAT Std feature doesnt exists
 Bwd PSH Flags feature doesnt exists
 Fwd URG Flags feature doesnt exists
 Bwd URG Flags feature doesnt exists
 Fwd Header Length feature doesnt exists
 Bwd Header Length feature doesnt exists
 Bwd Packets/s feature doesnt exists
 Fwd Avg Packets/Bulk feature doesnt exists
 Fwd Avg Bulk Rate feature doesnt exists
 Bwd Avg Bytes/Bulk feature doesnt exists
 Bwd Avg Packets/Bulk feature doesnt exists
 Subflow Bwd Packets feature doesnt exists
 Init_Win_bytes_backward feature doesnt exists
 act_data_pkt_fwd feature doesnt exists
 min_seg_size_forward feature doesnt exists


In [58]:
t1 = pd.read_csv('P1_real.csv')
for feature in syn_output.columns:
    if(feature not in df.columns):
        print(feature, "feature doesnt exists")


## Cropping Synthetic data 

In [45]:
akm = pd.read_csv('Master_sends_data.csv')
akm.shape

(195372, 50)

In [48]:
label_counts = akm[' Label'].value_counts()
print(label_counts)

 Label
BENIGN    97686
DDoS      97686
Name: count, dtype: int64


In [49]:
sampled_akm = akm.groupby(' Label').apply(lambda x: x.sample(n=6000, random_state=1)).reset_index(drop=True)


  sampled_akm = akm.groupby(' Label').apply(lambda x: x.sample(n=6000, random_state=1)).reset_index(drop=True)


In [52]:
sampled_label_counts = sampled_akm[' Label'].value_counts()
print(sampled_label_counts)

 Label
BENIGN    6000
DDoS      6000
Name: count, dtype: int64


### Sampling for P4

In [53]:
# Split into 5000+5000 for p2_akm and 1000+1000 for p4_akm
p2_akm = sampled_akm.groupby(' Label').apply(lambda x: x.iloc[:5000]).reset_index(drop=True)
p4_akm = sampled_akm.groupby(' Label').apply(lambda x: x.iloc[5000:6000]).reset_index(drop=True)

  p2_akm = sampled_akm.groupby(' Label').apply(lambda x: x.iloc[:5000]).reset_index(drop=True)
  p4_akm = sampled_akm.groupby(' Label').apply(lambda x: x.iloc[5000:6000]).reset_index(drop=True)


In [55]:
p4_label_counts = p4_akm[' Label'].value_counts()
print(p4_label_counts)

p2_label_counts = p2_akm[' Label'].value_counts()
print(p2_label_counts)

 Label
BENIGN    1000
DDoS      1000
Name: count, dtype: int64
 Label
BENIGN    5000
DDoS      5000
Name: count, dtype: int64


In [56]:
p2_akm.to_csv('p2_synthetic.csv',  index=False)
p4_akm.to_csv('p4_synthetic.csv',   index=False)