In [None]:
import pandas as pd
import numpy as np

# Generate random data for TCP packets
np.random.seed(0)

# Source IP addresses
src_ips = np.random.randint(0, 256, size=(100, 4))
src_ips = ['.'.join(map(str, ip)) for ip in src_ips]

# Destination IP addresses
dst_ips = np.random.randint(0, 256, size=(100, 4))
dst_ips = ['.'.join(map(str, ip)) for ip in dst_ips]

# Source ports
src_ports = np.random.randint(0, 65536, size=100)

# Destination ports
dst_ports = np.random.randint(0, 65536, size=100)

# Packet lengths
packet_lengths = np.random.randint(0, 1500, size=100)

# Flags (e.g., SYN, ACK, FIN, etc.)
flags = np.random.randint(0, 2, size=(100, 6))

# Timing information (e.g., timestamp, duration, etc.)
timing_info = np.random.rand(100, 3)

# Create a Pandas DataFrame
data = {
    'src_ip': src_ips,
    'dst_ip': dst_ips,
    'src_port': src_ports,
    'dst_port': dst_ports,
    'packet_length': packet_lengths,
    'flags': [tuple(flag) for flag in flags],
    'timing_info': [tuple(ti) for ti in timing_info]
}

df = pd.DataFrame(data)

print(df.head())
dataset=df

           src_ip          dst_ip  src_port  dst_port  packet_length  \
0  172.47.117.192   198.199.18.92     61761      7303           1258   
1  67.251.195.103    43.83.177.41     18880     51125            284   
2    9.211.21.242  93.174.149.201      2757     29419            172   
3    36.87.70.216  89.242.224.219     12120     61486            301   
4   88.140.58.193   73.28.235.209     23842     65008            822   

                flags                                        timing_info  
0  (1, 1, 1, 1, 0, 0)  (0.32626963264937237, 0.31654255989247604, 0.4...  
1  (1, 0, 0, 1, 0, 0)  (0.43307744910126844, 0.3573468796779544, 0.91...  
2  (0, 1, 0, 1, 1, 0)  (0.7317441854328928, 0.7275469913315297, 0.289...  
3  (1, 1, 0, 0, 0, 0)  (0.5777094243168404, 0.779179433301834, 0.7955...  
4  (0, 1, 1, 1, 0, 1)  (0.34453046075431226, 0.7708727565686478, 0.73...  


In [None]:
dataset.info()
dataset.isnull()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   src_ip         100 non-null    int64 
 1   dst_ip         100 non-null    object
 2   src_port       100 non-null    int64 
 3   dst_port       100 non-null    int64 
 4   packet_length  100 non-null    int64 
 5   flags          100 non-null    object
 6   timing_info    100 non-null    object
dtypes: int64(4), object(3)
memory usage: 5.6+ KB


Unnamed: 0,src_ip,dst_ip,src_port,dst_port,packet_length,flags,timing_info
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
95,False,False,False,False,False,False,False
96,False,False,False,False,False,False,False
97,False,False,False,False,False,False,False
98,False,False,False,False,False,False,False


LOAD DATASET

In [None]:
# Preprocess the data (e.g., convert IP addresses to numerical values)
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Convert IP addresses to strings
dataset['src_ip'] = dataset['src_ip'].astype(str)
dataset['dst_ip'] = dataset['dst_ip'].astype(str)

In [None]:
# Fit on all unique IP addresses to avoid unseen labels
all_ips = pd.concat([dataset['src_ip'], dataset['dst_ip']]).unique()
le.fit(all_ips)


In [None]:
df['src_ip'] = le.transform(dataset['src_ip'])
df['dst_ip'] = le.transform(dataset['dst_ip'])

# Split the data into features (X) and labels (y)
X = df.drop(['packet_length'], axis=1)
y = df['packet_length']

DEFINE GENERATOR

In [None]:
import torch
import torch.nn as nn


In [None]:

class TCPGenerator(nn.Module):
    def __init__(self, noise_dim, packet_len):
        super(TCPGenerator, self).__init__()
        self.noise_dim = noise_dim
        self.packet_len = packet_len

        # Define the Generator architecture
        self.fc1 = nn.Linear(noise_dim, 128)
        self.fc2 = nn.Linear(128, packet_len)

    def forward(self, noise):
        x = torch.relu(self.fc1(noise))
        x = self.fc2(x)
        return x

TRAIN GENERATOR

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define noise dimension
noise_dim = 100  # Example dimension, adjust as needed

# Assuming generator, criterion, optimizer, X, and y are already defined
# Example definitions (you should replace these with your actual model and data)
class Generator(nn.Module):
    def __init__(self, noise_dim, output_dim):
        super(Generator, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(noise_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, output_dim)  # Output layer size should match your input data
        )

    def forward(self, x):
        return self.fc(x)

# Define the dimensions of your data
input_dim = X.shape[1]

# Instantiate the generator
generator = Generator(noise_dim, input_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(generator.parameters(), lr=0.001)



In [None]:
 #Training loop
for epoch in range(100):
    for i, x in enumerate(X):
        # Generate random noise
        noise = torch.randn(1, noise_dim)

        # Pass the noise through the Generator
        packet = generator(noise)

        # Ensure y[i] is a tensor and reshape if necessary
        target = torch.tensor(y[i], dtype=torch.float32).unsqueeze(0)

        # Calculate the loss
        loss = criterion(packet, target)

        # Backpropagate and update the weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1, Loss: 0.09172070771455765
Epoch 2, Loss: 0.7185438275337219
Epoch 3, Loss: 9.618498802185059
Epoch 4, Loss: 23.43510627746582
Epoch 5, Loss: 144.66905212402344
Epoch 6, Loss: 208.0687713623047
Epoch 7, Loss: 1339.874755859375
Epoch 8, Loss: 3619.889404296875
Epoch 9, Loss: 5613.48583984375
Epoch 10, Loss: 25642.361328125
Epoch 11, Loss: 39066.9609375
Epoch 12, Loss: 132366.0625
Epoch 13, Loss: 217578.109375
Epoch 14, Loss: 136526.234375
Epoch 15, Loss: 290949.125
Epoch 16, Loss: 112140.3046875
Epoch 17, Loss: 190184.390625
Epoch 18, Loss: 323974.9375
Epoch 19, Loss: 160661.921875
Epoch 20, Loss: 166402.015625
Epoch 21, Loss: 246151.703125
Epoch 22, Loss: 360575.0
Epoch 23, Loss: 238939.0
Epoch 24, Loss: 386444.21875
Epoch 25, Loss: 90770.5078125
Epoch 26, Loss: 739792.1875
Epoch 27, Loss: 222184.625
Epoch 28, Loss: 106099.7734375
Epoch 29, Loss: 245825.578125
Epoch 30, Loss: 201753.203125
Epoch 31, Loss: 215605.171875
Epoch 32, Loss: 255296.484375
Epoch 33, Loss: 169800.40625


GENERATE SYNTHETIC TCP PACKETS

In [None]:
# Generate synthetic TCP packets
num_packets = 100
synthetic_packets = []
for i in range(num_packets):
    noise = torch.randn(1, noise_dim)
    packet = generator(noise)
    synthetic_packets.append(packet.detach().numpy())

print(synthetic_packets)

[array([[311.81894, 311.88464, 311.84982, 311.8765 , 311.81912, 311.84134]],
      dtype=float32), array([[387.4142 , 387.57104, 387.53082, 387.60196, 387.5081 , 387.5693 ]],
      dtype=float32), array([[215.99814, 215.86511, 215.97322, 215.9709 , 215.90953, 215.88449]],
      dtype=float32), array([[294.74884, 294.81335, 294.81277, 294.69174, 294.88077, 294.7713 ]],
      dtype=float32), array([[357.17502, 357.18546, 357.20038, 357.2091 , 357.22263, 357.1022 ]],
      dtype=float32), array([[208.10425, 208.12431, 208.15016, 208.18968, 208.2063 , 208.12537]],
      dtype=float32), array([[612.05273, 612.1278 , 612.2265 , 612.0462 , 612.2088 , 612.1656 ]],
      dtype=float32), array([[444.09   , 444.17694, 444.14548, 444.1135 , 444.1973 , 444.1137 ]],
      dtype=float32), array([[449.3945 , 449.5246 , 449.4692 , 449.53662, 449.46652, 449.43024]],
      dtype=float32), array([[455.59732, 455.6312 , 455.57968, 455.62546, 455.53506, 455.69598]],
      dtype=float32), array([[333.12387, 

EVALUATE

In [None]:
import numpy as np

# Ensure synthetic_packets and y are numpy arrays
synthetic_packets = np.array(synthetic_packets)
y = np.array(y)

# Check the shapes
print(f'synthetic_packets shape: {synthetic_packets.shape}')
print(f'y shape: {y.shape}')

# Reshape y to add a dimension
y = y.reshape(-1, 1)  # Adds a dimension of size 1 to the end of y

# Broadcast y to match the first two dimensions of synthetic_packets
y_expanded = np.broadcast_to(y, synthetic_packets.shape[:2])

# Now you can calculate RMSE
rmse = np.sqrt(np.mean((synthetic_packets[:,:,0] - y_expanded) ** 2)) # Calculate RMSE for the first feature
print(f'RMSE: {rmse:.4f}')

synthetic_packets shape: (100, 1, 6)
y shape: (100, 1)
RMSE: 502.7848


In [None]:
# Calculate RMSE between generated and real packets
rmse = np.sqrt(np.mean((synthetic_packets - y) ** 2))
print(f'RMSE: {rmse:.4f}')


RMSE: 495.6425


In [None]:
# Check and print shapes
print(f'X shape: {X.shape}')
print(f'synthetic_packets shape: {synthetic_packets.shape}')


X shape: (100, 6)
synthetic_packets shape: (100, 6)


In [None]:
dataset.dtypes

src_ip            int64
dst_ip            int64
src_port          int64
dst_port          int64
packet_length     int64
flags            object
timing_info      object
dtype: object

In [None]:
# Ensure synthetic_packets have the same number of features as X
# Flatten if needed
if len(synthetic_packets.shape) > 2:
    synthetic_packets = synthetic_packets.reshape(synthetic_packets.shape[0], -1)

# Ensure X and synthetic_packets have the same number of features
if X.shape[1] != synthetic_packets.shape[1]:
    raise ValueError(f"The number of features in X ({X.shape[1]}) and synthetic_packets ({synthetic_packets.shape[1]}) must be the same.")

# Concatenate real and synthetic data
combined_data = np.concatenate((X, synthetic_packets))

# Create labels: 1 for real packets, 0 for synthetic packets
labels = np.concatenate((np.ones(len(X)), np.zeros(len(synthetic_packets))))


In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Assuming X is your real packet data and synthetic_packets is generated by the generator

# Check and print shapes
print(f'X shape: {X.shape}')
print(f'synthetic_packets shape: {synthetic_packets.shape}')

# Flatten synthetic_packets if it has an extra dimension
if len(synthetic_packets.shape) > 2:
    synthetic_packets = synthetic_packets.reshape(synthetic_packets.shape[0], -1)

# Ensure X and synthetic_packets have the same number of features
if X.shape[1] != synthetic_packets.shape[1]:
    raise ValueError(f"The number of features in X ({X.shape[1]}) and synthetic_packets ({synthetic_packets.shape[1]}) must be the same.")

# Concatenate real and synthetic data
combined_data = np.concatenate((X, synthetic_packets))

# Create labels: 1 for real packets, 0 for synthetic packets
labels = np.concatenate((np.ones(len(X)), np.zeros(len(synthetic_packets))))

# Check if combined_data and labels are numpy arrays, convert if not
if not isinstance(combined_data, np.ndarray):
    combined_data = np.array(combined_data, dtype=np.float32)

if not isinstance(labels, np.ndarray):
    labels = np.array(labels, dtype=np.float32)


X shape: (100, 6)
synthetic_packets shape: (100, 6)


In [None]:
# Display types and shapes
print(f'combined_data type: {type(combined_data)}, shape: {combined_data.shape}')
print(f'labels type: {type(labels)}, shape: {labels.shape}')

combined_data type: <class 'numpy.ndarray'>, shape: (200, 6)
labels type: <class 'numpy.ndarray'>, shape: (200,)


In [None]:
# Iterate over combined_data to check for non-numeric elements
for i, row in enumerate(combined_data):
    for j, element in enumerate(row):
        if not isinstance(element, (int, float)):
            print(f"Non-numeric element found at index ({i}, {j}): {element}")
            # Replace or handle the non-numeric element as needed
            # For example, you could replace it with a default value:
            combined_data[i, j] = 0  # Replace with a suitable default value

# Train a classifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(combined_data, labels)

Non-numeric element found at index (0, 4): (1, 1, 1, 1, 0, 0)
Non-numeric element found at index (0, 5): (0.32626963264937237, 0.31654255989247604, 0.44687696394619913)
Non-numeric element found at index (1, 4): (1, 0, 0, 1, 0, 0)
Non-numeric element found at index (1, 5): (0.43307744910126844, 0.3573468796779544, 0.9149707703156186)
Non-numeric element found at index (2, 4): (0, 1, 0, 1, 1, 0)
Non-numeric element found at index (2, 5): (0.7317441854328928, 0.7275469913315297, 0.2899134495919554)
Non-numeric element found at index (3, 4): (1, 1, 0, 0, 0, 0)
Non-numeric element found at index (3, 5): (0.5777094243168404, 0.779179433301834, 0.7955903685432131)
Non-numeric element found at index (4, 4): (0, 1, 1, 1, 0, 1)
Non-numeric element found at index (4, 5): (0.34453046075431226, 0.7708727565686478, 0.735893896807733)
Non-numeric element found at index (5, 4): (1, 0, 1, 0, 0, 0)
Non-numeric element found at index (5, 5): (0.14150648562190027, 0.8659454685664772, 0.4413214701804108)


In [None]:

# Train a classifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(combined_data, labels)

print("Classifier trained successfully.")


Classifier trained successfully.


In [None]:

# Train a classifier to distinguish between real and synthetic packets
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Iterate over both X and synthetic_packets to check for non-numeric elements
for arr in [X, synthetic_packets]:
    for i, row in enumerate(arr):
        for j, element in enumerate(row):
            if not isinstance(element, (int, float)):
                print(f"Non-numeric element found in array at index ({i}, {j}): {element}")
                # Replace or handle the non-numeric element as needed
                # For example, you could replace it with a default value:
                arr[i, j] = 0  # Replace with a suitable default value

# Train a classifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(np.concatenate((X, synthetic_packets)), np.concatenate((np.ones(len(X)), np.zeros(len(synthetic_packets)))))

Non-numeric element found in array at index (0, 4): (1, 1, 1, 1, 0, 0)
Non-numeric element found in array at index (0, 5): (0.32626963264937237, 0.31654255989247604, 0.44687696394619913)
Non-numeric element found in array at index (1, 4): (1, 0, 0, 1, 0, 0)
Non-numeric element found in array at index (1, 5): (0.43307744910126844, 0.3573468796779544, 0.9149707703156186)
Non-numeric element found in array at index (2, 4): (0, 1, 0, 1, 1, 0)
Non-numeric element found in array at index (2, 5): (0.7317441854328928, 0.7275469913315297, 0.2899134495919554)
Non-numeric element found in array at index (3, 4): (1, 1, 0, 0, 0, 0)
Non-numeric element found in array at index (3, 5): (0.5777094243168404, 0.779179433301834, 0.7955903685432131)
Non-numeric element found in array at index (4, 4): (0, 1, 1, 1, 0, 1)
Non-numeric element found in array at index (4, 5): (0.34453046075431226, 0.7708727565686478, 0.735893896807733)
Non-numeric element found in array at index (5, 4): (1, 0, 1, 0, 0, 0)
Non-nu

In [None]:

# Evaluate the classifier
accuracy = clf.score(np.concatenate((X, synthetic_packets)), np.concatenate((np.ones(len(X)), np.zeros(len(synthetic_packets)))))
print(f'Classifier Accuracy: {accuracy:.4f}')

Classifier Accuracy: 1.0000


OUTPUT DISPLAY

In [None]:
# Output the synthetic TCP packets as a list of dictionaries
synthetic_packets_dict = []
for packet in synthetic_packets:
    packet_dict = {
        'src_ip': le.inverse_transform([int(packet[0])]),  # Convert packet[0] to integer
        'dst_ip': le.inverse_transform([int(packet[1])]),  # Convert packet[1] to integer
        'src_port': int(packet[2]),
        'dst_port': int(packet[3]),
        'packet_length': int(packet[4]),
        'flags': int(packet[5]),
        'timing_info': packet[6:]
    }
    synthetic_packets_dict.append(packet_dict) # Append the packet dictionary to the list

In [None]:
print("Synthetic TCP Packets:")
for packet in synthetic_packets:
    print(packet)

Synthetic TCP Packets:
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0

In [None]:
from tabulate import tabulate

print(tabulate(synthetic_packets, headers="keys", tablefmt="psql"))

+-----+-----+-----+-----+-----+-----+
|   0 |   1 |   2 |   3 |   4 |   5 |
|-----+-----+-----+-----+-----+-----|
|   0 |   0 |   0 |   0 |   0 |   0 |
|   0 |   0 |   0 |   0 |   0 |   0 |
|   0 |   0 |   0 |   0 |   0 |   0 |
|   0 |   0 |   0 |   0 |   0 |   0 |
|   0 |   0 |   0 |   0 |   0 |   0 |
|   0 |   0 |   0 |   0 |   0 |   0 |
|   0 |   0 |   0 |   0 |   0 |   0 |
|   0 |   0 |   0 |   0 |   0 |   0 |
|   0 |   0 |   0 |   0 |   0 |   0 |
|   0 |   0 |   0 |   0 |   0 |   0 |
|   0 |   0 |   0 |   0 |   0 |   0 |
|   0 |   0 |   0 |   0 |   0 |   0 |
|   0 |   0 |   0 |   0 |   0 |   0 |
|   0 |   0 |   0 |   0 |   0 |   0 |
|   0 |   0 |   0 |   0 |   0 |   0 |
|   0 |   0 |   0 |   0 |   0 |   0 |
|   0 |   0 |   0 |   0 |   0 |   0 |
|   0 |   0 |   0 |   0 |   0 |   0 |
|   0 |   0 |   0 |   0 |   0 |   0 |
|   0 |   0 |   0 |   0 |   0 |   0 |
|   0 |   0 |   0 |   0 |   0 |   0 |
|   0 |   0 |   0 |   0 |   0 |   0 |
|   0 |   0 |   0 |   0 |   0 |   0 |
|   0 |   0 