In [1]:
from keras.models import Sequential
from keras.layers import Dense, LeakyReLU, Dropout
from keras.optimizers import Adam
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
import tensorflow as tf
from keras import layers, models
from scipy.sparse import issparse
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import netron
import matplotlib.pyplot as plt
from sklearn.pipeline import FunctionTransformer






In [2]:

gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus: 
    tf.config.experimental.set_memory_growth(gpu, True)

In [3]:
df = pd.read_csv('netflow_with_geolocation.csv')

In [4]:
df = df.drop(['FLOW_ID','FLOW_DURATION_MILLISECONDS', 'LAST_SWITCHED', 'TCP_WIN_MAX_IN' ,'TCP_WIN_MAX_OUT' ,'TCP_WIN_MIN_IN' ,'TCP_WIN_MIN_OUT' ,'TCP_WIN_MSS_IN' ,'TCP_WIN_SCALE_IN', 'FIRST_SWITCHED' ,'TCP_WIN_SCALE_OUT' ,'SRC_TOS' ,'DST_TOS' ,'TOTAL_FLOWS_EXP' ,'MIN_IP_PKT_LEN' ,'MAX_IP_PKT_LEN' ,'TOTAL_PKTS_EXP' ,'TOTAL_BYTES_EXP','ID'], axis=1)

df.loc[df['src_country'] == 'ZZ', 'src_country'] = 'Private'
df.loc[df['dst_country'] == 'ZZ', 'dst_country'] = 'Private'

# Change city and region to 'Private' where country is now 'Private'
df.loc[df['src_country'] == 'Private', ['src_city', 'src_region']] = 'Private'
df.loc[df['dst_country'] == 'Private', ['dst_city', 'dst_region']] = 'Private'

# Set latitude and longitude to 0 where country is 'Private'
df.loc[df['src_country'] == 'Private', ['src_latitude', 'src_longitude']] = 0
df.loc[df['dst_country'] == 'Private', ['dst_latitude', 'dst_longitude']] = 0

In [5]:
df.head()

Unnamed: 0,PROTOCOL_MAP,L4_SRC_PORT,IPV4_SRC_ADDR,L4_DST_PORT,IPV4_DST_ADDR,PROTOCOL,TCP_FLAGS,IN_BYTES,IN_PKTS,OUT_BYTES,...,src_latitude,src_longitude,src_city,src_region,src_country,dst_latitude,dst_longitude,dst_city,dst_region,dst_country
0,udp,53950,10.114.232.40,53,10.114.226.5,17,0,165,2,275,...,0.0,0.0,Private,Private,Private,0.0,0.0,Private,Private,Private
1,tcp,37914,10.114.241.166,38303,10.114.224.218,6,22,44,1,40,...,0.0,0.0,Private,Private,Private,0.0,0.0,Private,Private,Private
2,tcp,33216,10.114.241.166,18757,10.114.224.116,6,22,44,1,40,...,0.0,0.0,Private,Private,Private,0.0,0.0,Private,Private,Private
3,udp,48627,10.114.225.205,53,10.114.226.5,17,0,128,2,160,...,0.0,0.0,Private,Private,Private,0.0,0.0,Private,Private,Private
4,udp,35939,10.114.225.205,53,10.114.226.5,17,0,172,2,300,...,0.0,0.0,Private,Private,Private,0.0,0.0,Private,Private,Private


In [6]:
missing_rows = df[df[['src_latitude', 'src_longitude', 'src_region', 'dst_region', 'dst_latitude', 'dst_longitude', 'dst_country']].isna().any(axis=1)]
df = df.dropna(subset=['src_latitude', 'src_longitude', 'dst_latitude', 'dst_longitude', 'dst_country', 'src_region', 'dst_region'])
print(df.isna().sum())
print(len(df))

PROTOCOL_MAP               0
L4_SRC_PORT                0
IPV4_SRC_ADDR              0
L4_DST_PORT                0
IPV4_DST_ADDR              0
PROTOCOL                   0
TCP_FLAGS                  0
IN_BYTES                   0
IN_PKTS                    0
OUT_BYTES                  0
OUT_PKTS                   0
ANALYSIS_TIMESTAMP         0
ANOMALY                66879
ALERT                 128357
src_latitude               0
src_longitude              0
src_city                   0
src_region                 0
src_country                0
dst_latitude               0
dst_longitude              0
dst_city                   0
dst_region                 0
dst_country                0
dtype: int64
148071


In [7]:
df['ANOMALY'] = pd.to_numeric(df['ANOMALY'], errors='coerce').fillna(0)
df['ALERT'] = pd.to_numeric(df['ALERT'], errors='coerce').fillna(0)


df['src_hierarchy'] = df['src_country'] + '>' + df['src_region'] + '>' + df['src_city']
df['dst_hierarchy'] = df['dst_country'] + '>' + df['dst_region'] + '>' + df['dst_city']


binary_cols = ['ANOMALY', 'ALERT']
count_cols = ['IN_BYTES', 'IN_PKTS', 'OUT_BYTES', 'OUT_PKTS']
categorical_cols = ['PROTOCOL_MAP', 'IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'PROTOCOL', 'TCP_FLAGS', 'src_hierarchy', 'dst_hierarchy']
numerical_cols = ['L4_SRC_PORT', 'L4_DST_PORT', 'src_latitude', 'src_longitude', 'dst_latitude', 'dst_longitude']

# Setup preprocessing for numerical and categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('binary', OneHotEncoder(handle_unknown='ignore'), binary_cols),
        ('count', FunctionTransformer(np.log1p, validate=True), count_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', StandardScaler(), numerical_cols)        
    ])

# Create a preprocessing and training pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

df_preprocessed = pipeline.fit_transform(df)



In [8]:
# Making dataset smaller
size = 50000
df_preprocessed = df_preprocessed[:size]

In [9]:
if issparse(df_preprocessed):
    print("Data is sparsed")
    df_preprocessed = df_preprocessed.toarray()


Data is sparsed


In [10]:
def build_generator(latent_dim, data_shape):
    model = models.Sequential([
        layers.Dense(1024, activation="relu", input_dim=latent_dim),
        layers.BatchNormalization(),
        layers.Dense(512, activation="relu"),
        layers.BatchNormalization(),        
        layers.Dense(256, activation="relu"),
        layers.BatchNormalization(),        
        layers.Dense(128, activation="softmax"),
        layers.BatchNormalization(),                
        layers.Dense(np.prod(data_shape), activation="tanh"),
        layers.Reshape(data_shape)
    ])
    return model

def build_discriminator(data_shape):
    model = models.Sequential([
        layers.Flatten(input_shape=data_shape),
        layers.Dense(512, activation="relu"),
        layers.Dense(256, activation="relu"),
        layers.Dense(64, activation="softmax"),
        layers.Dense(1, activation="sigmoid")
    ])
    return model

def compile_gan(generator, discriminator):
    discriminator.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    discriminator.trainable = False  # Freeze the discriminator during generator training
    
    gan_input = layers.Input(shape=(latent_dim,))
    gan_output = discriminator(generator(gan_input))
    gan = models.Model(gan_input, gan_output)
    
    gan.compile(loss='binary_crossentropy', optimizer='adam')
    return gan



In [11]:
def train_gan(generator, discriminator, gan, sparse_data, epochs, batch_size, latent_dim):
    gen_loss = []
    disc_loss = []
    for epoch in range(epochs):
        num_batches = int(np.ceil(sparse_data.shape[0] / batch_size))  # Calculate the number of batches based on the shape of sparse_data
        
        for batch_index in range(num_batches):
            start_idx = batch_index * batch_size
            end_idx = min((batch_index + 1) * batch_size, sparse_data.shape[0])
            actual_batch_size = end_idx - start_idx
            
            # Convert the current batch of sparse data to dense
            batch_data = sparse_data[start_idx:end_idx]
            
            # Generate noise
            noise = np.random.normal(0, 1, (actual_batch_size, latent_dim))
            
            # Generate fake data
            fake_data = generator.predict(noise)
            
            # Labels for real and fake data
            real_labels = np.ones((actual_batch_size, 1))
            fake_labels = np.zeros((actual_batch_size, 1))
            
            # Train the discriminator
            d_loss_real = discriminator.train_on_batch(batch_data, real_labels)[0]  # Assume first element is loss
            d_loss_fake = discriminator.train_on_batch(fake_data, fake_labels)[0]  # Assume first element is loss

            # Train the generator
            g_loss = gan.train_on_batch(noise, real_labels)
            
        # Record the loss values
        gen_loss.append(g_loss)
        disc_loss.append((d_loss_real + d_loss_fake) / 2)

        # Print progress        
        print(f"Epoch: {epoch+1}, D loss: {(d_loss_real + d_loss_fake) / 2}, G loss: {g_loss}")
    return gen_loss, disc_loss


In [12]:
latent_dim = 750  # Dimensionality of the noise input
data_shape = df_preprocessed.shape[1:]  # Assuming df_preprocessed is your preprocessed data reshaped appropriately

generator = build_generator(latent_dim, data_shape)
discriminator = build_discriminator(data_shape)
gan = compile_gan(generator, discriminator)

gen_loss, disc_loss = train_gan(generator, discriminator, gan, df_preprocessed, epochs=20, batch_size=64, latent_dim=latent_dim)





Epoch: 1, D loss: 0.2469882220029831, G loss: 1.5232793092727661
Epoch: 2, D loss: 0.12265154719352722, G loss: 2.161940097808838
Epoch: 3, D loss: 0.0648636594414711, G loss: 2.769906520843506
Epoch: 4, D loss: 0.035405270755290985, G loss: 3.36033034324646
Epoch: 5, D loss: 0.01962930429726839, G loss: 3.9420506954193115
Epoch: 6, D loss: 0.010967957321554422, G loss: 4.519583702087402
Epoch: 7, D loss: 0.006152870832011104, G loss: 5.095093250274658
Epoch: 8, D loss: 0.0034589306451380253, G loss: 5.669601917266846
Epoch: 9, D loss: 0.0019467147067189217, G loss: 6.243557453155518
Epoch: 10, D loss: 0.0010962782544083893, G loss: 6.817266464233398
Epoch: 11, D loss: 0.0006175936723593622, G loss: 7.3908305168151855
Epoch: 12, D loss: 0.0003480003651930019, G loss: 7.9642744064331055
Epoch: 13, D loss: 0.00019611755124060437, G loss: 8.537610054016113


KeyboardInterrupt: 

In [None]:
# Save the generator model to disk
generator.save('generator.h5')

# Save the discriminator model to disk
discriminator.save('discriminator.h5')

: 

In [None]:
# Plot the loss values
plt.plot(gen_loss, label='Generator loss')
plt.plot(disc_loss, label='Discriminator loss')
plt.title('GAN Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

: 

In [None]:
'''Model Visualizer'''
# Start the Netron model viewer for the generator model
# netron.start('generator.h5')

# Start the Netron model viewer for the discriminator model
# netron.start('discriminator.h5')

: 

In [None]:
def generate_data(generator, num_samples, latent_dim):
    # Generate noise
    noise = np.random.normal(0, 1, (num_samples, latent_dim))
    # Generate data
    generated_data = generator.predict(noise)
    return generated_data

: 

In [None]:
def convert_generated_data_to_text(generated_data, preprocessor):
    # Retrieve the transformers from the preprocessing pipeline
    binary_transformer = preprocessor.named_transformers_['binary']
    count_transformer = preprocessor.named_transformers_['count']
    categorical_transformer = preprocessor.named_transformers_['cat']
    standard_transformer = preprocessor.named_transformers_['num']

    # Calculate the number of features for each transformer's output
    num_binary_features = len(binary_transformer.get_feature_names_out())
    num_count_features = 4 # If FunctionTransformer, this may need to be manually set
    num_categorical_features = sum(len(cat) for cat in categorical_transformer.categories_)
    num_standard_features = len(standard_transformer.feature_names_in_)

    # Calculate the indices where each feature type starts and ends
    end_binary = num_binary_features
    start_count = end_binary
    end_count = start_count + num_count_features
    start_cat = end_count
    end_cat = start_cat + num_categorical_features
    start_standard = end_cat
    end_standard = start_standard + num_standard_features

    # Split the generated data back into its binary, count, categorical, and standard components
    binary_data = generated_data[:, :end_binary]
    count_data = generated_data[:, start_count:end_count]
    categorical_data = generated_data[:, start_cat:end_cat]
    standard_data = generated_data[:, start_standard:end_standard]

    # Inverse transform the binary data
    binary_data_inverse = binary_transformer.inverse_transform(binary_data)

    # Inverse transform the count data
    # Assuming count_transformer is a FunctionTransformer with np.log1p, inverse using np.expm1
    count_data_inverse = np.expm1(count_data)

    # Inverse transform the categorical data
    categorical_data_inverse = categorical_transformer.inverse_transform(categorical_data)

    # Inverse transform the standard data
    standard_data_inverse = standard_transformer.inverse_transform(standard_data)

    # Combine binary, count, categorical, and standard data
    combined_data = np.hstack((binary_data_inverse, count_data_inverse, categorical_data_inverse, standard_data_inverse))

    return combined_data


: 

In [None]:
# Example usage
num_samples = 1000  # Number of samples you want to generate
generated_data = generate_data(generator, num_samples, latent_dim)
generated_data[0]

: 

In [None]:
num_samples = 2000  # Number of samples you want to generate
generated_data = generate_data(generator, num_samples, latent_dim)

: 

In [None]:

# Assuming 'preprocessor' is your ColumnTransformer instance
combined_data = convert_generated_data_to_text(generated_data, preprocessor)

# Convert combined_data to DataFrame for easier viewing/manipulation
df_synthetic = pd.DataFrame(combined_data, columns=numerical_cols + categorical_cols + count_cols + binary_cols)

df_synthetic = df_synthetic[['PROTOCOL_MAP', 'L4_SRC_PORT', 'IPV4_SRC_ADDR', 'L4_DST_PORT', 'IPV4_DST_ADDR', 'PROTOCOL', 'TCP_FLAGS', 'IN_BYTES', 'IN_PKTS', 'OUT_BYTES', 'OUT_PKTS', 'ANOMALY', 'ALERT', 'src_hierarchy', 'dst_hierarchy']]

# Convert to numeric, errors='coerce' will set non-numeric values to NaN
df_synthetic['L4_SRC_PORT'] = pd.to_numeric(df_synthetic['L4_SRC_PORT'], errors='coerce')
df_synthetic['L4_DST_PORT'] = pd.to_numeric(df_synthetic['L4_DST_PORT'], errors='coerce')
df_synthetic['PROTOCOL'] = pd.to_numeric(df_synthetic['PROTOCOL'], errors='coerce')
df_synthetic['TCP_FLAGS'] = pd.to_numeric(df_synthetic['TCP_FLAGS'], errors='coerce')
df_synthetic['IN_BYTES'] = pd.to_numeric(df_synthetic['IN_BYTES'], errors='coerce')
df_synthetic['IN_PKTS'] = pd.to_numeric(df_synthetic['IN_PKTS'], errors='coerce')
df_synthetic['OUT_BYTES'] = pd.to_numeric(df_synthetic['OUT_BYTES'], errors='coerce')
df_synthetic['OUT_PKTS'] = pd.to_numeric(df_synthetic['OUT_PKTS'], errors='coerce')
df_synthetic['ANOMALY'] = pd.to_numeric(df_synthetic['ANOMALY'], errors='coerce')
df_synthetic['ALERT'] = pd.to_numeric(df_synthetic['ALERT'], errors='coerce')



# Apply rounding and convert to integer
df_synthetic['L4_SRC_PORT'] = df_synthetic['L4_SRC_PORT'].round().astype(int)
df_synthetic['L4_DST_PORT'] = df_synthetic['L4_DST_PORT'].round().astype(int)
df_synthetic['PROTOCOL'] = df_synthetic['PROTOCOL'].round().astype(int)
df_synthetic['TCP_FLAGS'] = df_synthetic['TCP_FLAGS'].round().astype(int)
df_synthetic['IN_BYTES'] = df_synthetic['IN_BYTES'].round().astype(int)
df_synthetic['IN_PKTS'] = df_synthetic['IN_PKTS'].round().astype(int)
df_synthetic['OUT_BYTES'] = df_synthetic['OUT_BYTES'].round().astype(int)
df_synthetic['OUT_PKTS'] = df_synthetic['OUT_PKTS'].round().astype(int)
df_synthetic['ANOMALY'] = df_synthetic['ANOMALY'].round().astype(int)
df_synthetic['ALERT'] = df_synthetic['ALERT'].round().astype(int)


print(df_synthetic.head(1))

: 

In [None]:
df_synthetic.to_csv('synthetic_dataset_generated.csv', index=False)

: 