- This notebook builds one combined training set and three per-sensor test sets from the data of all sensors.
- The training set is the accumulation of a 70% split taken from each sensor.
- Each test set consists of the remaining 30% of one sensor.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import os
import numpy as np
import gc

print("Starting data processing and encoding script...")

# Directory that receives the final CSV files; it is created on first use
output_dir = 'final_dataset'
if os.path.exists(output_dir) is False:
    os.makedirs(output_dir)
    print(f"Created output directory: {output_dir}")

# Sensor key -> path of that sensor's .binetflow file
dataset_files = dict(zip(
    ('1', '2', '3'),
    (
        'sensor1/sensor1.binetflow',
        'sensor2/sensor2.binetflow',
        'sensor3/sensor3.binetflow',
    ),
))

# Accumulators filled by the loading cell:
#   df_list           - one dataframe per sensor that loaded successfully
#   loaded_keys       - sensor keys in the same order as df_list
#   all_original_data - frames used for fitting the encoders
df_list, loaded_keys, all_original_data = [], [], []

Starting data processing and encoding script...
Created output directory: final_dataset


In [2]:
print("Loading all raw sensor data for encoder fitting...")

# 'sTos' and 'dTos' are read as text and converted by hand below. The mapping
# is identical for every file, so it is built once outside the loop.
dtype_spec = {'sTos': str, 'dTos': str}

for key, path in dataset_files.items():
    try:
        df = pd.read_csv(path, dtype=dtype_spec)

        # Manual handling for sTos and dTos columns: anything that does not
        # parse as a number (including missing values) becomes 0
        if 'dTos' in df.columns:
            df['dTos'] = pd.to_numeric(df['dTos'], errors='coerce').fillna(0)
        if 'sTos' in df.columns:
            df['sTos'] = pd.to_numeric(df['sTos'], errors='coerce').fillna(0)

        df_list.append(df)
        # The encoder-fitting cell only reads these frames through pd.concat,
        # which builds a new frame, so the same object is shared here instead
        # of duplicating every sensor file in memory with df.copy().
        all_original_data.append(df)
        loaded_keys.append(key)
        print(f"Successfully loaded: {path} (key: {key}, shape: {df.shape})")

    except FileNotFoundError:
        print(f"Warning: Dataset file not found at {path}. Skipping.")
    except Exception as e:
        # Best effort: one unreadable file must not abort the other sensors
        print(f"Warning: Could not load {path}. Error: {e}. Skipping.")

# Stop the notebook if no data was successfully loaded; every later cell
# needs at least one dataframe, so fail here with a clear message.
if not df_list:
    print("Error: No .binetflow datasets could be loaded. Stopping script.")
    raise RuntimeError("No .binetflow datasets could be loaded")

print(f"Total {len(df_list)} datasets successfully loaded.")

# `df` still points at the last frame that was read; drop the name so that
# frame is only referenced through df_list.
del dtype_spec, dataset_files, path, key, df
gc.collect()

Loading all raw sensor data for encoder fitting...
Successfully loaded: sensor1/sensor1.binetflow (key: 1, shape: (4895158, 18))
Successfully loaded: sensor2/sensor2.binetflow (key: 2, shape: (5998133, 18))
Successfully loaded: sensor3/sensor3.binetflow (key: 3, shape: (3885792, 18))
Total 3 datasets successfully loaded.


9

In [3]:
print("Fitting LabelEncoders on combined raw sensor data...")

categorical_cols = ['SrcAddr', 'Sport', 'Dir', 'DstAddr', 'Dport', 'State']
encoders = {}           # column name -> fitted LabelEncoder
imputation_values = {}  # column name -> encoded ID used for 'nan' / unknown values

# Only the categorical columns are needed to fit the encoders, so just those
# are concatenated instead of duplicating every column of every sensor frame.
# A column missing from one frame is filled with NaN by pd.concat, exactly as
# it would be when concatenating the complete frames.
combined_fit_df = pd.concat(
    [
        frame[[name for name in categorical_cols if name in frame.columns]]
        for frame in all_original_data
    ],
    ignore_index=True,
)

# Delete the raw data list to save memory
del all_original_data
gc.collect()

for col in categorical_cols:
    if col not in combined_fit_df.columns:
        print(f"Warning: Column '{col}' not found. Skipping encoding for it.")
        continue

    print(f"Fitting LabelEncoder for: {col}")
    le = LabelEncoder()
    # Missing values become the literal string 'nan' so they get their own
    # class; fitting on the unique values is enough to define the classes.
    le.fit(combined_fit_df[col].fillna('nan').astype(str).unique())
    encoders[col] = le

    # Save the encoding value for 'nan'
    if 'nan' in le.classes_:
        imputation_values[col] = int(le.transform(['nan'])[0])
        print(f"  Found NaN, its encoded ID is: {imputation_values[col]}")
    else:
        # If 'nan' was not present during fitting, use 0 as a fallback.
        # NOTE(review): 0 is also the ID of the first fitted class, so an
        # unknown value imputed with this fallback is indistinguishable from it.
        imputation_values[col] = 0
        print(f"  No NaN found, using fallback ID: {imputation_values[col]}")

print("All encoders are ready.")

# Delete the large combined dataframe
del combined_fit_df
gc.collect()

Fitting LabelEncoders on combined raw sensor data...
Fitting LabelEncoder for: SrcAddr
  No NaN found, using fallback ID: 0
Fitting LabelEncoder for: Sport
  Found NaN, its encoded ID is: 74919
Fitting LabelEncoder for: Dir
  No NaN found, using fallback ID: 0
Fitting LabelEncoder for: DstAddr
  No NaN found, using fallback ID: 0
Fitting LabelEncoder for: Dport
  Found NaN, its encoded ID is: 92637
Fitting LabelEncoder for: State
  Found NaN, its encoded ID is: 405
All encoders are ready.


0

To keep the data simple (following my paper), this code removes unnecessary features such as:
- dTos
- sTos
- ActivityLabel (only in NCC-2)
- BotnetName (only in NCC-2)
- SensorId (only in NCC-2)
- StartTime

In [4]:
# 1. Define the label categorization function
def categorize_label(label):
    """Collapse a raw flow label into 'botnet_spam', 'botnet' or 'normal'.

    The label is converted to a string and lower-cased before matching, so
    the check is case-insensitive and non-string inputs are accepted.

    Returns:
        'botnet_spam' if the text contains both 'botnet' and 'spam',
        'botnet' if it contains 'botnet' only,
        'normal' for everything else (background, normal and any other label).
    """
    text = str(label).lower()
    if 'botnet' not in text:
        # Background, normal and unrecognised labels are all treated as normal
        return 'normal'
    return 'botnet_spam' if 'spam' in text else 'botnet'

# 2. Apply the function to all dataframes in df_list
#    (the 'Label' column of each frame is overwritten in place)
for i in range(len(df_list)):
    df_list[i]['Label'] = df_list[i]['Label'].apply(categorize_label)
print("Label simplification complete.")

# 3. Drop unnecessary columns
columns_to_drop = ['dTos', 'sTos', 'ActivityLabel', 'BotnetName', 'SensorId', 'StartTime']
for i in range(len(df_list)):
    # errors='ignore' already skips columns a given frame does not have, so
    # no per-frame filtering of the list is needed.
    df_list[i] = df_list[i].drop(columns=columns_to_drop, errors='ignore')
print(f"Unnecessary columns have been dropped.")

# Only names created in this cell are removed. Comprehension variables never
# leak into the notebook namespace, so this cell defines no `col` of its own.
del columns_to_drop, i
gc.collect()

Label simplification complete.
Unnecessary columns have been dropped.


0

In [5]:
def _split_by_file(frames):
    """Split every frame 70/30 and return ``(train_parts, test_parts)``.

    Both returned lists have one entry per input frame, in the same order.
    An empty input frame contributes an empty DataFrame to both lists so
    positions keep lining up with the sensor order.
    The fixed random_state makes each split reproducible.
    """
    train_parts, test_parts = [], []
    for frame in frames:
        if frame.empty:
            train_part, test_part = pd.DataFrame(), pd.DataFrame()
        else:
            train_part, test_part = train_test_split(frame, test_size=0.3, random_state=42)
        train_parts.append(train_part)
        test_parts.append(test_part)
    return train_parts, test_parts


# 1. Separate each dataframe by label (one list entry per sensor file)
normal_df, botnet_df, botnet_spam_df = [], [], []
for df in df_list:
    normal_df.append(df[df['Label'] == 'normal'])
    botnet_df.append(df[df['Label'] == 'botnet'])
    botnet_spam_df.append(df[df['Label'] == 'botnet_spam'])

# df_list is no longer needed; the loop variable is dropped as well because
# it would otherwise keep the last full sensor frame alive.
del df_list, df
gc.collect()

# 2. Perform train_test_split on each category from each file.
#    Splitting each label separately keeps the 70/30 ratio within every class.
normal_train, normal_test = _split_by_file(normal_df)
botnet_train, botnet_test = _split_by_file(botnet_df)
botnet_spam_train, botnet_spam_test = _split_by_file(botnet_spam_df)

print("Per-file stratified train-test split process complete.")

# Delete intermediate dataframes
del normal_df, botnet_df, botnet_spam_df
gc.collect()

Per-file stratified train-test split process complete.


0

In [6]:
# Number of successfully loaded sensor files (also read by the test-set cell)
num_files = len(loaded_keys)

# Collect the non-empty training partitions file by file, in the order
# normal -> botnet -> botnet_spam. Concatenating this flat list once gives the
# same row order as concatenating per file first and then across files, but
# avoids an intermediate copy of the training data.
temp_train_df = [
    parts[i]
    for i in range(num_files)
    for parts in (normal_train, botnet_train, botnet_spam_train)
    if i < len(parts) and not parts[i].empty
]

del normal_train, botnet_train, botnet_spam_train
gc.collect()

if not temp_train_df:
    # Later cells require train_df, so stop here with a clear message
    print("Error: No training data was generated.")
    raise RuntimeError("No training data was generated")

# Combine all training data from all files into one
train_df = pd.concat(temp_train_df, ignore_index=True)
del temp_train_df
gc.collect()

# Shuffle the training data (fixed seed keeps the order reproducible)
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
print(f"Combined training data created (shape: {train_df.shape})")
# You can add train_df.head() here

Combined training data created (shape: (10345357, 12))


In [7]:
test_dfs = {}  # sensor key -> shuffled test set of that sensor

# Build one test set per file from its normal / botnet / botnet_spam parts
for i in range(num_files):
    file_dfs = [
        parts[i]
        for parts in (normal_test, botnet_test, botnet_spam_test)
        if i < len(parts) and not parts[i].empty
    ]
    if not file_dfs:
        continue

    df = pd.concat(file_dfs, ignore_index=True)
    # Shuffle the test data with a fixed seed
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    # Store with the sensor key (e.g., '1', '2')
    test_dfs[loaded_keys[i]] = df

del normal_test, botnet_test, botnet_spam_test
gc.collect()

print(f"Combined test data created (number of test sets: {len(test_dfs)})")

# Remove the loop temporaries and the file counter
del i, file_dfs, df, num_files
gc.collect()

Combined test data created (number of test sets: 3)


0

In [8]:
def _transform_partition(frame, column_order, cat_cols, lookups, fill_ids):
    """Return the encoded, fixed-layout version of one train/test partition.

    Args:
        frame: partition to transform; it is not modified.
        column_order: final column names, in output order.
        cat_cols: names of the categorical columns to encode.
        lookups: column name -> Series mapping each fitted class to its ID.
        fill_ids: column name -> ID used for values unknown to the encoder.

    Returns:
        A new DataFrame with exactly ``column_order`` as columns, 'Dur' as
        float64, 'Label' as object and every other column as int64.
    """
    print(f"Processing DataFrame partition (Initial shape: {frame.shape})")

    # 1. One-Hot Encoding for 'Proto' (get_dummies returns a new frame, so
    #    the input only needs an explicit copy when 'Proto' is absent)
    if 'Proto' in frame.columns:
        source_columns = set(frame.columns)
        work = pd.get_dummies(frame, columns=['Proto'], prefix='', prefix_sep='')
        print(f"  Shape after get_dummies: {work.shape}")
        # Dummies outside the target list are not kept in the output; report
        # them so rows that end up with all-zero protocol flags are explainable.
        dropped = [str(name) for name in work.columns
                   if name not in source_columns and name not in column_order]
        if dropped:
            print(f"  Warning: protocol columns not in the target list are dropped: {sorted(dropped)}")
    else:
        work = frame.copy()

    # 2. Apply the fitted encoders through a value -> ID lookup
    for col in cat_cols:
        fill_id = fill_ids.get(col, 0)
        if col in lookups:
            as_text = work[col].fillna('nan').astype(str)
            # Values unknown to the encoder map to NaN and receive fill_id
            work[col] = as_text.map(lookups[col]).fillna(fill_id).astype(np.int64)
        elif col in work.columns:
            # Column exists but has no encoder: fill with the default ID
            work[col] = fill_id

    # 3 + 4. Add missing columns as 0, fix the order, keep only required ones
    result = work.reindex(columns=column_order, fill_value=0)

    # 5. Apply target data types
    print(f"  Applying target data types...")
    for col in result.columns:
        if col == 'Label':
            result[col] = result[col].astype(object)  # Label remains a string
            continue
        numeric = pd.to_numeric(result[col], errors='coerce').fillna(0)
        # 'Dur' stays floating point; encoded IDs, counters and dummies are ints
        result[col] = numeric.astype(np.float64 if col == 'Dur' else np.int64)

    print(f"  Shape after finalizing columns and types: {result.shape}")
    return result


# Combine all dataframes (train + tests) to be processed together
all_dfs_to_transform = [train_df] + list(test_dfs.values())
del train_df # Save memory, train_df is already in the list
gc.collect()

final_dfs_list = [] # Transformed frames: index 0 is train, then the test sets

# Define the target columns
protocol_cols_from_target = [
    'arp', 'esp', 'gre', 'icmp', 'igmp', 'ipv6', 'ipv6-icmp', 'ipx/spx', 'llc',
    'pim', 'rarp', 'rsvp', 'rtcp', 'rtp', 'tcp', 'udp', 'udt', 'unas', 'ipnip'
]
core_cols = ['Dur', 'SrcAddr', 'Sport', 'Dir', 'DstAddr', 'Dport', 'State',
             'TotPkts', 'TotBytes', 'SrcBytes', 'Label']
final_column_order = core_cols + protocol_cols_from_target

# LabelEncoder.transform returns the position of a value in the sorted
# `classes_` array, so a Series indexed by the classes gives the same IDs
# and is built only once per encoder.
label_to_id = {
    name: pd.Series(np.arange(len(enc.classes_), dtype=np.int64), index=enc.classes_)
    for name, enc in encoders.items()
}

print("Applying transformations (Proto dummies, Encoders) to all datasets...")

# Pop each partition off the list so the list stops referencing it once it has
# been transformed; for the training frame this was the last reference.
while all_dfs_to_transform:
    final_dfs_list.append(
        _transform_partition(
            all_dfs_to_transform.pop(0),
            final_column_order,
            categorical_cols,
            label_to_id,
            imputation_values,
        )
    )
    gc.collect()

print("All transformations complete.")

del all_dfs_to_transform, label_to_id
del categorical_cols, imputation_values
del protocol_cols_from_target, core_cols, final_column_order
gc.collect()

Applying transformations (Proto dummies, Encoders) to all datasets...
Processing DataFrame partition (Initial shape: (10345357, 12))
  Shape after get_dummies: (10345357, 29)
  Applying target data types...
  Shape after finalizing columns and types: (10345357, 30)
Processing DataFrame partition (Initial shape: (1468548, 12))
  Shape after get_dummies: (1468548, 27)
  Applying target data types...
  Shape after finalizing columns and types: (1468548, 30)
Processing DataFrame partition (Initial shape: (1799440, 12))
  Shape after get_dummies: (1799440, 27)
  Applying target data types...
  Shape after finalizing columns and types: (1799440, 30)
Processing DataFrame partition (Initial shape: (1165738, 12))
  Shape after get_dummies: (1165738, 24)
  Applying target data types...
  Shape after finalizing columns and types: (1165738, 30)
All transformations complete.


0

In [9]:
# final_dfs_list holds the training frame first, followed by the test frames
# in the same order as the keys of test_dfs
final_train_df = final_dfs_list[0]
final_test_dfs = dict(zip(test_dfs.keys(), final_dfs_list[1:]))

# 1. Write the training set
train_output_file = os.path.join(output_dir, 'train.csv')
final_train_df.to_csv(train_output_file, index=False)
print(f"Successfully saved ENCODED training data to {train_output_file} (shape: {final_train_df.shape})")

# 2. Write one CSV per sensor test set
for key, df in final_test_dfs.items():
    test_output_file = os.path.join(output_dir, f"test_{key}.csv")
    df.to_csv(test_output_file, index=False)
    print(f"Successfully saved ENCODED test data to {test_output_file} (shape: {df.shape})")

print("\n--- Processing and encoding complete! ---")
print(f"Final files are in the '{output_dir}' directory.")

Successfully saved ENCODED training data to final_dataset\train.csv (shape: (10345357, 30))
Successfully saved ENCODED test data to final_dataset\test_1.csv (shape: (1468548, 30))
Successfully saved ENCODED test data to final_dataset\test_2.csv (shape: (1799440, 30))
Successfully saved ENCODED test data to final_dataset\test_3.csv (shape: (1165738, 30))

--- Processing and encoding complete! ---
Final files are in the 'final_dataset' directory.
