- This notebook builds the training data (1 file) and the test data (5 files, one per sensor) from all sensors.
- The training data is the accumulation of 70% of the data from each sensor.
- Each test set consists of the remaining 30% of the data from its sensor.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import os
import numpy as np
import gc

print("Starting combined data processing and encoding script [NCC v15 - Standalone, Target Dtypes]")

Starting combined data processing and encoding script [NCC v15 - Standalone, Target Dtypes]


In [2]:
# Directory that receives the generated train/test CSV files.
output_dir = 'final_dataset'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    print(f"Created output directory: {output_dir}")

# Scenario key -> path of the .binetflow file to be loaded.
# Every path follows the same pattern, so the table is generated from the keys;
# insertion order is the order in which the files are iterated when loading.
dataset_files = {
    scenario: f'scenario_dataset_{scenario}/dataset_result.binetflow'
    for scenario in ('1', '2', '5', '9', '13')
}

In [3]:
df_list = []            # one DataFrame per successfully loaded file
loaded_keys = []        # scenario keys, index-aligned with df_list
all_original_data = []  # independent copies of the loaded frames, kept for encoder fitting

# Read sTos/dTos as text first; they are converted to numbers right after loading.
# Built once, outside the loop, because it does not change between files.
dtype_spec = {'sTos': str, 'dTos': str}

print("Loading all raw NCC dataframes for encoder fitting...")
for key, path in dataset_files.items():
    try:
        df = pd.read_csv(path, dtype=dtype_spec)

        # Special handling for sTos and dTos after being loaded as strings:
        # entries that cannot be parsed as numbers become NaN and are then set to 0.
        if 'dTos' in df.columns:
            df['dTos'] = pd.to_numeric(df['dTos'], errors='coerce').fillna(0)
        if 'sTos' in df.columns:
            df['sTos'] = pd.to_numeric(df['sTos'], errors='coerce').fillna(0)

        df_list.append(df)
        all_original_data.append(df.copy()) # Create a copy for encoder fitting
        loaded_keys.append(key)
        print(f"Successfully loaded: {path} (key: {key}, shape: {df.shape})")
    except FileNotFoundError:
        # A missing file is skipped on purpose so the remaining files can still be used.
        print(f"Warning: Dataset file not found at {path}. Skipping.")
    except Exception as e:
        print(f"Warning: Could not load {path}. Error: {e}. Skipping.")

if not df_list:
    # Nothing was loaded: stop here with a clear message instead of letting a
    # later cell fail with an unrelated error.
    raise RuntimeError("No .binetflow datasets could be loaded. Stopping script.")

# Free the names that are no longer needed. path/key are guaranteed to exist
# here because at least one file was loaded.
del dtype_spec
del dataset_files
del path
del key
gc.collect()

Loading all raw NCC dataframes for encoder fitting...
Successfully loaded: scenario_dataset_1/dataset_result.binetflow (key: 1, shape: (10, 15))
Successfully loaded: scenario_dataset_2/dataset_result.binetflow (key: 2, shape: (10, 15))
Successfully loaded: scenario_dataset_5/dataset_result.binetflow (key: 5, shape: (10, 15))
Successfully loaded: scenario_dataset_9/dataset_result.binetflow (key: 9, shape: (10, 15))
Successfully loaded: scenario_dataset_13/dataset_result.binetflow (key: 13, shape: (10, 15))


0

In [4]:
print("Fitting LabelEncoders on combined raw NCC data...")
# One frame containing every loaded file, so each encoder sees every value.
combined_fit_df = pd.concat(all_original_data, ignore_index=True)
del all_original_data # Delete raw merged copies as soon as possible
gc.collect()

categorical_cols = ['SrcAddr', 'Sport', 'Dir', 'DstAddr', 'Dport', 'State']
encoders = {}            # column name -> fitted LabelEncoder
imputation_values = {}   # column name -> integer ID used for missing/unknown values

for col in categorical_cols:
    if col not in combined_fit_df.columns:
        print(f"Warning: Column '{col}' not found in combined data. Skipping encoding for it.")
        continue

    print(f"Fitting LabelEncoder for: {col}")
    # Fill NaN with the string 'nan' so it can be encoded like any other value
    combined_fit_df[col] = combined_fit_df[col].fillna('nan').astype(str)
    unique_values = combined_fit_df[col].unique()

    le = LabelEncoder()
    le.fit(unique_values)
    encoders[col] = le

    # Remember which ID stands for 'nan'; fall back to 0 when the column
    # contained no missing value at all.
    if 'nan' not in le.classes_:
        imputation_values[col] = 0
        print(f"  No NaN found, using fallback ID: {imputation_values[col]}")
    else:
        imputation_values[col] = int(le.transform(['nan'])[0])
        print(f"  Found NaN, its encoded ID is: {imputation_values[col]}")

print("Encoders are ready.")
del combined_fit_df
gc.collect()

del unique_values
gc.collect()

Fitting LabelEncoders on combined raw NCC data...
Fitting LabelEncoder for: SrcAddr
  No NaN found, using fallback ID: 0
Fitting LabelEncoder for: Sport
  No NaN found, using fallback ID: 0
Fitting LabelEncoder for: Dir
  No NaN found, using fallback ID: 0
Fitting LabelEncoder for: DstAddr
  No NaN found, using fallback ID: 0
Fitting LabelEncoder for: Dport
  No NaN found, using fallback ID: 0
Fitting LabelEncoder for: State
  No NaN found, using fallback ID: 0
Encoders are ready.


0

To simplify the data (following my paper), this code removes unnecessary features such as:
- dTos
- sTos
- ActivityLabel (only in NCC-2)
- BotnetName (only in NCC-2)
- SensorId (only in NCC-2)
- StartTime

In [5]:
# 1. Define the label categorization function
def categorize_label(label):
    """Collapse a raw flow label into one of three classes.

    The label is converted to a string and compared case-insensitively:

    - contains 'botnet' and 'spam' -> 'botnet_spam'
    - contains 'botnet' only       -> 'botnet'
    - anything else                -> 'normal'

    "Anything else" includes labels mentioning 'background' or 'normal' as well
    as every unrecognised or missing label, so no separate check for those two
    words is needed.
    """
    label_str = str(label).lower()
    if 'botnet' not in label_str:
        return 'normal'
    return 'botnet_spam' if 'spam' in label_str else 'botnet'

def _simplify_frame(frame, drop_cols):
    """Simplify the labels of ``frame`` and return it without ``drop_cols``.

    The 'Label' column is rewritten in place on ``frame`` using
    ``categorize_label``. The returned DataFrame is the new object produced by
    ``drop``; names in ``drop_cols`` that are not columns of ``frame`` are ignored.
    """
    frame['Label'] = frame['Label'].apply(categorize_label)
    return frame.drop(columns=drop_cols, errors='ignore')


# 2. Apply the label categorization and 3. drop unnecessary columns.
# Entries are replaced one at a time so each original frame can be released
# as soon as its reduced version exists.
columns_to_drop = ['dTos', 'sTos', 'ActivityLabel', 'BotnetName', 'SensorId', 'StartTime']
for i in range(len(df_list)):
    df_list[i] = _simplify_frame(df_list[i], columns_to_drop)
print("Applied label simplification.")
print(f"Dropped unnecessary columns (if they existed).")

# Only names this cell is guaranteed to have created are deleted, so the cell
# does not depend on variables left behind by other cells.
del columns_to_drop
gc.collect()

Applied label simplification.
Dropped unnecessary columns (if they existed).


0

In [6]:
def _split_label_group(group, test_size=0.3, random_state=42):
    """Split the rows of one label from one file into ``(train, test)``.

    - An empty group yields two empty DataFrames.
    - A single-row group cannot be split (train_test_split raises ValueError
      when the resulting train set would be empty), so the row is kept for
      training and the test part is an empty DataFrame.
    - Otherwise the rows are split with the given ``test_size`` and
      ``random_state``.
    """
    if group.empty:
        return pd.DataFrame(), pd.DataFrame()
    if len(group) == 1:
        return group, pd.DataFrame()
    train_part, test_part = train_test_split(
        group, test_size=test_size, random_state=random_state
    )
    return train_part, test_part


def _split_frames_by_label(frames, labels):
    """Split every frame separately for each label.

    Returns two dicts (train parts, test parts) that map each label to a list
    holding one DataFrame per input frame, in the order of ``frames``. A label
    that does not occur in a frame contributes an empty DataFrame, so the lists
    always have ``len(frames)`` entries.
    """
    train_parts = {label: [] for label in labels}
    test_parts = {label: [] for label in labels}
    for frame in frames:
        for label in labels:
            train_part, test_part = _split_label_group(frame[frame['Label'] == label])
            train_parts[label].append(train_part)
            test_parts[label].append(test_part)
    return train_parts, test_parts


# Perform the split for each file and each label. Splitting each label on its
# own keeps the 70/30 ratio per label within every file.
_train_parts, _test_parts = _split_frames_by_label(
    df_list, ('normal', 'botnet', 'botnet_spam')
)
del df_list
gc.collect()

# Expose the results under the names the following cells expect; each list is
# index-aligned with loaded_keys (one entry per loaded file).
normal_train, normal_test = _train_parts['normal'], _test_parts['normal']
botnet_train, botnet_test = _train_parts['botnet'], _test_parts['botnet']
botnet_spam_train, botnet_spam_test = _train_parts['botnet_spam'], _test_parts['botnet_spam']
del _train_parts, _test_parts

print("Performed stratified train-test split.")
gc.collect()

Performed stratified train-test split.




0

In [7]:
temp_train_df = []
num_files = len(loaded_keys)

# Merge back by file: for every file, stack its non-empty per-label training
# parts (normal, botnet, botnet_spam - in that order).
for i in range(num_files):
    file_dfs = [
        parts[i]
        for parts in (normal_train, botnet_train, botnet_spam_train)
        if i < len(parts) and not parts[i].empty
    ]
    if file_dfs:
        temp_train_df.append(pd.concat(file_dfs, ignore_index=True))

del normal_train, botnet_train, botnet_spam_train
gc.collect()

if not temp_train_df:
    # Without training rows there is nothing to transform or save; fail here
    # with a clear message instead of leaving train_df undefined for later cells.
    raise RuntimeError("No training data was generated.")

train_df = pd.concat(temp_train_df, ignore_index=True)
del temp_train_df
gc.collect()

# Shuffle the training data (fixed seed, so the order is reproducible)
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
print(f"Combined training data (shape: {train_df.shape})")

Combined training data (shape: (30, 12))


In [8]:
# Scenario key -> shuffled test DataFrame of that file.
test_dfs = {}
for i in range(num_files):
    # Non-empty per-label test parts of file i (normal, botnet, botnet_spam).
    file_dfs = [
        parts[i]
        for parts in (normal_test, botnet_test, botnet_spam_test)
        if i < len(parts) and not parts[i].empty
    ]
    if not file_dfs:
        continue

    # Stack the parts, then shuffle each test data set with a fixed seed.
    df = (
        pd.concat(file_dfs, ignore_index=True)
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )
    test_dfs[loaded_keys[i]] = df

del normal_test, botnet_test, botnet_spam_test
gc.collect()
print(f"Combined test data (number of test sets: {len(test_dfs)})")

# Release the loop temporaries and the file counter.
del i, file_dfs, df, num_files
gc.collect()

Combined test data (number of test sets: 5)


0

In [9]:
# The first entry is the training partition; the remaining entries are the
# test partitions in the order of test_dfs.
all_dfs_to_transform = [train_df] + list(test_dfs.values())
del train_df # Remove old references, we will process from the list
gc.collect()

# --- Define the target column list ---
protocol_cols_from_target = [
    'arp', 'esp', 'icmp', 'igmp', 'ipv6', 'ipv6-icmp', 'ipx/spx', 'llc',
    'pim', 'rarp', 'rtcp', 'rtp', 'tcp', 'udp', 'udt', 'unas', 'ipnip'
]
core_cols = ['Dur', 'SrcAddr', 'Sport', 'Dir', 'DstAddr', 'Dport', 'State',
             'TotPkts', 'TotBytes', 'SrcBytes', 'Label']
final_column_order = core_cols + protocol_cols_from_target
# ------------------------------------


def _encode_with_fitted_encoder(values, encoder, fallback_id):
    """Encode ``values`` as int64 IDs with an already fitted LabelEncoder.

    Missing entries are replaced by the string 'nan' before the lookup.
    Entries that are not among ``encoder.classes_`` receive ``fallback_id``.

    The result is built as a fresh int64 Series instead of writing integers
    back into the string column, so the column never holds a mix of strings
    and integers.
    """
    as_text = values.fillna('nan').astype(str)
    encoded = pd.Series(fallback_id, index=values.index, dtype=np.int64)
    known_mask = as_text.isin(encoder.classes_)
    if known_mask.any():
        encoded.loc[known_mask] = encoder.transform(as_text.loc[known_mask])
    return encoded


def _transform_partition(partition, categorical_cols, encoders,
                         imputation_values, final_column_order):
    """Return the encoded, column-aligned and typed version of one partition.

    ``partition`` itself is not modified. Steps:
    1. one-hot encode 'Proto' (column names are the raw protocol values),
    2. replace each categorical column by its encoder IDs,
    3./4. keep exactly ``final_column_order``, adding absent columns as 0,
    5. cast 'Dur' to float64, keep 'Label' as object, everything else to int64
       (values that cannot be parsed as numbers become 0).
    """
    df = partition.copy()
    print(f"Processing DataFrame partition (Initial shape: {df.shape})")

    # 1. One-hot encoding 'Proto'
    if 'Proto' in df.columns:
        df = pd.get_dummies(df, columns=['Proto'], prefix='', prefix_sep='')
        print(f"  Shape after get_dummies: {df.shape}")

    # 2. Apply the pre-fitted LabelEncoder
    for col in categorical_cols:
        if col not in df.columns:
            continue  # absent here; it is added as an all-zero column below
        fallback_id = imputation_values.get(col, 0)
        if col in encoders:
            df[col] = _encode_with_fitted_encoder(df[col], encoders[col], fallback_id)
        else:
            # Column exists but no encoder was fitted for it (this should not happen)
            df[col] = fallback_id

    # 3. + 4. Select the target columns in order; columns that are missing
    # (e.g. protocols not seen in this partition) are created and filled with 0.
    df_final = df.reindex(columns=final_column_order, fill_value=0)

    # 5. Apply the target data type
    print(f"  Applying target data types...")
    for col in df_final.columns:
        if col == 'Dur':
            df_final[col] = pd.to_numeric(df_final[col], errors='coerce').fillna(0).astype(np.float64)
        elif col == 'Label':
            df_final[col] = df_final[col].astype(object) # Label remains as string
        else:
            # All other feature columns (including encoded IDs) should be int64
            df_final[col] = pd.to_numeric(df_final[col], errors='coerce').fillna(0).astype(np.int64)

    print(f"  Shape after finalizing columns and types: {df_final.shape}")
    return df_final


final_dfs_list = []

print("Applying transformations (Proto dummies, fitted Encoders) to all datasets...")
for df_orig in all_dfs_to_transform:
    final_dfs_list.append(
        _transform_partition(df_orig, categorical_cols, encoders,
                             imputation_values, final_column_order)
    )
    gc.collect()

print("Transformations complete.")

# all_dfs_to_transform always contains at least the training partition, so
# df_orig is guaranteed to exist here.
del all_dfs_to_transform, df_orig
del categorical_cols, imputation_values
del protocol_cols_from_target, core_cols, final_column_order
gc.collect()

Applying transformations (Proto dummies, fitted Encoders) to all datasets...
Processing DataFrame partition (Initial shape: (30, 12))
  Shape after get_dummies: (30, 13)
  Applying target data types...
  Shape after finalizing columns and types: (30, 28)
Processing DataFrame partition (Initial shape: (4, 12))
  Shape after get_dummies: (4, 13)
  Applying target data types...
  Shape after finalizing columns and types: (4, 28)
Processing DataFrame partition (Initial shape: (4, 12))
  Shape after get_dummies: (4, 13)
  Applying target data types...
  Shape after finalizing columns and types: (4, 28)


Processing DataFrame partition (Initial shape: (4, 12))


  Shape after get_dummies: (4, 13)
  Applying target data types...
  Shape after finalizing columns and types: (4, 28)
Processing DataFrame partition (Initial shape: (4, 12))
  Shape after get_dummies: (4, 13)
  Applying target data types...
  Shape after finalizing columns and types: (4, 28)
Processing DataFrame partition (Initial shape: (4, 12))
  Shape after get_dummies: (4, 13)
  Applying target data types...
  Shape after finalizing columns and types: (4, 28)
Transformations complete.


0

In [10]:
# Split the transformed list again: position 0 is the training data, the
# remaining frames pair up, in order, with the keys of test_dfs.
final_train_df = final_dfs_list[0]
final_test_dfs = dict(zip(test_dfs.keys(), final_dfs_list[1:]))

# Write the training data (without the index column)
train_output_file = os.path.join(output_dir, 'train.csv')
final_train_df.to_csv(train_output_file, index=False)
print(f"Saved ENCODED training data to {train_output_file} (shape: {final_train_df.shape})")

# Write one CSV per test set, named after its scenario key
for key, df in final_test_dfs.items():
    test_output_file = os.path.join(output_dir, f"test_{key}.csv")
    df.to_csv(test_output_file, index=False)
    print(f"Saved ENCODED test data to {test_output_file} (shape: {df.shape})")

print("\n--- All-in-one processing and encoding complete for NCC! ---")
print(f"Final files are in the '{output_dir}' directory.")

Saved ENCODED training data to final_dataset\train.csv (shape: (30, 28))
Saved ENCODED test data to final_dataset\test_1.csv (shape: (4, 28))
Saved ENCODED test data to final_dataset\test_2.csv (shape: (4, 28))
Saved ENCODED test data to final_dataset\test_5.csv (shape: (4, 28))
Saved ENCODED test data to final_dataset\test_9.csv (shape: (4, 28))
Saved ENCODED test data to final_dataset\test_13.csv (shape: (4, 28))

--- All-in-one processing and encoding complete for NCC! ---
Final files are in the 'final_dataset' directory.
