In [1]:
import os
import numpy as np
import pandas as pd
import glob

In [2]:
# Folder holding the processed CSV captures; every *.csv directly inside it is
# picked up by the preparation loop further down.
basedir = './GothamDataset2025/processed/'

In [29]:
import pandas as pd
import numpy as np

def extract_protocols_and_ports(df):
    """
    Extract protocol and port information from packet data.

    The transport protocol is derived from the ``frame.protocols`` layer
    string: ``:tcp`` wins over ``:udp``, which wins over ``:icmp``. Any other
    packet is labelled with its last layer name in upper case, or "UNKNOWN"
    when the layer string is missing or contains no ``:`` separator.

    Ports are taken from the ``tcp.*port`` / ``udp.*port`` columns of the
    matching protocol. A port that is missing or cannot be converted to an
    integer becomes NaN instead of raising, and a port column that does not
    exist at all is treated as entirely missing.

    Parameters:
    df (pandas.DataFrame): Input dataset. It is modified in place: the columns
        ``ip.protocol``, ``src.port`` and ``dst.port`` are added and the raw
        ``ip.proto`` / ``tcp.*port`` / ``udp.*port`` columns are dropped when
        present.

    Returns:
    pandas.DataFrame: The same DataFrame object, with extracted protocol and
        port information.

    Raises:
    KeyError: If the ``frame.protocols`` column is missing.
    """
    def _port_or_nan(value):
        """Return ``value`` as an int port, or NaN when missing or unparseable."""
        if pd.isna(value):
            return np.nan
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return np.nan

    def _column_values(name):
        """Return column ``name`` as a list, or all-NaN when it is absent."""
        if name in df.columns:
            return df[name].tolist()
        return [np.nan] * len(df)

    protocols, src_ports, dst_ports = [], [], []

    # Walk plain Python lists rather than iterrows(): same row order, but no
    # per-row Series construction.
    rows = zip(
        df["frame.protocols"].tolist(),
        _column_values("tcp.srcport"),
        _column_values("tcp.dstport"),
        _column_values("udp.srcport"),
        _column_values("udp.dstport"),
    )

    for layers, tcp_src, tcp_dst, udp_src, udp_dst in rows:
        # A missing layer string (NaN/None) is handled like an empty one.
        if not isinstance(layers, str):
            layers = ""

        if ":tcp" in layers:
            protocol = "TCP"
            src_port = _port_or_nan(tcp_src)
            dst_port = _port_or_nan(tcp_dst)
        elif ":udp" in layers:
            protocol = "UDP"
            src_port = _port_or_nan(udp_src)
            dst_port = _port_or_nan(udp_dst)
        elif ":icmp" in layers:
            protocol = "ICMP"
            src_port = np.nan
            dst_port = np.nan
        else:
            # Not TCP, UDP or ICMP: fall back to the innermost layer name.
            protocol = layers.split(":")[-1].upper() if ":" in layers else "UNKNOWN"
            src_port = np.nan
            dst_port = np.nan

        protocols.append(protocol)
        src_ports.append(src_port)
        dst_ports.append(dst_port)

    df["ip.protocol"] = protocols
    df["src.port"] = src_ports
    df["dst.port"] = dst_ports

    # Drop the raw columns that are now redundant, skipping any that are absent.
    cols_to_drop = [
        "ip.proto",
        "tcp.srcport",
        "tcp.dstport",
        "udp.srcport",
        "udp.dstport",
    ]
    df.drop(
        columns=[col for col in cols_to_drop if col in df.columns],
        inplace=True,
    )

    return df

def convert_ports(df, port_hierarchy_map_iot):
    """
    Convert ports to categorical values using a predefined port hierarchy.

    The ``src.port`` and ``dst.port`` columns are replaced in place by category
    names. Entries of the hierarchy are tested in order and the first container
    holding the port wins, so specific entries must precede catch-all ranges.

    Parameters:
    df (pandas.DataFrame): Input dataset with ``src.port`` and ``dst.port``.
    port_hierarchy_map_iot (list): A list of tuples, where each tuple is
        (container_of_ports, category_name); the container may be a list or a
        ``range``.

    Returns:
    pandas.DataFrame: The same DataFrame object, with converted ports. Missing
        or non-numeric ports become "NONE"; numeric ports matching no entry
        become "OTHER".
    """
    # NumPy scalars are accepted too: np.int64 is not a subclass of int and
    # would otherwise be treated as non-numeric.
    numeric_types = (int, float, np.integer, np.floating)

    def port_to_categories(port, port_hierarchy_map_iot):
        """Convert port number to category according to port_map."""
        if not isinstance(port, numeric_types) or pd.isna(port):
            return "NONE"

        # Normalise whole-number values to a plain int. `x in range(...)` is
        # O(1) only for a real int; for floats and NumPy scalars Python falls
        # back to comparing against every element of the range.
        if isinstance(port, (int, np.integer)):
            port = int(port)
        elif float(port).is_integer():
            port = int(port)

        for p_range, p_name in port_hierarchy_map_iot:
            if port in p_range:
                return p_name

        return "OTHER"

    df["src.port"] = df["src.port"].apply(lambda port: port_to_categories(port, port_hierarchy_map_iot))
    df["dst.port"] = df["dst.port"].apply(lambda port: port_to_categories(port, port_hierarchy_map_iot))

    return df

def convert_checksums(df):
    """
    Turn the hexadecimal checksum/option columns into numbers.

    ``ip.checksum`` and ``tcp.checksum`` are parsed as base-16 integers, and
    ``tcp.options`` is parsed the same way and then cast to float. Anything
    that is missing, empty, already zero-like or not valid hex becomes 0.
    The DataFrame is modified in place and returned.
    """
    # Values that short-circuit to 0 without attempting a hex parse.
    zero_like = ("", 0, -1, "0")

    def _safe_hex_to_int(hex_val):
        """Parse ``hex_val`` as base-16, returning 0 for blanks and bad input."""
        if pd.isna(hex_val):
            return 0
        if hex_val in zero_like:
            return 0
        try:
            # Stringify first so non-string inputs go through the same parser.
            parsed = int(str(hex_val), 16)
        except (ValueError, TypeError):
            parsed = 0
        return parsed

    for column in ("ip.checksum", "tcp.checksum"):
        df[column] = df[column].apply(_safe_hex_to_int)

    # tcp.options is decoded with the same helper but stored as float.
    options_as_int = df["tcp.options"].apply(_safe_hex_to_int)
    df["tcp.options"] = options_as_int.astype(float)

    return df

def group_labels(df):
    """
    Add a ``label_category`` column that buckets each attack label.

    Every value of the ``label`` column is looked up in a fixed table of known
    labels; labels that are not in the table are categorised as "Other". The
    DataFrame is modified in place and returned.
    """
    # Known labels listed per broader category.
    members_by_category = {
        "Normal": ("Normal",),
        "Network Scanning": ("TCP Scan", "UDP Scan"),
        "Brute Force": ("Telnet Brute Force",),
        "Infection": ("Reporting", "Ingress Tool Transfer", "File Download"),
        "DoS": (
            "CoAP Amplification",
            "Merlin TCP Flooding",
            "Merlin UDP Flooding",
            "Merlin ICMP Flooding",
            "Mirai TCP Flooding",
            "Mirai UDP Flooding",
            "Mirai GRE Flooding",
        ),
        "C&C Communication": (
            "Merlin C&C Communication",
            "Mirai C&C Communication",
        ),
    }

    # Invert the table into a direct label -> category lookup.
    category_of = {
        label: category
        for category, labels in members_by_category.items()
        for label in labels
    }

    def _categorise(raw_label):
        return category_of.get(raw_label, "Other")

    df["label_category"] = df["label"].map(_categorise)
    return df

def unpack_flags(df):
    """
    Unpacks IP and TCP flags from hexadecimal strings into binary features,
    adds them to the DataFrame, and drops the original hex columns.

    Missing, empty or unparseable flag values decode to 0 (all bits clear), so
    the function works whether or not missing values were filled beforehand.
    The input DataFrame is not modified; a copy is returned.

    Parameters:
    df (pandas.DataFrame): The input DataFrame with ``ip.flags`` and
        ``tcp.flags`` columns.

    Returns:
    pandas.DataFrame: A copy with the ``ip.flag.*`` and ``tcp.flag.*`` 0/1
        columns added and the original ``ip.flags`` / ``tcp.flags`` removed.
    """
    def _safe_hex_to_int(hex_val):
        """
        Safely convert a hex string (or other value) to an integer.
        Handles empty strings "", NaN, 0, and -1 by returning 0.
        """
        if pd.isna(hex_val) or hex_val in ["", 0, -1, "0"]:
            return 0
        try:
            # Convert to string first to handle potential numeric inputs
            return int(str(hex_val), 16)
        except (ValueError, TypeError):
            # Return 0 if conversion fails
            return 0

    X = df.copy()

    # --- Unpack IP flags ---
    # NOTE(review): values are cast to uint8, so this presumes ip.flags fits in
    # one byte — confirm against the capture format.
    ip_flags_decimal = X["ip.flags"].apply(_safe_hex_to_int).values.astype(np.uint8)
    # Least-significant bit first; keep the three lowest bits.
    ip_flags_binary = np.unpackbits(ip_flags_decimal.reshape((-1, 1)), axis=1, bitorder="little")[:, :3]

    # Names are assigned to bit 0, bit 1, bit 2 in that order.
    # NOTE(review): confirm that the capture really stores rb/df/mf in the
    # three low bits in this order.
    ip_flag_names = ['ip.flag.rb', 'ip.flag.df', 'ip.flag.mf']

    ip_flags_df = pd.DataFrame(ip_flags_binary, columns=ip_flag_names, index=X.index)

    # --- Unpack TCP flags ---
    # Store the values as explicitly little-endian 16-bit integers. With a
    # native uint16 the byte view below would put the high byte first on a
    # big-endian host and scramble the flag columns.
    tcp_flags_decimal = X["tcp.flags"].apply(_safe_hex_to_int).values.astype("<u2")

    # Reinterpret the (N,) 16-bit array as (N, 2) bytes: low byte, then high byte.
    tcp_flags_bytes = tcp_flags_decimal.view(np.uint8).reshape((-1, 2))

    # 2 bytes * 8 bits = 16 bits per row, least-significant first; the nine
    # lowest bits are the ones named below.
    tcp_flags_binary = np.unpackbits(tcp_flags_bytes, axis=1, bitorder="little")[:, :9]

    # TCP flag names, in order of bits from unpackbits (bit 0 first).
    tcp_flag_names = [
        'tcp.flag.fin', 'tcp.flag.syn', 'tcp.flag.rst', 'tcp.flag.psh',
        'tcp.flag.ack', 'tcp.flag.urg', 'tcp.flag.ece', 'tcp.flag.cwr', 'tcp.flag.ns'
    ]

    tcp_flags_df = pd.DataFrame(tcp_flags_binary, columns=tcp_flag_names, index=X.index)

    # --- Combine with original DataFrame ---
    X = pd.concat([X, ip_flags_df, tcp_flags_df], axis=1)

    # Drop the original hex columns, ignoring errors if they don't exist
    X = X.drop(columns=['ip.flags', 'tcp.flags'], errors='ignore')

    return X

def fill_missing_values(df):
    """
    Replace missing values with a dtype-appropriate placeholder.

    Numeric columns get 0; every other column gets the empty string "".
    The DataFrame is modified in place and returned.

    Parameters:
    df (pandas.DataFrame): Input dataset.

    Returns:
    pandas.DataFrame: The same DataFrame object with missing values filled.
    """
    # (select_dtypes arguments, placeholder) — numeric columns are handled
    # first, then whatever is not numeric.
    fill_plan = (
        ({"include": ["number"]}, 0),
        ({"exclude": ["number"]}, ""),
    )

    for dtype_filter, placeholder in fill_plan:
        selected = df.select_dtypes(**dtype_filter).columns
        df[selected] = df[selected].fillna(placeholder)

    return df



In [30]:
# Columns removed from every chunk at the start of the preparation loop,
# before any of the feature-extraction helpers run.
features_to_drop = ["frame.time", "ip.dst", "ip.src", "ip.tos"]

In [31]:
# Ordered (ports, category) pairs used to turn a port number into a category.
# The order is significant for any consumer that stops at the first match:
# a few ports (e.g. 13, 512, 593, 750) appear under more than one category,
# and the two catch-all ranges come last.
PORT_HIERARCHY_MAP_IOT = [
    (ports, category)
    for category, ports in {
        # IoT messaging and streaming
        "mqttPorts": [1883, 8883],
        "coapPorts": [5683, 5684],
        "rtspPorts": [8554, 8322, 8000, 8001, 8002, 8003, 1935, 8888],
        # General services
        "httpPorts": [
            80, 280, 443, 591, 593, 777, 488,
            1183, 1184, 2069, 2301, 2381, 8008, 8080,
        ],
        "mailPorts": [
            24, 25, 50, 58, 61, 109, 110, 143, 158,
            174, 209, 220, 406, 512, 585, 993, 995,
        ],
        "dnsPorts": [42, 53, 81, 101, 105, 261],
        "ftpPorts": [20, 21, 47, 69, 115, 152, 189, 349, 574, 662, 989, 990],
        "shellPorts": [22, 23, 59, 87, 89, 107, 211, 221, 222, 513, 614, 759, 992],
        "remoteExecPorts": [512, 514],
        "authPorts": [13, 56, 113, 316, 353, 370, 749, 750],
        "passwordPorts": [229, 464, 586, 774],
        "newsPorts": [114, 119, 532, 563],
        "chatPorts": [194, 258, 531, 994],
        "printPorts": [35, 92, 170, 515, 631],
        "timePorts": [13, 37, 52, 123, 519, 525],
        "dbmsPorts": [65, 66, 118, 150, 156, 217],
        "dhcpPorts": [546, 547, 647, 847],
        "whoisPorts": [43, 63],
        "netbiosPorts": range(137, 140),
        "kerberosPorts": [88, 748, 750],
        "RPCPorts": [111, 121, 369, 530, 567, 593, 602],
        "snmpPorts": [161, 162, 391],
        # Catch-all ranges
        "PRIVILEGED_PORTS": range(0, 1024),
        "NONPRIVILEGED_PORTS": range(1024, 65536),
    }.items()
]

In [None]:
# Prepare every processed capture: read it in chunks, run the feature pipeline
# on each chunk, then write one prepared CSV per input file.
files = glob.glob(os.path.join(basedir, "*.csv"))

# Create the output folder up front so the final to_csv cannot fail on a
# missing directory.
output_dir = os.path.join("./GothamDataset2025", "prepared")
os.makedirs(output_dir, exist_ok=True)

for filename in files:

    print(f"Processing : {os.path.basename(filename)}")
    processed_chunks = []

    for chunk in pd.read_csv(filename, sep=",", low_memory=False, chunksize=10000):

        # Processing
        chunk.drop(labels=features_to_drop, axis=1, inplace=True)
        chunk = extract_protocols_and_ports(chunk)
        chunk = convert_checksums(chunk)
        chunk = group_labels(chunk)
        chunk = unpack_flags(chunk)
        # Categorise ports while a missing port is still NaN. Once
        # fill_missing_values has replaced NaN with 0, a port-less packet
        # would be indistinguishable from port 0 and never map to "NONE".
        chunk = convert_ports(chunk, PORT_HIERARCHY_MAP_IOT)
        chunk = fill_missing_values(chunk)

        # Store the fully processed chunk to build the dataframe later
        processed_chunks.append(chunk)

    # pd.concat raises on an empty list, so skip files that yielded no chunk.
    if not processed_chunks:
        print(f"Skipping (no rows) : {os.path.basename(filename)}")
        continue

    df = pd.concat(processed_chunks, ignore_index=True)
    # splitext removes only the extension; rstrip(".csv") would strip any
    # trailing '.', 'c', 's' or 'v' characters from the device name as well.
    iot_device = os.path.splitext(os.path.basename(filename))[0]
    df.to_csv(os.path.join(output_dir, f"{iot_device}.csv"), index=False)

Processing : iotsim-air-quality-1.csv
