In [105]:
import numpy as np
import pandas as pd


In [106]:
# Captures root folder
dataset_root_folder = "/run/media/avenortoz/Feishuo/univ/magister/opt/Malware-Project/BigDataset/IoTScenarios/"

# Capture names
capture_4_1 = dataset_root_folder + "CTU-Honeypot-Capture-4-1/bro/conn.log.labeled"
capture_5_1 = dataset_root_folder + "CTU-Honeypot-Capture-5-1/bro/conn.log.labeled"
capture_7_1 = dataset_root_folder + "CTU-Honeypot-Capture-7-1"
capture_1_1 = dataset_root_folder + "CTU-IoT-Malware-Capture-1-1"
capture_17_1 = dataset_root_folder + "CTU-IoT-Malware-Capture-17-1"
capture_20_1 = "CTU-IoT-Malware-Capture-20-1"
capture_21_1 = "CTU-IoT-Malware-Capture-21-1"
capture_3_1 = "CTU-IoT-Malware-Capture-3-1"
capture_33_1 = "CTU-IoT-Malware-Capture-33-1"
capture_34_1 = "CTU-IoT-Malware-Capture-34-1"
capture_35_1 = "CTU-IoT-Malware-Capture-35-1"
capture_36_1 = "CTU-IoT-Malware-Capture-36-1"
capture_39_1 = "CTU-IoT-Malware-Capture-39-1"
capture_42_1 = "CTU-IoT-Malware-Capture-42-1"
capture_43_1 = "CTU-IoT-Malware-Capture-43-1"
capture_44_1 = "CTU-IoT-Malware-Capture-44-1"
capture_48_1 = "CTU-IoT-Malware-Capture-48-1"
capture_49_1 = "CTU-IoT-Malware-Capture-49-1"
capture_52_1 = "CTU-IoT-Malware-Capture-52-1"
capture_60_1 = "CTU-IoT-Malware-Capture-60-1"
capture_7_1 = "CTU-IoT-Malware-Capture-7-1"
capture_8_1 = "CTU-IoT-Malware-Capture-8-1"
capture_9_1 = "CTU-IoT-Malware-Capture-9-1"

In [107]:
dtype_mapping = {
    "string": "object",
    "addr": "object",  # IP addresses can remain as object type for easy handling with strings
    "port": "Int64",  # Use Int64 to handle NaNs in integer columns
    "enum": "object",  # Enumerations as strings
    "interval": "float64",  # Duration in seconds as float
    "count": "Int64",  # For integer counts, handle NaNs
    "bool": "boolean",  # For boolean fields
    "set[string]": "object"  # Set of strings can be treated as a single string object, process later if needed
}

columns = ["ts", "uid", "id.orig_h", "id.orig_p", "id.resp_h", "id.resp_p", "proto", "service", "duration",
           "orig_bytes", "resp_bytes", "conn_state", "local_orig", "local_resp", "missed_bytes", "history",
           "orig_pkts", "orig_ip_bytes", "resp_pkts", "resp_ip_bytes", "tunnel_parents", "label", "detailed-label"]

dtypes = {
    "ts": "float64",  # We'll parse this into datetime after loading
    "uid": "string",
    "id.orig_h": dtype_mapping["addr"],
    "id.orig_p": dtype_mapping["port"],
    "id.resp_h": dtype_mapping["addr"],
    "id.resp_p": dtype_mapping["port"],
    "proto": dtype_mapping["enum"],
    "service": dtype_mapping["string"],
    "duration": dtype_mapping["interval"],
    "orig_bytes": dtype_mapping["count"],
    "resp_bytes": dtype_mapping["count"],
    "conn_state": dtype_mapping["string"],
    "local_orig": dtype_mapping["bool"],
    "local_resp": dtype_mapping["bool"],
    "missed_bytes": dtype_mapping["count"],
    "history": dtype_mapping["string"],
    "orig_pkts": dtype_mapping["count"],
    "orig_ip_bytes": dtype_mapping["count"],
    "resp_pkts": dtype_mapping["count"],
    "resp_ip_bytes": dtype_mapping["count"],
    "tunnel_parents": dtype_mapping["set[string]"],
    "label": dtype_mapping["string"],
    "detailed-label": dtype_mapping["string"]
}

## Data cleaning

In [108]:
def load_capture(capture_name, columns, dtype_mapping):
    df = pd.read_csv(
        capture_name,
        sep="\x09",
        skiprows=6,
        names=columns,
        low_memory=False
    )
    # First row is about header and starts with #fields (not needed) so we skip it
    # Second row is about types, we specified them in advance so we will skip it as well
    df.drop(index=[0, 1], inplace=True)

    # Last row in the Zeek file has no values just some metadata which is not needed
    df.drop(df.index[-1], inplace=True)

    # Here in Zeek file the last three columns are split by '  ' symbol and not by '\x09' so need to process them separately
    last_three = df[columns[-3]].str.split("   ", expand=True)  # Split on double spaces

    # Assign the split parts back to the respective columns
    df[columns[-3]] = last_three[0]
    df[columns[-2]] = last_three[1]
    df[columns[-1]] = last_three[2]

    for column, dtype in dtype_mapping.items():
        if column in df.columns:
            df[column] = df[column].astype(dtype)

    df['ts'] = pd.to_numeric(df['ts'], errors='coerce')
    df['ts'] = pd.to_datetime(df['ts'], unit='s', errors='coerce')
    return df


def process_missing_zeek_values(df, columns_to_process=None):
    if columns_to_process is None:
        df.replace("-", np.nan, inplace=True)
        return

    for column in columns_to_process:
        df[column].replace("-", np.nan, inplace=True)

    print(f"Missing values left: {df.isin(['-']).sum()}")
    return df

In [109]:
df_4_1 = load_capture(capture_5_1, columns, dtype_mapping)

In [110]:
possible_columns_with_empty = [
    'service',
    'local_orig',
    'local_resp',
    'tunnel_parents',
    'detailed-label',
]

df_4_1 = process_missing_zeek_values(df_4_1, possible_columns_with_empty)
df_4_1



Missing values left: ts                  0
uid                 0
id.orig_h           0
id.orig_p           0
id.resp_h           0
id.resp_p           0
proto               0
service             0
duration          298
orig_bytes        298
resp_bytes        298
conn_state          0
local_orig          0
local_resp          0
missed_bytes        0
history            86
orig_pkts           0
orig_ip_bytes       0
resp_pkts           0
resp_ip_bytes       0
tunnel_parents      0
label               0
detailed-label      0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].replace("-", np.nan, inplace=True)
  df[column].replace("-", np.nan, inplace=True)


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,label,detailed-label
2,2018-09-21 09:40:22.965529919,CJAF5z3MDFg4XVDXB,0.0.0.0,68,255.255.255.255,67,udp,dhcp,8.322388,600,...,,0,D,2,656,0,0,,benign,
3,2018-09-21 09:41:37.732295036,CcYEFX3Qj9xdNX1ZCa,192.168.2.1,5353,224.0.0.251,5353,udp,dns,-,-,...,,0,D,1,391,0,0,,benign,
4,2018-09-21 09:41:37.732373953,CF6fCK1nFqvI1XxGM1,fe80::80e6:50ff:fe12:1464,5353,ff02::fb,5353,udp,dns,-,-,...,,0,D,1,411,0,0,,benign,
5,2018-09-21 09:41:37.732506990,C2dlio3MkdwWJ80y04,169.254.15.115,5353,224.0.0.251,5353,udp,dns,-,-,...,,0,D,1,391,0,0,,benign,
6,2018-09-21 09:41:39.498805046,COrF0t2uLErai9nfH5,0.0.0.0,68,255.255.255.255,67,udp,dhcp,60.555587,3000,...,,0,D,10,3280,0,0,,benign,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1371,2018-09-21 15:08:54.223381996,CrPQ4n1jVFxbm0mcj9,192.168.69.136,51443,239.255.255.250,1900,udp,,-,-,...,,0,D,1,153,0,0,,benign,
1372,2018-09-21 15:09:20.979739904,Ce0LkH33VlZBF8Vw93,fe80::106c:9e5b:3af8:9cf,143,ff02::16,0,icmp,,1.945701,40,...,,0,-,2,152,0,0,,benign,
1373,2018-09-21 15:08:55.379472971,CdWSpn4EA2vXfUBqI7,fe80::183b:7564:dbcc:3eca,143,ff02::16,0,icmp,,1.024087,40,...,,0,-,2,152,0,0,,benign,
1374,2018-09-21 15:08:24.559191942,ClKCqX3JOxjrti9lLh,fe80::1847:a1bd:8d13:f43c,143,ff02::16,0,icmp,,59.390351,80,...,,0,-,4,304,0,0,,benign,


In [111]:
# here i need to process data, basically it would a good idea to remove columns that i don't need, and process others
# other think to is to handle columns with types that are not numbes such as categorical, but i'm not sure should i do this at this step or later
# add new column to the dataset, so i will now from which the row come, teh same is with uid flag(maybe i will be able to use later to have access to the
# packete itself, not sure if it will be needed but would be nice to have just in case
# and of course process missing values and duplicates
def process_capture(df):
    pass

In [112]:

all_columns = [
    'ts', 'uid',
    'id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p', 'proto',
    'service', 'duration', 'orig_bytes', 'resp_bytes', 'conn_state',
    'local_orig', 'local_resp', 'missed_bytes', 'history', 'orig_pkts',
    'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'tunnel_parents',
    'label', 'detailed-label'
]

numeric_columns = ["duration",
                   "orig_bytes",
                   "resp_bytes",
                   "missed_bytes",
                   "local_orig",
                   "local_resp",
                   "orig_pkts",
                   "orig_ip_bytes",
                   "resp_pkts",
                   "resp_ip_bytes"]

## Categories encoding
conn_state = {
    "S0": 0,
    "S1": 1,
    "S2": 2,
    "S3": 3,
    "SF": 4,
    "REJ": 5,
    "RSTO": 6,
    "RSTR": 7,
    "RSTOS0": 8,
    "RSTRH": 9,
    "SH": 10,
    "SHR": 11,
    "OTH": 12
}

detailed_label = {
    "Benign": 0,
    "Attack": 1,
    "C&C": 2,
    "C&C-FileDownload": 3,
    "C&C-HeartBeat": 4,
    "C&C-HeartBeat-Attack": 5,
    "C&C-HeartBeat-FileDownload": 6,
    "C&C-Mirai": 7,
    "C&C-PartOfAHorizontalPortScan": 8,
    "C&C-Torii": 9,
    "DDoS": 10,
    "FileDownload": 11,
    "Okiru": 12,
    "Okiru-Attack": 13,
    "PartOfAHorizontalPortScan": 14,
    "PartOfAHorizontalPortScan-Attack": 15,
}

label = {
    "benign": 0,
    "Malicious": 1
}

proto = {
    "icmp": 0,
    "tcp": 1,
    "udp": 2
}

service = {
    np.NaN: 0,
    "dhcp": 1,
    "dns": 2,
    "http": 3,
    "ssh": 4,
    "ssl": 5,
    "irc": 6
}


In [114]:
def handle_missing_values(df):
    df['detailed-label'].replace(np.nan, "Benign", inplace=True)

    numeric_columns_that_cannot_have_nan = [
        "duration", "orig_bytes", "resp_bytes", "missed_bytes", "orig_pkts", "orig_ip_bytes",
        "resp_pkts", "resp_ip_bytes", "history"
    ]

    # It's almost unlikely that these columns could have NaN values, but it still could happen
    # We have a lot of data so there's no reason to try and keep them, so we will remove them
    df.dropna(subset=numeric_columns_that_cannot_have_nan, inplace=True)

    return df


def clean_data(df):
    handle_missing_values(df)
    df.drop_duplicates(keep='first', inplace=True)


## Data preparation

In [115]:
def encode_categorical(df):
    df['conn_state'] = df['conn_state'].map(conn_state)
    df['detailed-label'] = df['detailed-label'].map(detailed_label)
    df['label'] = df['label'].map(label)
    df['service'] = df['service'].map(service)
    df['proto'] = df['proto'].map(proto)

    return df

In [116]:
df_4_1 = handle_missing_values(df_4_1)
df_4_1 = encode_categorical(df_4_1)
df_4_1

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,label,detailed-label
2,2018-09-21 09:40:22.965529919,CJAF5z3MDFg4XVDXB,0.0.0.0,68,255.255.255.255,67,2,1,8.322388,600,...,,0,D,2,656,0,0,,0,0
3,2018-09-21 09:41:37.732295036,CcYEFX3Qj9xdNX1ZCa,192.168.2.1,5353,224.0.0.251,5353,2,2,-,-,...,,0,D,1,391,0,0,,0,0
4,2018-09-21 09:41:37.732373953,CF6fCK1nFqvI1XxGM1,fe80::80e6:50ff:fe12:1464,5353,ff02::fb,5353,2,2,-,-,...,,0,D,1,411,0,0,,0,0
5,2018-09-21 09:41:37.732506990,C2dlio3MkdwWJ80y04,169.254.15.115,5353,224.0.0.251,5353,2,2,-,-,...,,0,D,1,391,0,0,,0,0
6,2018-09-21 09:41:39.498805046,COrF0t2uLErai9nfH5,0.0.0.0,68,255.255.255.255,67,2,1,60.555587,3000,...,,0,D,10,3280,0,0,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1371,2018-09-21 15:08:54.223381996,CrPQ4n1jVFxbm0mcj9,192.168.69.136,51443,239.255.255.250,1900,2,0,-,-,...,,0,D,1,153,0,0,,0,0
1372,2018-09-21 15:09:20.979739904,Ce0LkH33VlZBF8Vw93,fe80::106c:9e5b:3af8:9cf,143,ff02::16,0,0,0,1.945701,40,...,,0,-,2,152,0,0,,0,0
1373,2018-09-21 15:08:55.379472971,CdWSpn4EA2vXfUBqI7,fe80::183b:7564:dbcc:3eca,143,ff02::16,0,0,0,1.024087,40,...,,0,-,2,152,0,0,,0,0
1374,2018-09-21 15:08:24.559191942,ClKCqX3JOxjrti9lLh,fe80::1847:a1bd:8d13:f43c,143,ff02::16,0,0,0,59.390351,80,...,,0,-,4,304,0,0,,0,0


There is a few columns that i don't know how to handle missing data there:
- local_orig
- local_resp
- tunnel_parents

In [117]:
def handle_history_column(df):
    history_chars = {
        'S': "The originator sent a SYN segment.",
        'h': "The responder sent a SYN ACK segment.",
        'A': "The originator sent an ACK segment.",
        'D': "The originator sent at least one segment with payload data.",
        'a': "The responder replied with an ACK segment.",
        'd': "The responder replied with at least one segment with payload data.",
        'F': "The originator sent a FIN ACK segment.",
        'f': "The responder replied with a FIN ACK segment."
    }
    for char, description in history_chars.items():
        df[f'history_{char}'] = df['history'].apply(lambda x: 1 if char in str(x) else 0)

    # Drop the original 'history' column if no longer needed
    df.drop(columns=['history'], inplace=True)

    # Display the first few rows to check the new columns
    return df

In [118]:
# Question should we do this now or later?
df_4_1 = handle_history_column(df_4_1)
df_4_1

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,label,detailed-label,history_S,history_h,history_A,history_D,history_a,history_d,history_F,history_f
2,2018-09-21 09:40:22.965529919,CJAF5z3MDFg4XVDXB,0.0.0.0,68,255.255.255.255,67,2,1,8.322388,600,...,0,0,0,0,0,1,0,0,0,0
3,2018-09-21 09:41:37.732295036,CcYEFX3Qj9xdNX1ZCa,192.168.2.1,5353,224.0.0.251,5353,2,2,-,-,...,0,0,0,0,0,1,0,0,0,0
4,2018-09-21 09:41:37.732373953,CF6fCK1nFqvI1XxGM1,fe80::80e6:50ff:fe12:1464,5353,ff02::fb,5353,2,2,-,-,...,0,0,0,0,0,1,0,0,0,0
5,2018-09-21 09:41:37.732506990,C2dlio3MkdwWJ80y04,169.254.15.115,5353,224.0.0.251,5353,2,2,-,-,...,0,0,0,0,0,1,0,0,0,0
6,2018-09-21 09:41:39.498805046,COrF0t2uLErai9nfH5,0.0.0.0,68,255.255.255.255,67,2,1,60.555587,3000,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1371,2018-09-21 15:08:54.223381996,CrPQ4n1jVFxbm0mcj9,192.168.69.136,51443,239.255.255.250,1900,2,0,-,-,...,0,0,0,0,0,1,0,0,0,0
1372,2018-09-21 15:09:20.979739904,Ce0LkH33VlZBF8Vw93,fe80::106c:9e5b:3af8:9cf,143,ff02::16,0,0,0,1.945701,40,...,0,0,0,0,0,0,0,0,0,0
1373,2018-09-21 15:08:55.379472971,CdWSpn4EA2vXfUBqI7,fe80::183b:7564:dbcc:3eca,143,ff02::16,0,0,0,1.024087,40,...,0,0,0,0,0,0,0,0,0,0
1374,2018-09-21 15:08:24.559191942,ClKCqX3JOxjrti9lLh,fe80::1847:a1bd:8d13:f43c,143,ff02::16,0,0,0,59.390351,80,...,0,0,0,0,0,0,0,0,0,0


In [None]:
## I don't know what to do with that nan colums, so i will actually drop them, at leas