In [None]:
import re
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import NearestNeighbors
from scipy.stats import wasserstein_distance

In [None]:
# Read the synthetic 1K CSV export.
csv_file_path = "synthetic_linux_logs_1K.csv"
sf1 = pd.read_csv(csv_file_path)

# Flatten every CSV row into a single space-separated text line and write the
# lines to a .txt file.
output_file_path = "synthetic_linux_logs_1K.txt"
with open(output_file_path, "w") as f:
    f.writelines(
        " ".join(str(cell) for cell in row.values) + "\n"
        for _, row in sf1.iterrows()
    )

In [None]:
# ------------------ PARSING FUNCTION ------------------ #
def clean_procname(raw):
    """Remove square brackets and double quotes from a raw procname token.

    Example: 'elasticsearch[m' -> 'elasticsearchm'.
    """
    return re.sub(r'[\[\]"]+', '', raw)


# Patterns are compiled once here rather than on every parsed line.
_HEAD_PATTERN = re.compile(
    r"\[(?P<timestamp>[\d:.]+)\] "
    r"\(\+(?P<time_elapsed>[\d.]+)\) "
    r"(?P<host_name>\w+) "
    r"(?P<event_name>[\w_]+):"
)
_CPU_PATTERN = re.compile(r"\{ cpu_id = (?P<cpu_id>\d+) \}")
_PROC_PATTERN = re.compile(
    r'\{ procname = "(?P<procname>[^"]+)", pid = (?P<pid>\d+), tid = (?P<tid>\d+)'
)
_GROUP_PATTERN = re.compile(r"\{ ([^{}]+) \}")


def parse_log_line(line):
    """Parse one trace line into a flat dict of fields.

    Expected layout:
        [timestamp] (+elapsed) host event: { cpu_id = N }, { procname = "..", pid = N, tid = N }, { payload }

    Returns:
        dict with keys timestamp, time_elapsed (float), host_name, event_name,
        cpu_id (int or None), procname, process_details, kernel_details;
        or None when the header does not match or the elapsed token is not a
        valid number (e.g. "..").

    kernel_details is the last curly-braced group that is NOT the cpu_id or
    procname group. When a line carries no payload group it is None instead of
    repeating the procname/cpu_id text.
    """
    match = _HEAD_PATTERN.match(line)
    if not match:
        return None
    gd = match.groupdict()

    # [\d.]+ also accepts tokens such as ".." that float() rejects; treat those
    # lines as unparseable, like any other header mismatch.
    try:
        time_elapsed = float(gd["time_elapsed"])
    except ValueError:
        return None

    # Everything after the matched header holds the brace-delimited groups.
    rest = line[match.end():].strip()

    cpu_match = _CPU_PATTERN.search(rest)
    cpu_id = int(cpu_match.group("cpu_id")) if cpu_match else None

    proc_match = _PROC_PATTERN.search(rest)
    if proc_match:
        procname = clean_procname(proc_match.group("procname"))
        process_details = f'pid = {proc_match.group("pid")}, tid = {proc_match.group("tid")}'
    else:
        procname = None
        process_details = None

    # Skip the groups already consumed as cpu_id / procname so they can never
    # be reported as the kernel payload.
    header_starts = {m.start() for m in (cpu_match, proc_match) if m}
    payload_groups = [
        group.group(1)
        for group in _GROUP_PATTERN.finditer(rest)
        if group.start() not in header_starts
    ]
    kernel_details = payload_groups[-1] if payload_groups else None

    return {
        "timestamp": gd["timestamp"],
        "time_elapsed": time_elapsed,
        "host_name": gd["host_name"],
        "event_name": gd["event_name"],
        "cpu_id": cpu_id,
        "procname": procname,
        "process_details": process_details,
        "kernel_details": kernel_details
    }

def parse_log_file(file_path):
    """Parse every line of a text log into a DataFrame.

    Lines for which parse_log_line returns a falsy value (no match) are
    skipped. One DataFrame row is produced per parsed line.
    """
    with open(file_path) as handle:
        candidates = (parse_log_line(raw_line) for raw_line in handle)
        # Materialise inside the `with` so the file is still open while reading.
        records = [record for record in candidates if record]
    return pd.DataFrame(records)

# ------------------ DATA LOADING ------------------ #
# rf1: parsed kernel trace; sf1: parsed synthetic log text (1K files).
rf1, sf1 = (
    parse_log_file(path)
    for path in ("kernel_trace_1K.txt", "synthetic_linux_logs_1K.txt")
)

In [None]:
# Strip double quotes from the kernel payload text.
kernel_text = sf1["kernel_details"].astype(str)
sf1["kernel_details"] = kernel_text.str.replace('"', '', regex=False)
# Remove stray quote characters and surrounding whitespace from timestamps.
timestamp_text = sf1['timestamp'].astype(str)
sf1['timestamp'] = timestamp_text.str.replace("'", "").str.replace('"', '').str.strip()
# Relabel the host name on the synthetic frame.
sf1['host_name'] = sf1['host_name'].replace('mendax', 'synthetic-host')

In [None]:
# Summarise sf1: row count, column dtypes and non-null counts.
sf1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 985 entries, 0 to 984
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   timestamp        985 non-null    object 
 1   time_elapsed     985 non-null    float64
 2   host_name        985 non-null    object 
 3   event_name       985 non-null    object 
 4   cpu_id           979 non-null    float64
 5   procname         941 non-null    object 
 6   process_details  941 non-null    object 
 7   kernel_details   985 non-null    object 
dtypes: float64(2), object(6)
memory usage: 61.7+ KB


In [None]:
# Keep only the columns both frames share, then discard rows with any missing value.
common_cols = rf1.columns.intersection(sf1.columns)
rf1, sf1 = (frame[common_cols].dropna() for frame in (rf1, sf1))

In [None]:
# Read the synthetic 10K CSV export.
csv_file_path = "synthetic_linux_logs_10K.csv"
sf2 = pd.read_csv(csv_file_path)

# Flatten every CSV row into a single space-separated text line and write the
# lines to a .txt file.
output_file_path = "synthetic_linux_logs_10K.txt"
with open(output_file_path, "w") as f:
    f.writelines(
        " ".join(str(cell) for cell in row.values) + "\n"
        for _, row in sf2.iterrows()
    )

In [None]:
# rf2: parsed kernel trace; sf2: parsed synthetic log text (10K files).
rf2, sf2 = (
    parse_log_file(path)
    for path in ("kernel_trace_10K.txt", "synthetic_linux_logs_10K.txt")
)

In [None]:
# Summarise sf2: row count, column dtypes and non-null counts.
sf2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9987 entries, 0 to 9986
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   timestamp        9987 non-null   object 
 1   time_elapsed     9987 non-null   float64
 2   host_name        9987 non-null   object 
 3   event_name       9987 non-null   object 
 4   cpu_id           9987 non-null   int64  
 5   procname         9970 non-null   object 
 6   process_details  9970 non-null   object 
 7   kernel_details   9987 non-null   object 
dtypes: float64(1), int64(1), object(6)
memory usage: 624.3+ KB


In [None]:
# Strip double quotes from the kernel payload text.
kernel_text = sf2["kernel_details"].astype(str)
sf2["kernel_details"] = kernel_text.str.replace('"', '', regex=False)
# Remove stray quote characters and surrounding whitespace from timestamps.
timestamp_text = sf2['timestamp'].astype(str)
sf2['timestamp'] = timestamp_text.str.replace("'", "").str.replace('"', '').str.strip()
# Relabel the host name on the synthetic frame.
sf2['host_name'] = sf2['host_name'].replace('mendax', 'synthetic-host')

In [None]:
# Display rf2, the frame parsed from kernel_trace_10K.txt.
rf2

Unnamed: 0,timestamp,time_elapsed,host_name,event_name,cpu_id,procname,process_details,kernel_details
0,06:13:02.227912438,3.750000e-06,mendax,syscall_exit_accept,2,elasticsearchm,"pid = 11822, tid = 11859","ret = 576, upeer_sockaddr = 246916502706648, u..."
1,06:13:02.227914646,2.208000e-06,mendax,syscall_entry_fcntl,2,elasticsearchm,"pid = 11822, tid = 11859","fd = 576, cmd = 3, arg = 50"
2,06:13:02.227914980,3.340000e-07,mendax,syscall_exit_fcntl,2,elasticsearchm,"pid = 11822, tid = 11859","ret = 2, arg = 50"
3,06:13:02.227916313,1.333000e-06,mendax,syscall_entry_getsockname,2,elasticsearchm,"pid = 11822, tid = 11859","fd = 576, usockaddr_len = 246916502706368"
4,06:13:02.227916771,4.580000e-07,mendax,syscall_exit_getsockname,2,elasticsearchm,"pid = 11822, tid = 11859","ret = 0, usockaddr = 246916502706376, usockadd..."
...,...,...,...,...,...,...,...,...
9994,06:13:02.287305351,1.500000e-06,mendax,syscall_exit_newfstatat,3,curl,"pid = 14352, tid = 14352","ret = 0, statbuf = 281474076040576"
9995,06:13:02.287305601,2.500000e-07,mendax,syscall_entry_newfstatat,3,curl,"pid = 14352, tid = 14352","dfd = -100, filename = ""/"", flag = 0"
9996,06:13:02.287306226,6.250000e-07,mendax,syscall_exit_newfstatat,3,curl,"pid = 14352, tid = 14352","ret = 0, statbuf = 281474076040872"
9997,06:13:02.287306768,5.420000e-07,mendax,syscall_entry_openat,3,curl,"pid = 14352, tid = 14352","dfd = -100, filename = ""/etc/nsswitch.conf"", f..."


In [None]:
# Read the synthetic 100K CSV export.
csv_file_path = "synthetic_linux_logs_100K.csv"
sf3 = pd.read_csv(csv_file_path)

# Flatten every CSV row into a single space-separated text line and write the
# lines to a .txt file.
output_file_path = "synthetic_linux_logs_100K.txt"
with open(output_file_path, "w") as f:
    f.writelines(
        " ".join(str(cell) for cell in row.values) + "\n"
        for _, row in sf3.iterrows()
    )

In [None]:
# rf3: parsed kernel trace; sf3: parsed synthetic log text (100K files).
rf3, sf3 = (
    parse_log_file(path)
    for path in ("kernel_trace_100K.txt", "synthetic_linux_logs_100K.txt")
)

In [None]:
# Strip double quotes from the kernel payload text.
kernel_text = sf3["kernel_details"].astype(str)
sf3["kernel_details"] = kernel_text.str.replace('"', '', regex=False)
# Remove stray quote characters and surrounding whitespace from timestamps.
timestamp_text = sf3['timestamp'].astype(str)
sf3['timestamp'] = timestamp_text.str.replace("'", "").str.replace('"', '').str.strip()
# Relabel the host name on the synthetic frame.
sf3['host_name'] = sf3['host_name'].replace('mendax', 'synthetic-host')

In [None]:
# Display sf3 after the quote / host-name clean-up.
# NOTE(review): some kernel_details values in the output start with "procname = ...",
# i.e. they repeat the process group rather than a kernel payload — verify the parser.
sf3

Unnamed: 0,timestamp,time_elapsed,host_name,event_name,cpu_id,procname,process_details,kernel_details
0,06:13:02.227912111,2.209000e-06,synthetic-host,syscall_entry_unknown,2,elasticsearch,"pid = 11822, tid = 11859","procname = elasticsearch, pid = 11822, tid = 1..."
1,06:13:02.227913188,1.333000e-06,synthetic-host,syscall_entry_getsockname,2,elasticsearchm,"pid = 11822, tid = 11859","fd = 573, usockaddr_len = 246916502706412"
2,06:13:02.227913547,4.042000e-06,synthetic-host,syscall_entry_ioctl,2,elasticsearch,"pid = 11822, tid = 11859","fd = 573, cmd = 2, arg = 1"
3,06:13:02.227913095,1.833000e-06,synthetic-host,syscall_entry_sendmsg,0,gnome-shell,"pid = 3154, tid = 3154","fd = 45, msg = 281474090966848, flags = 16448"
4,06:13:02.227915105,2.125000e-06,synthetic-host,syscall_entry_fcntl,2,elasticsearch,"pid = 11822, tid = 11859","fd = 573, cmd = 3, arg = 50"
...,...,...,...,...,...,...,...,...
99993,06:13:02.227908920,6.584000e-06,synthetic-host,syscall_entry_rt_sigtimedwait,1,curl,"pid = 14351, tid = 14351","uts = 0, sigsetsize = 8"
99994,06:13:02.227915088,9.708000e-06,synthetic-host,syscall_exit_futex,1,elasticsearchm,"pid = 11822, tid = 13656","procname = elasticsearch[m, pid = 11822, tid =..."
99995,06:13:02.227921970,5.830000e-07,synthetic-host,syscall_entry_fcntl,2,elasticsearchm,"pid = 11822, tid = 11859","fd = 573, cmd = 3, arg = 50"
99996,06:13:02.227912528,2.417000e-06,synthetic-host,syscall_entry_fcntl,2,elasticsearchm,"pid = 11822, tid = 11859","fd = 573, cmd = 3, arg = 50"


In [None]:
# Load the CTGAN-generated CSVs for the 1K / 10K / 100K sizes.
sf1_CTGAN, sf2_CTGAN, sf3_CTGAN = (
    pd.read_csv(csv_path)
    for csv_path in (
        "synthetic_linux_logs_1K_CTGAN.csv",
        "synthetic_linux_logs_10K_CTGAN.csv",
        "synthetic_linux_logs_100K_CTGAN.csv",
    )
)

In [None]:
# Apply the same text clean-up to each CTGAN frame (columns are replaced in place).
for ctgan_frame in (sf1_CTGAN, sf2_CTGAN, sf3_CTGAN):
    # Kernel payload: drop double quotes.
    ctgan_frame["kernel_details"] = (
        ctgan_frame["kernel_details"].astype(str).str.replace('"', '', regex=False)
    )
    # Timestamp: drop single/double quotes and surrounding whitespace.
    ctgan_ts = ctgan_frame['timestamp'].astype(str)
    ctgan_frame['timestamp'] = ctgan_ts.str.replace("'", "").str.replace('"', '').str.strip()
    # Host name: use the hyphenated spelling.
    ctgan_frame['host_name'] = ctgan_frame['host_name'].replace('synthetic_host', 'synthetic-host')

In [None]:
# Align each kernel-trace / CTGAN pair of the same size on their shared columns
# and drop rows with missing values. Each size uses its own trace frame
# (1K -> rf1, 10K -> rf2, 100K -> rf3); pairing every size with rf1 would leave
# rf2 and rf3 unaligned and unfiltered.
common_cols = rf1.columns.intersection(sf1_CTGAN.columns)
rf1 = rf1[common_cols].dropna()
sf1_CTGAN = sf1_CTGAN[common_cols].dropna()

common_cols = rf2.columns.intersection(sf2_CTGAN.columns)
rf2 = rf2[common_cols].dropna()
sf2_CTGAN = sf2_CTGAN[common_cols].dropna()

common_cols = rf3.columns.intersection(sf3_CTGAN.columns)
rf3 = rf3[common_cols].dropna()
sf3_CTGAN = sf3_CTGAN[common_cols].dropna()

In [None]:
# Summarise sf2_CTGAN: row count, column dtypes and non-null counts.
sf2_CTGAN.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   timestamp        10000 non-null  object 
 1   time_elapsed     10000 non-null  float64
 2   host_name        10000 non-null  object 
 3   event_name       10000 non-null  object 
 4   cpu_id           10000 non-null  int64  
 5   procname         10000 non-null  object 
 6   process_details  10000 non-null  object 
 7   kernel_details   10000 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 625.1+ KB


In [None]:
# Load the LSTM-generated CSVs for the 1K / 10K / 100K sizes.
sf1_LSTM, sf2_LSTM, sf3_LSTM = (
    pd.read_csv(csv_path)
    for csv_path in (
        "synthetic_linux_logs_1K_LSTM.csv",
        "synthetic_linux_logs_10K_LSTM.csv",
        "synthetic_linux_logs_100K_LSTM.csv",
    )
)

In [None]:
# Apply the same text clean-up to each LSTM frame (columns are replaced in place).
for lstm_frame in (sf1_LSTM, sf2_LSTM, sf3_LSTM):
    # Kernel payload: drop double quotes.
    lstm_frame["kernel_details"] = (
        lstm_frame["kernel_details"].astype(str).str.replace('"', '', regex=False)
    )
    # Timestamp: drop single/double quotes and surrounding whitespace.
    lstm_ts = lstm_frame['timestamp'].astype(str)
    lstm_frame['timestamp'] = lstm_ts.str.replace("'", "").str.replace('"', '').str.strip()
    # Host name: relabel as synthetic.
    lstm_frame['host_name'] = lstm_frame['host_name'].replace('mendax', 'synthetic-host')

In [None]:
# Display sf3_LSTM after clean-up.
# NOTE(review): the tail rows in the output show fractional cpu_id values and
# negative time_elapsed, repeated verbatim — these look like generation artifacts;
# confirm whether they should be filtered before analysis.
sf3_LSTM

Unnamed: 0,timestamp,time_elapsed,host_name,event_name,cpu_id,procname,process_details,kernel_details
0,06:13:09.321372956,5.840000e-07,synthetic-host,syscall_exit_futex,3.000000,node,"pid = 12061, tid = 12068","ret = 0, uaddr = 1151640084, uaddr2 = 0"
1,06:13:09.321373331,3.750000e-07,synthetic-host,syscall_entry_futex,3.000000,node,"pid = 12061, tid = 12068","uaddr = 1151640000, op = 129, val = 1, utime =..."
2,06:13:09.321373456,1.250000e-07,synthetic-host,syscall_exit_futex,3.000000,node,"pid = 12061, tid = 12068","ret = 0, uaddr = 1151640000, uaddr2 = 0"
3,06:13:09.321374414,9.580000e-07,synthetic-host,syscall_entry_futex,3.000000,node,"pid = 12061, tid = 12068","uaddr = 1151640080, op = 393, val = 4311, utim..."
4,06:13:09.321379372,4.958000e-06,synthetic-host,syscall_exit_futex,0.000000,node,"pid = 12061, tid = 12061","ret = 2, uaddr = 1151736432, uaddr2 = 74"
...,...,...,...,...,...,...,...,...
100030,06:13:08.860598343,-8.346371e-02,synthetic-host,syscall_exit_faccessat,2.885166,ibus-engine-sim,"pid = 3154, tid = 3214","ret = 0, statbuf = 281474473867424"
100031,06:13:08.860598343,-8.346371e-02,synthetic-host,syscall_exit_faccessat,2.885166,ibus-engine-sim,"pid = 3154, tid = 3214","ret = 0, statbuf = 281474473867424"
100032,06:13:08.860598343,-8.346371e-02,synthetic-host,syscall_exit_faccessat,2.885166,ibus-engine-sim,"pid = 3154, tid = 3214","ret = 0, statbuf = 281474473867424"
100033,06:13:08.860598343,-8.346371e-02,synthetic-host,syscall_exit_faccessat,2.885166,ibus-engine-sim,"pid = 3154, tid = 3214","ret = 0, statbuf = 281474473867424"


In [None]:
# Load the OpenAI-generated CSVs (1K and 10K only).
sf1_OpenAI, sf2_OpenAI = (
    pd.read_csv(csv_path)
    for csv_path in (
        "synthetic_linux_logs_1K_OpenAI.csv",
        "synthetic_linux_logs_10K_OpenAI.csv",
    )
)

In [None]:
# Split the combined "procname = ..., pid = ..., tid = ..." text of each OpenAI
# frame into a separate procname column plus the remaining pid/tid text.
# NOTE(review): the extracted name keeps characters such as '[' (the output shows
# 'elasticsearch[m'), whereas clean_procname strips them for parsed traces —
# confirm whether the two should be normalised the same way.
for openai_frame in (sf1_OpenAI, sf2_OpenAI):
    details_text = openai_frame['process_details'].astype(str)
    openai_frame['procname'] = details_text.str.extract(r'procname\s*=\s*([^,]+)')
    openai_frame['process_details'] = (
        details_text
        .str.replace(r'procname\s*=\s*[^,]+,?\s*', '', regex=True)
        .str.strip(', ')
    )
    openai_frame['host_name'] = openai_frame['host_name'].replace('mendax', 'synthetic-host')

In [None]:
# Clean the OpenAI frames: strip quotes, then keep only digits and dots in time_elapsed.
# NOTE(review): the [^0-9.] filter also removes '-' and 'e', so a value written in
# scientific notation (e.g. 3.75e-06) would be silently altered — confirm the raw
# column never uses that format.
sf1_OpenAI["kernel_details"] = sf1_OpenAI["kernel_details"].astype(str).str.replace('"', '', regex=False)
sf1_OpenAI['timestamp'] = sf1_OpenAI['timestamp'].astype(str).str.replace("'", "").str.replace('"', '').str.strip()
sf1_OpenAI['time_elapsed'] = sf1_OpenAI['time_elapsed'].str.replace(r'[^0-9.]', '', regex=True)
# Drop rows whose value collapsed to a lone '.'. The .copy() makes the result an
# independent frame so later column assignments do not raise SettingWithCopyWarning.
sf1_OpenAI = sf1_OpenAI[sf1_OpenAI['time_elapsed'] != '.'].copy()

sf2_OpenAI["kernel_details"] = sf2_OpenAI["kernel_details"].astype(str).str.replace('"', '', regex=False)
sf2_OpenAI['timestamp'] = sf2_OpenAI['timestamp'].astype(str).str.replace("'", "").str.replace('"', '').str.strip()
sf2_OpenAI['time_elapsed'] = sf2_OpenAI['time_elapsed'].str.replace(r'[^0-9.]', '', regex=True)
sf2_OpenAI = sf2_OpenAI[sf2_OpenAI['time_elapsed'] != '.'].copy()

In [None]:
# Convert the cleaned time_elapsed text of both OpenAI frames to float.
for openai_frame in (sf1_OpenAI, sf2_OpenAI):
    openai_frame['time_elapsed'] = openai_frame['time_elapsed'].astype(float)

In [None]:
# Display sf3 (the same frame is also displayed in an earlier cell).
sf3

Unnamed: 0,timestamp,time_elapsed,host_name,event_name,cpu_id,procname,process_details,kernel_details
0,06:13:02.227912111,2.209000e-06,synthetic-host,syscall_entry_unknown,2,elasticsearch,"pid = 11822, tid = 11859","procname = elasticsearch, pid = 11822, tid = 1..."
1,06:13:02.227913188,1.333000e-06,synthetic-host,syscall_entry_getsockname,2,elasticsearchm,"pid = 11822, tid = 11859","fd = 573, usockaddr_len = 246916502706412"
2,06:13:02.227913547,4.042000e-06,synthetic-host,syscall_entry_ioctl,2,elasticsearch,"pid = 11822, tid = 11859","fd = 573, cmd = 2, arg = 1"
3,06:13:02.227913095,1.833000e-06,synthetic-host,syscall_entry_sendmsg,0,gnome-shell,"pid = 3154, tid = 3154","fd = 45, msg = 281474090966848, flags = 16448"
4,06:13:02.227915105,2.125000e-06,synthetic-host,syscall_entry_fcntl,2,elasticsearch,"pid = 11822, tid = 11859","fd = 573, cmd = 3, arg = 50"
...,...,...,...,...,...,...,...,...
99993,06:13:02.227908920,6.584000e-06,synthetic-host,syscall_entry_rt_sigtimedwait,1,curl,"pid = 14351, tid = 14351","uts = 0, sigsetsize = 8"
99994,06:13:02.227915088,9.708000e-06,synthetic-host,syscall_exit_futex,1,elasticsearchm,"pid = 11822, tid = 13656","procname = elasticsearch[m, pid = 11822, tid =..."
99995,06:13:02.227921970,5.830000e-07,synthetic-host,syscall_entry_fcntl,2,elasticsearchm,"pid = 11822, tid = 11859","fd = 573, cmd = 3, arg = 50"
99996,06:13:02.227912528,2.417000e-06,synthetic-host,syscall_entry_fcntl,2,elasticsearchm,"pid = 11822, tid = 11859","fd = 573, cmd = 3, arg = 50"


In [None]:
# Display sf2_OpenAI after clean-up; procname is the last column because it was
# added after loading.
sf2_OpenAI

Unnamed: 0,timestamp,time_elapsed,host_name,event_name,cpu_id,process_details,kernel_details,procname
0,06:13:02.227912438,3.750000e-06,synthetic-host,syscall_exit_accept,2,"pid = 11822, tid = 11859","ret = 576, upeer_sockaddr = 246916502706648, u...",elasticsearch[m
1,06:13:02.229904385,1.125000e-06,synthetic-host,syscall_entry_timerfd_settime,1,"pid = 2932, tid = 2972","ufd = 67, flags = 1, utmr = 281458146600112",data-loop.0
2,06:13:02.236461585,1.250000e-06,synthetic-host,syscall_exit_prlimit64,3,"pid = 14346, tid = 14346","ret = 0, old_rlim = 281474187421944",sleep
3,06:13:02.228922411,7.080000e-07,synthetic-host,syscall_entry_ioctl,3,"pid = 3154, tid = 3213","fd = 13, cmd = 3222299827, arg = 248272618772584",llvmpipe-0
5,06:13:02.227912438,3.750000e-06,synthetic-host,syscall_exit_accept,2,"pid = 11822, tid = 11859","ret = 576, upeer_sockaddr = 246916502706648, u...",elasticsearch[m
...,...,...,...,...,...,...,...,...
9700,06:13:02.242957703,4.875000e-06,synthetic-host,syscall_entry_socket,1,"pid=14348,tid=14348","family=1,type=526337,protocol=0",curl
9701,06:13:02.242962369,4.666000e-06,synthetic-host,syscall_exit_socket,1,"pid=14348,tid=14348",ret=3,curl
9702,06:13:02.242962911,5.420000e-07,synthetic-host,syscall_entry_connect,1,"pid=14348,tid=14348","fd=3,uservaddr=281473988939624,addrlen=110",curl
9703,06:13:02.243022243,4.200000e-08,synthetic-host,syscall_entry_eventfd2,1,"pid=14348,tid=14348","count=0,flags=526336",curl


In [None]:
# Load the GPT-4.1-generated CSVs (a 2.5K file and a 10K file).
sf1_gpt_4, sf2_gpt_4 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "synthetic_linux_logs_2.5K_GPT-4.1.csv",
        "synthetic_linux_logs_10K_GPT-4.1.csv",
    )
)

In [None]:
# Keep the first 1000 rows of the 2.5K file. .copy() detaches the slice from the
# loaded frame so the column assignments in later cells write to an independent
# DataFrame instead of a slice (avoids SettingWithCopyWarning).
sf1_gpt_4 = sf1_gpt_4[:1000].copy()

In [None]:
# Remove the "(++" prefix, then the closing ")" from each time_elapsed value.
elapsed_text = sf1_gpt_4['time_elapsed'].str.replace(r'\(\+\+', '', regex=True)
sf1_gpt_4.loc[:, 'time_elapsed'] = elapsed_text.str.replace(r'\)', '', regex=True)

In [None]:
# Pull the process name out of the combined process_details text.
proc_text = sf1_gpt_4['process_details']
sf1_gpt_4['procname'] = proc_text.str.extract(r'procname\s*=\s*([^,]+)')

# Remove the "procname = ...," prefix so only 'pid = ..., tid = ...' remains.
sf1_gpt_4['process_details'] = proc_text.str.replace(r'procname\s*=\s*[^,]+,\s*', '', regex=True)

In [None]:
# Display sf1_gpt_4 after the time_elapsed clean-up and procname split.
sf1_gpt_4

Unnamed: 0,timestamp,time_elapsed,host_name,event_name,cpu_id,process_details,kernel_details,procname
0,06:13:02.227946563,0.000008417,synthetic-host,syscall_entry_read,3,"pid = 12542, tid = 12543","fd = 34, buf = 314159265358979, count = 1024",nginx
1,06:13:02.227954980,0.000008417,synthetic-host,syscall_exit_read,3,"pid = 12542, tid = 12543",ret = 512,nginx
2,06:13:02.227970000,0.000006603,synthetic-host,syscall_entry_write,1,"pid = 13456, tid = 13457","fd = 12, buf = 987654321012345, count = 512",redis
3,06:13:02.228000000,0.000009234,synthetic-host,syscall_entry_connect,0,"pid = 14012, tid = 14017","fd = 45, sockaddr = 123456789012345, addrlen = 16",apache2
4,06:13:02.228015000,0.000015000,synthetic-host,syscall_exit_connect,0,"pid = 14012, tid = 14017",ret = 0,apache2
...,...,...,...,...,...,...,...,...
995,12:45:33.987654321,0.000008290,synthetic-host,syscall_entry_write,1,"pid = 18901, tid = 18902","fd = 61, buf = 271828182846390, count = 1024",nginx
996,13:22:44.123456789,0.000008295,synthetic-host,syscall_entry_write,1,"pid = 19234, tid = 19235","fd = 59, buf = 271828182846375, count = 1024",nginx
997,10:20:15.345678912,0.000008310,synthetic-host,syscall_entry_write,1,"pid = 15012, tid = 15013","fd = 55, buf = 271828182846390, count = 1024",nginx
998,10:18:54.123456789,0.000008265,synthetic-host,syscall_entry_write,1,"pid = 15890, tid = 15891","fd = 62, buf = 271828182846395, count = 1024",nginx


In [None]:
# Remove the "(++" prefix, then the closing ")" from each time_elapsed value.
elapsed_text = sf2_gpt_4['time_elapsed'].str.replace(r'\(\+\+', '', regex=True)
sf2_gpt_4.loc[:, 'time_elapsed'] = elapsed_text.str.replace(r'\)', '', regex=True)

In [None]:
# Pull the process name out of the combined process_details text.
proc_text = sf2_gpt_4['process_details']
sf2_gpt_4['procname'] = proc_text.str.extract(r'procname\s*=\s*([^,]+)')

# Remove the "procname = ...," prefix so only 'pid = ..., tid = ...' remains.
sf2_gpt_4['process_details'] = proc_text.str.replace(r'procname\s*=\s*[^,]+,\s*', '', regex=True)

In [None]:
# Display sf2_gpt_4 after the time_elapsed clean-up and procname split.
sf2_gpt_4

Unnamed: 0,timestamp,time_elapsed,host_name,event_name,cpu_id,process_details,kernel_details,procname
0,06:14:15.123456789,0.000005678,synthetic-host,syscall_entry_read,1,"pid = 12542, tid = 12543","fd = 34, buf = 123456789012345, count = 1024",nginx
1,06:15:45.789012345,0.000004321,synthetic-host,syscall_entry_open,0,"pid = 13201, tid = 13202","filename = /etc/ssh/sshd_config, flags = 0x0, ...",sshd
2,06:22:18.342789123,0.000007412,synthetic-host,syscall_entry_read,1,"pid = 13876, tid = 13877","fd = 45, buf = 987654321098765, count = 512",nginx
3,06:25:17.456789123,0.000006789,synthetic-host,syscall_entry_write,3,"pid = 14567, tid = 14568","fd = 40, buf = 987654321012345, count = 512",nginx
4,06:30:12.123456789,0.000005678,synthetic-host,syscall_entry_read,1,"pid = 13579, tid = 13580","fd = 27, buf = 987654321012345, count = 2048",nginx
...,...,...,...,...,...,...,...,...
9995,09:47:15.789654321,0.000008435,synthetic-host,syscall_entry_read,2,"pid = 56789, tid = 56790","fd = 38, buf = 876543210987654, count = 1024",nginx
9996,10:45:33.789456123,0.000008438,synthetic-host,syscall_entry_read,2,"pid = 28765, tid = 28766","fd = 37, buf = 234567890123456, count = 1024",nginx
9997,07:21:45.123456789,0.000008410,synthetic-host,syscall_entry_openat,0,"pid = 65432, tid = 65432","dfd = 255, filename = /bin/ls, flags = 0, mode...",bash
9998,06:13:02.227946563,0.000008417,synthetic-host,syscall_entry_read,3,"pid = 12542, tid = 12543","fd = 34, buf = 314159265358979, count = 1024",nginx


In [None]:
# Load the o4-mini-generated CSVs (1K and 10K).
sf1_gpt_o4, sf2_gpt_o4 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "synthetic_linux_logs_1K_o4-mini.csv",
        "synthetic_linux_logs_10K_o4-mini.csv",
    )
)

In [None]:
# Strip leading apostrophes from the two time columns of both o4-mini frames.
for o4_frame in (sf1_gpt_o4, sf2_gpt_o4):
    for time_column in ('timestamp', 'time_elapsed'):
        o4_frame[time_column] = o4_frame[time_column].str.lstrip("'")

In [None]:
# Summarise sf1_gpt_o4: row count, column dtypes and non-null counts.
sf1_gpt_o4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   timestamp        1000 non-null   object
 1   time_elapsed     1000 non-null   object
 2   host_name        1000 non-null   object
 3   event_name       1000 non-null   object
 4   cpu_id           1000 non-null   int64 
 5   procname         1000 non-null   object
 6   process_details  1000 non-null   object
 7   kernel_details   982 non-null    object
dtypes: int64(1), object(7)
memory usage: 62.6+ KB


In [None]:
# Display sf2_gpt_o4 after the leading-apostrophe clean-up.
sf2_gpt_o4

Unnamed: 0,timestamp,time_elapsed,host_name,event_name,cpu_id,procname,process_details,kernel_details
0,12:34:56.100001234,0.000001234,synthetic-host,syscall_entry_open,3,nginx,"pid = 1234, tid = 1234","filename = /etc/nginx/nginx.conf, flags = 5242..."
1,12:34:56.100002500,0.000001266,synthetic-host,syscall_exit_open,3,nginx,"pid = 1234, tid = 1234",ret = 3
2,12:34:56.100010000,0.000007500,synthetic-host,syscall_entry_read,3,nginx,"pid = 1234, tid = 1234","fd = 3, buf = 129003452, count = 512"
3,12:34:56.100011111,0.000001111,synthetic-host,syscall_exit_read,3,nginx,"pid = 1234, tid = 1234","ret = 512, buf = 129003452"
4,12:34:56.100020000,0.000008889,synthetic-host,syscall_entry_close,3,nginx,"pid = 1234, tid = 1234",fd = 3
...,...,...,...,...,...,...,...,...
9995,12:00:00.410000000,0.000000500,synthetic-host,syscall_exit_connect,1,wget,"pid = 5100, tid = 5103",ret = 0
9996,12:00:00.420000000,0.000000500,synthetic-host,syscall_entry_accept,2,bash,"pid = 5200, tid = 5200",fd = 7
9997,12:00:00.430000000,0.000000500,synthetic-host,syscall_exit_accept,3,python3,"pid = 5300, tid = 5301","ret = 8, upeer_sockaddr = 281474976711044, upe..."
9998,12:00:00.440000000,0.000000500,synthetic-host,syscall_entry_futex,4,java,"pid = 5400, tid = 5402","uaddr = 140737488352000, op = 0, val = 2, utim..."


In [None]:
# Remove any remaining literal "++" from time_elapsed in both GPT-4.1 frames.
for gpt4_frame in (sf1_gpt_4, sf2_gpt_4):
    gpt4_frame.loc[:, 'time_elapsed'] = gpt4_frame['time_elapsed'].str.replace('++', '', regex=False)

In [None]:
# Convert the cleaned time_elapsed text of both GPT-4.1 frames to float.
for gpt4_frame in (sf1_gpt_4, sf2_gpt_4):
    gpt4_frame['time_elapsed'] = gpt4_frame['time_elapsed'].astype(float)

In [None]:
# Convert the cleaned time_elapsed text of both o4-mini frames to float.
for o4_frame in (sf1_gpt_o4, sf2_gpt_o4):
    o4_frame['time_elapsed'] = o4_frame['time_elapsed'].astype(float)

In [None]:
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder

# ------------------ Dataset Setup ------------------ #

cat_cols = ['host_name', 'event_name', 'procname', 'process_details', 'kernel_details']

# One stacked frame per dataset size. Each source frame keeps its own row index
# (no ignore_index), so index values repeat in the result.
frames_1k = [rf1, sf1, sf1_CTGAN, sf1_LSTM, sf1_OpenAI, sf1_gpt_4, sf1_gpt_o4]
frames_10k = [rf2, sf2, sf2_CTGAN, sf2_LSTM, sf2_OpenAI, sf2_gpt_4, sf2_gpt_o4]
frames_100k = [rf3, sf3, sf3_CTGAN, sf3_LSTM]

rsf1_dataset = pd.concat(frames_1k)
rsf2_dataset = pd.concat(frames_10k)
rsf3_dataset = pd.concat(frames_100k)

In [None]:
# Display the stacked 10K frame (trace + all 10K synthetic frames).
rsf2_dataset

Unnamed: 0,timestamp,time_elapsed,host_name,event_name,cpu_id,procname,process_details,kernel_details
0,06:13:02.227912438,3.750000e-06,mendax,syscall_exit_accept,2.0,elasticsearchm,"pid = 11822, tid = 11859","ret = 576, upeer_sockaddr = 246916502706648, u..."
1,06:13:02.227914646,2.208000e-06,mendax,syscall_entry_fcntl,2.0,elasticsearchm,"pid = 11822, tid = 11859","fd = 576, cmd = 3, arg = 50"
2,06:13:02.227914980,3.340000e-07,mendax,syscall_exit_fcntl,2.0,elasticsearchm,"pid = 11822, tid = 11859","ret = 2, arg = 50"
3,06:13:02.227916313,1.333000e-06,mendax,syscall_entry_getsockname,2.0,elasticsearchm,"pid = 11822, tid = 11859","fd = 576, usockaddr_len = 246916502706368"
4,06:13:02.227916771,4.580000e-07,mendax,syscall_exit_getsockname,2.0,elasticsearchm,"pid = 11822, tid = 11859","ret = 0, usockaddr = 246916502706376, usockadd..."
...,...,...,...,...,...,...,...,...
9995,12:00:00.410000000,5.000000e-07,synthetic-host,syscall_exit_connect,1.0,wget,"pid = 5100, tid = 5103",ret = 0
9996,12:00:00.420000000,5.000000e-07,synthetic-host,syscall_entry_accept,2.0,bash,"pid = 5200, tid = 5200",fd = 7
9997,12:00:00.430000000,5.000000e-07,synthetic-host,syscall_exit_accept,3.0,python3,"pid = 5300, tid = 5301","ret = 8, upeer_sockaddr = 281474976711044, upe..."
9998,12:00:00.440000000,5.000000e-07,synthetic-host,syscall_entry_futex,4.0,java,"pid = 5400, tid = 5402","uaddr = 140737488352000, op = 0, val = 2, utim..."


In [None]:
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import copy

# ------------------ Column Setup ------------------ #
# Text columns that are normalised and label-encoded in the loop at the end of this cell.
cat_cols = ['host_name', 'event_name', 'procname', 'process_details', 'kernel_details']
# NOTE(review): num_cols is not referenced in the visible code of this cell — confirm it is still needed.
num_cols = ['timestamp', 'time_elapsed']

# ------------------ Normalization Function ------------------ #
def normalize_column(df, col):
    """Return df[col] as text with quotes removed, whitespace trimmed, lower-cased."""
    text = df[col].astype(str)
    for quote_char in ('"', "'"):
        text = text.str.replace(quote_char, '', regex=False)
    return text.str.strip().str.lower()

def normalize_dataframe(df, columns):
    """Normalise each listed column of df in place; returns nothing."""
    for column_name in columns:
        df[column_name] = normalize_column(df, column_name)

def convert_timestamp(df, timestamp_col='timestamp'):
    """Convert a timestamp column to float seconds since the Unix epoch.

    The column is overwritten on `df` itself; the returned frame is `df` without
    the rows whose timestamp could not be parsed.

    Args:
        df: DataFrame containing `timestamp_col`.
        timestamp_col: column to convert. Defaults to 'timestamp' so the function
            does not depend on a module-level name that is only assigned in a
            later cell (which fails on a fresh kernel).

    NOTE(review): the notebook output shows values around 1.75e9 for time-only
    strings, i.e. they appear to be anchored to the run date — confirm that
    absolute values depending on the execution day is intended.
    """
    parsed = pd.to_datetime(df[timestamp_col], errors='coerce')
    if getattr(parsed.dtype, "tz", None) is not None:
        # Express tz-aware values as naive UTC so the epoch subtraction is defined.
        parsed = parsed.dt.tz_convert("UTC").dt.tz_localize(None)
    # Timedelta division keeps NaT as NaN. An int64 cast would turn NaT into a
    # sentinel integer, so the dropna below would never remove anything.
    df[timestamp_col] = (parsed - pd.Timestamp("1970-01-01")) / pd.Timedelta(seconds=1)
    return df.dropna(subset=[timestamp_col])

def convert_time_columns(df):
    """Turn the 'timestamp' and 'time_elapsed' columns into numeric seconds.

    Object-dtype timestamps are parsed as 'HH:MM:SS.ffffff' and replaced by
    seconds since midnight. Text time_elapsed values go through
    pd.to_timedelta. Columns are overwritten on `df`; the returned frame omits
    rows where either column is missing after conversion.
    """
    if df['timestamp'].dtype == object:
        # Keep the first 15 characters ('HH:MM:SS.ffffff', microsecond precision).
        clipped = df['timestamp'].astype(str).str.slice(0, 15)
        clock = pd.to_datetime(clipped, format='%H:%M:%S.%f', errors='coerce')
        df['timestamp'] = (
            clock.dt.hour * 3600 +
            clock.dt.minute * 60 +
            clock.dt.second +
            clock.dt.microsecond / 1e6
        )

    elapsed_dtype = df['time_elapsed'].dtype
    if elapsed_dtype == object or elapsed_dtype == 'str':
        # NOTE(review): pandas appears to read unit-less numeric strings as
        # nanoseconds — confirm the text values here are not plain seconds.
        elapsed = pd.to_timedelta(df['time_elapsed'], errors='coerce')
        df['time_elapsed'] = elapsed.dt.total_seconds()

    return df.dropna(subset=['timestamp', 'time_elapsed'])

# ------------------ Combine and Deep Copy ------------------ #
_frames_by_size = {
    "rsf1": [rf1, sf1, sf1_CTGAN, sf1_LSTM, sf1_OpenAI, sf1_gpt_4, sf1_gpt_o4],
    "rsf2": [rf2, sf2, sf2_CTGAN, sf2_LSTM, sf2_OpenAI, sf2_gpt_4, sf2_gpt_o4],
    "rsf3": [rf3, sf3, sf3_CTGAN, sf3_LSTM],
}
rsf_datasets = {
    key: copy.deepcopy(pd.concat(frames, ignore_index=True))
    for key, frames in _frames_by_size.items()
}

# ------------------ Normalize, Encode, Scale & Save ------------------ #
for name, stored in rsf_datasets.items():
    # In place on the stored frame: quote / whitespace / case normalisation.
    normalize_dataframe(stored, cat_cols)
    # convert_timestamp overwrites the timestamp column on `stored` and returns a
    # filtered frame. The encoding below is applied to that returned frame only,
    # so the frames kept in rsf_datasets retain their text categories.
    df = convert_timestamp(stored)

    # Fit one encoder per categorical column.
    label_encoders = {col: LabelEncoder() for col in cat_cols}
    for col, le in label_encoders.items():
        df[col] = le.fit_transform(df[col])

    # Persist the fitted encoders for this dataset size.
    with open(f'label_encoders_{name}.pkl', 'wb') as f:
        pickle.dump(label_encoders, f)

    # NOTE(review): the message mentions a scaler, but only the label encoders
    # are pickled in this cell.
    print(f"✅ Encoders and scaler saved for {name}")

  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9


✅ Encoders and scaler saved for rsf1


  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9


✅ Encoders and scaler saved for rsf2


  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9


✅ Encoders and scaler saved for rsf3


In [None]:
# Round time_elapsed to 6 decimal places in every combined frame.
# NOTE(review): sub-microsecond values become 0.0 after this (visible in the
# output below) — confirm that loss of resolution is acceptable.
for size_key in ('rsf1', 'rsf2', 'rsf3'):
    rsf_datasets[size_key]['time_elapsed'] = rsf_datasets[size_key]['time_elapsed'].round(6)

In [None]:
# Display the dict of combined frames (keys 'rsf1', 'rsf2', 'rsf3').
rsf_datasets

{'rsf1':          timestamp  time_elapsed       host_name  \
 0     1.752387e+09      0.000004          mendax   
 1     1.752387e+09      0.000002          mendax   
 2     1.752387e+09      0.000000          mendax   
 3     1.752387e+09      0.000001          mendax   
 4     1.752387e+09      0.000000          mendax   
 ...            ...           ...             ...   
 6970  1.752414e+09      0.000001  synthetic-host   
 6971  1.752414e+09      0.000001  synthetic-host   
 6972  1.752414e+09      0.000001  synthetic-host   
 6973  1.752414e+09      0.000001  synthetic-host   
 6974  1.752414e+09      0.000001  synthetic-host   
 
                          event_name  cpu_id        procname  \
 0               syscall_exit_accept     2.0  elasticsearchm   
 1               syscall_entry_fcntl     2.0  elasticsearchm   
 2                syscall_exit_fcntl     2.0  elasticsearchm   
 3         syscall_entry_getsockname     2.0  elasticsearchm   
 4          syscall_exit_getsockna

In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import IsolationForest
from scipy.stats import wasserstein_distance

# Columns that are normalised and label-encoded before the fidelity analysis.
cat_cols = [
    'host_name',
    'event_name',
    'procname',
    'process_details',
    'kernel_details',
]
# Name of the column that is converted to numeric epoch seconds.
timestamp_col = 'timestamp'

def normalize_column(series):
    """Return ``series`` as cleaned-up strings.

    Every value is cast to ``str``, all double and single quote characters are
    removed, surrounding whitespace is stripped and the text is lower-cased.
    """
    text = series.astype(str)
    for quote_char in ('"', "'"):
        # Literal (non-regex) removal of the quote character.
        text = text.str.replace(quote_char, '', regex=False)
    trimmed = text.str.strip()
    return trimmed.str.lower()

def convert_timestamp(df, column=None):
    """Convert a timestamp column to float seconds since the Unix epoch.

    Values are parsed with ``pd.to_datetime(..., errors='coerce')`` and rows
    whose value cannot be parsed are dropped.

    The previous implementation cast the parsed column to ``int64`` *before*
    calling ``dropna``. ``NaT`` has no integer representation, so depending on
    the pandas version the cast either raised or produced a sentinel integer
    that ``dropna`` could never remove. Unparseable rows are now filtered out
    first and only valid timestamps are converted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the timestamp column. It is not modified.
    column : str, optional
        Column to convert. Defaults to the module-level ``timestamp_col``.

    Returns
    -------
    pandas.DataFrame
        A new frame (original index labels kept) without the unparseable rows
        and with ``column`` replaced by float epoch seconds.

    Notes
    -----
    NOTE(review): time-only strings such as ``"12:34:56.123456789"`` carry no
    date; the date the parser fills in appears to be the current day, which
    would make absolute values depend on when the notebook is run -- confirm.
    """
    column = timestamp_col if column is None else column
    parsed = pd.to_datetime(df[column], errors='coerce')

    # Drop NaT rows before any numeric conversion (see docstring).
    keep = parsed.notna()
    out = df.loc[keep].copy()
    valid = parsed[keep]

    # Subtracting the epoch and dividing by one second yields float seconds
    # regardless of the datetime resolution pandas chose while parsing.
    epoch = pd.Timestamp(0, tz=valid.dt.tz)
    out[column] = (valid - epoch) / pd.Timedelta(seconds=1)
    return out

def time_to_seconds(t):
    """Convert an ``HH:MM:SS[.fraction]`` string to seconds since midnight.

    The fractional part may have any number of digits (``.5``, ``.123456``,
    ``.123456789``) or be absent. The previous implementation divided the
    fractional digits by ``1e9`` unconditionally, which is only correct for
    exactly nine digits, and raised ``ValueError`` when there was no fraction.

    Parameters
    ----------
    t : str
        Time of day with exactly three colon-separated fields.

    Returns
    -------
    int or float
        Seconds since midnight (a float whenever a fractional part is handled).

    Raises
    ------
    ValueError
        If ``t`` does not have three colon-separated fields or a field is not
        numeric.
    """
    h, m, s = t.split(":")
    sec, _, frac = s.partition(".")
    # Scale the fraction by its own digit count instead of assuming nanoseconds.
    fraction = int(frac) / 10 ** len(frac) if frac else 0.0
    return int(h) * 3600 + int(m) * 60 + int(sec) + fraction

def safe_transform(le, series):
    """Encode ``series`` with ``le``, sending unseen values to ``'__unknown__'``.

    Values that are not among ``le.classes_`` are replaced by the placeholder
    ``'__unknown__'``. If the encoder does not know the placeholder yet it is
    added to ``le.classes_`` (the encoder object is modified in place and the
    class array is rebuilt with ``np.unique``, i.e. sorted and de-duplicated).
    The result of ``le.transform`` on the replaced values is returned.
    """
    placeholder = '__unknown__'
    seen = set(le.classes_)

    def _known_or_placeholder(value):
        if value in seen:
            return value
        return placeholder

    replaced = series.apply(_known_or_placeholder)

    if placeholder not in le.classes_:
        extended = np.append(le.classes_, placeholder)
        le.classes_ = np.unique(extended)

    return le.transform(replaced)

def range_completeness(real, synth):
    """Fraction of the real value range that the synthetic range overlaps.

    Computes the length of the intersection of ``[real.min(), real.max()]``
    and ``[synth.min(), synth.max()]`` divided by the length of the real
    range.

    When the real range is a single point (``min == max``) the ratio is
    undefined. The previous implementation returned ``1.0`` unconditionally in
    that case, reporting full coverage even when the synthetic data never
    reaches the real value. The result is now ``1.0`` only if the synthetic
    range contains that point, otherwise ``0.0``.

    Parameters
    ----------
    real, synth : objects providing ``.min()`` and ``.max()``
        For example pandas Series or NumPy arrays of numbers.

    Returns
    -------
    float
        Value between 0 and 1.
    """
    min_r, max_r = real.min(), real.max()
    min_s, max_s = synth.min(), synth.max()

    if max_r == min_r:
        # Degenerate real range: covered only if the synthetic range spans it.
        return 1.0 if min_s <= min_r <= max_s else 0.0

    intersection = max(0, min(max_r, max_s) - max(min_r, min_s))
    return intersection / (max_r - min_r)

def category_completeness(real, synth):
    """Share of the distinct values in ``real`` that also occur in ``synth``.

    Returns 0 when ``real`` contains no values.
    """
    real_values = set(real)
    if not real_values:
        return 0
    shared = real_values.intersection(synth)
    return len(shared) / len(real_values)

def run_fidelity(real_df, synthetic_df, encoder_path, scaler_path, title):
    """Print a fidelity report comparing a synthetic frame against a real one.

    Steps, in order: label-encode the columns in ``cat_cols`` with the pickled
    encoders, convert the timestamp column to numbers, print ``describe()``
    for both frames, per-column Wasserstein distances, category/range
    completeness, the accuracy of a logistic-regression real-vs-synthetic
    classifier, nearest-neighbour distances from synthetic rows to real rows,
    and the share of synthetic rows an IsolationForest fitted on the real rows
    scores below -0.1.

    Parameters
    ----------
    real_df, synthetic_df : pandas.DataFrame
        Frames to compare. Both are modified (columns are re-encoded and a
        ``label`` column is added), so pass copies.
    encoder_path : str
        Path of a pickle holding a dict ``{column name: fitted encoder}``.
    scaler_path : str
        Accepted for call compatibility; not used by this function.
    title : str
        Text shown in the report header and footer.

    Returns
    -------
    None
        Everything is reported via ``print``.
    """
    print(f"\n===== Fidelity Analysis: {title} =====")

    # SECURITY: pickle.load can execute arbitrary code; only load encoder
    # files produced by this notebook.
    with open(encoder_path, 'rb') as f:
        label_encoders = pickle.load(f)

    for col in cat_cols:
        le = label_encoders[col]
        real_df[col] = safe_transform(le, normalize_column(real_df[col]))
        synthetic_df[col] = safe_transform(le, normalize_column(synthetic_df[col]))

    real_df = convert_timestamp(real_df)
    synthetic_df = convert_timestamp(synthetic_df)

    print("\n--- Basic Statistical Comparison ---")
    print("Real:\n", real_df.describe())
    print("Synthetic:\n", synthetic_df.describe())

    print("\n--- Wasserstein Distances ---")
    # NOTE(review): the encoded categorical columns are numeric here, so they
    # are included and their distance is computed on label codes -- confirm
    # this is intended.
    for col in real_df.columns:
        if col in synthetic_df.columns and np.issubdtype(real_df[col].dtype, np.number):
            dist = wasserstein_distance(real_df[col], synthetic_df[col])
            print(f"{col}: {dist:.6f}")

    print("\n--- Category & Range Completeness ---")
    for col in real_df.columns:
        if col in synthetic_df.columns:
            # The cat_cols columns were label-encoded above and therefore have
            # a numeric dtype; a dtype-only test sent them to range
            # completeness and made the category branch unreachable for them.
            # Dispatch on cat_cols first so categories are compared as sets.
            is_categorical = col in cat_cols or not np.issubdtype(real_df[col].dtype, np.number)
            if is_categorical:
                cc = category_completeness(real_df[col], synthetic_df[col])
                print(f"Category Completeness for {col}: {cc:.2f}")
            else:
                rc = range_completeness(real_df[col], synthetic_df[col])
                print(f"Range Completeness for {col}: {rc:.2f}")

    print("\n--- Utility Test (Domain Classifier) ---")
    # Label 1 = real row, 0 = synthetic row; the classifier tries to tell them apart.
    real_df["label"] = 1
    synthetic_df["label"] = 0
    combined_df = pd.concat([real_df, synthetic_df])
    X = combined_df.drop("label", axis=1)
    y = combined_df["label"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
    clf = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)
    acc = accuracy_score(y_test, clf.predict(X_test))
    print(f"Domain Classification Accuracy: {acc:.4f}")

    print("\n--- Privacy Risk Test ---")
    # Distance from every synthetic row to its single closest real row.
    features = X.columns
    nbrs = NearestNeighbors(n_neighbors=1).fit(real_df[features])
    distances, _ = nbrs.kneighbors(synthetic_df[features])
    print(f"Minimum Distance to Real: {distances.min():.6f}")
    print(f"Average Distance to Real: {distances.mean():.6f}")

    print("\n--- Anomaly Detection ---")
    iso = IsolationForest(contamination=0.05, random_state=42)
    iso.fit(real_df[features])
    scores = iso.decision_function(synthetic_df[features])
    # NOTE(review): rows are counted as anomalous only below -0.1 rather than
    # below 0 -- confirm the threshold is deliberate.
    anomalies = (scores < -0.1).mean() * 100
    print(f"Anomalies in Synthetic Data: {anomalies:.2f}%")

    print(f"===== End of {title} =====\n\n")

# ------------------ Run for All rsf Variants ------------------ #

# group number -> (real frame, [(synthetic frame, title suffix), ...]).
# The group number also selects the encoder / scaler pickle file names.
_fidelity_groups = {
    # Group 1
    1: (rf1, [
        (sf1, "sf1"),
        (sf1_CTGAN, "sf1_CTGAN"),
        (sf1_LSTM, "sf1_LSTM"),
        (sf1_OpenAI, "sf1_OpenAI"),
        (sf1_gpt_4, "sf1_GPT_4.1-mini"),
        (sf1_gpt_o4, "sf1_GPT_o4-mini"),
    ]),
    # Group 2
    2: (rf2, [
        (sf2, "sf2"),
        (sf2_CTGAN, "sf2_CTGAN"),
        (sf2_LSTM, "sf2_LSTM"),
        (sf2_OpenAI, "sf2_OpenAI"),
        (sf2_gpt_4, "sf2_GPT_4.1-mini"),
        (sf2_gpt_o4, "sf2_GPT_o4-mini"),
    ]),
    # Group 3
    3: (rf3, [
        (sf3, "sf3"),
        (sf3_CTGAN, "sf3_CTGAN"),
        (sf3_LSTM, "sf3_LSTM"),
    ]),
}

for _group_no, (_real_frame, _variants) in _fidelity_groups.items():
    for _synth_frame, _suffix in _variants:
        # Fresh copies per run because run_fidelity modifies the frames it receives.
        run_fidelity(
            _real_frame.copy(),
            _synth_frame.copy(),
            f"label_encoders_rsf{_group_no}.pkl",
            f"scaler_rsf{_group_no}.pkl",
            f"rf{_group_no} vs {_suffix}",
        )


===== Fidelity Analysis: rf1 vs sf1 =====

--- Basic Statistical Comparison ---
Real:
           timestamp  time_elapsed  host_name  event_name      cpu_id  \
count  9.990000e+02  9.990000e+02      999.0  999.000000  999.000000   
mean   1.752387e+09  4.382683e-06        1.0  294.264264    0.645646   
std    1.320660e-03  2.468539e-05        0.0  156.569077    1.104989   
min    1.752387e+09  0.000000e+00        1.0   13.000000    0.000000   
25%    1.752387e+09  2.080000e-07        1.0  107.000000    0.000000   
50%    1.752387e+09  3.330000e-07        1.0  373.000000    0.000000   
75%    1.752387e+09  1.104500e-06        1.0  418.000000    1.000000   
max    1.752387e+09  5.419020e-04        1.0  487.000000    3.000000   

         procname  process_details  kernel_details  
count  999.000000       999.000000      999.000000  
mean   108.572573       318.725726     1328.974975  
std     40.351691       125.230196      670.418132  
min      8.000000        95.000000       52.000000 

  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9
  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9


Anomalies in Synthetic Data: 0.43%
===== End of rf1 vs sf1 =====



===== Fidelity Analysis: rf1 vs sf1_CTGAN =====

--- Basic Statistical Comparison ---
Real:
           timestamp  time_elapsed  host_name  event_name      cpu_id  \
count  9.990000e+02  9.990000e+02      999.0  999.000000  999.000000   
mean   1.752387e+09  4.382683e-06        1.0  294.264264    0.645646   
std    1.320660e-03  2.468539e-05        0.0  156.569077    1.104989   
min    1.752387e+09  0.000000e+00        1.0   13.000000    0.000000   
25%    1.752387e+09  2.080000e-07        1.0  107.000000    0.000000   
50%    1.752387e+09  3.330000e-07        1.0  373.000000    0.000000   
75%    1.752387e+09  1.104500e-06        1.0  418.000000    1.000000   
max    1.752387e+09  5.419020e-04        1.0  487.000000    3.000000   

         procname  process_details  kernel_details  
count  999.000000       999.000000      999.000000  
mean   108.572573       318.725726     1328.974975  
std     40.351691       125.230

  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9
  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9


Anomalies in Synthetic Data: 0.30%
===== End of rf1 vs sf1_CTGAN =====



===== Fidelity Analysis: rf1 vs sf1_LSTM =====

--- Basic Statistical Comparison ---
Real:
           timestamp  time_elapsed  host_name  event_name      cpu_id  \
count  9.990000e+02  9.990000e+02      999.0  999.000000  999.000000   
mean   1.752387e+09  4.382683e-06        1.0  294.264264    0.645646   
std    1.320660e-03  2.468539e-05        0.0  156.569077    1.104989   
min    1.752387e+09  0.000000e+00        1.0   13.000000    0.000000   
25%    1.752387e+09  2.080000e-07        1.0  107.000000    0.000000   
50%    1.752387e+09  3.330000e-07        1.0  373.000000    0.000000   
75%    1.752387e+09  1.104500e-06        1.0  418.000000    1.000000   
max    1.752387e+09  5.419020e-04        1.0  487.000000    3.000000   

         procname  process_details  kernel_details  
count  999.000000       999.000000      999.000000  
mean   108.572573       318.725726     1328.974975  
std     40.351691       12

  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9
  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9


Anomalies in Synthetic Data: 0.00%
===== End of rf1 vs sf1_LSTM =====



===== Fidelity Analysis: rf1 vs sf1_OpenAI =====

--- Basic Statistical Comparison ---
Real:
           timestamp  time_elapsed  host_name  event_name      cpu_id  \
count  9.990000e+02  9.990000e+02      999.0  999.000000  999.000000   
mean   1.752387e+09  4.382683e-06        1.0  294.264264    0.645646   
std    1.320660e-03  2.468539e-05        0.0  156.569077    1.104989   
min    1.752387e+09  0.000000e+00        1.0   13.000000    0.000000   
25%    1.752387e+09  2.080000e-07        1.0  107.000000    0.000000   
50%    1.752387e+09  3.330000e-07        1.0  373.000000    0.000000   
75%    1.752387e+09  1.104500e-06        1.0  418.000000    1.000000   
max    1.752387e+09  5.419020e-04        1.0  487.000000    3.000000   

         procname  process_details  kernel_details  
count  999.000000       999.000000      999.000000  
mean   108.572573       318.725726     1328.974975  
std     40.351691       1

  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9
  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9


          timestamp  time_elapsed  host_name   event_name  cpu_id  \
count  1.000000e+03  1.000000e+03     1000.0  1000.000000  1000.0   
mean   1.752387e+09  2.441600e-06        2.0   249.600000     1.0   
std    4.095481e-05  2.019155e-06        0.0   183.317881     0.0   
min    1.752387e+09  4.200000e-08        2.0    23.000000     1.0   
25%    1.752387e+09  5.420000e-07        2.0    43.000000     1.0   
50%    1.752387e+09  2.083000e-06        2.0   321.000000     1.0   
75%    1.752387e+09  4.666000e-06        2.0   390.000000     1.0   
max    1.752387e+09  4.875000e-06        2.0   471.000000     1.0   

       process_details  kernel_details  procname  
count      1000.000000     1000.000000    1000.0  
mean       1350.195000     1185.579000      23.0  
std         349.952473      904.816521       0.0  
min         329.000000      124.000000      23.0  
25%        1470.000000      203.000000      23.0  
50%        1470.000000     1331.000000      23.0  
75%        1470.00000

  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9
  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9


Anomalies in Synthetic Data: 0.10%
===== End of rf1 vs sf1_GPT_4.1-mini =====



===== Fidelity Analysis: rf1 vs sf1_GPT_o4-mini =====

--- Basic Statistical Comparison ---
Real:
           timestamp  time_elapsed  host_name  event_name      cpu_id  \
count  9.990000e+02  9.990000e+02      999.0  999.000000  999.000000   
mean   1.752387e+09  4.382683e-06        1.0  294.264264    0.645646   
std    1.320660e-03  2.468539e-05        0.0  156.569077    1.104989   
min    1.752387e+09  0.000000e+00        1.0   13.000000    0.000000   
25%    1.752387e+09  2.080000e-07        1.0  107.000000    0.000000   
50%    1.752387e+09  3.330000e-07        1.0  373.000000    0.000000   
75%    1.752387e+09  1.104500e-06        1.0  418.000000    1.000000   
max    1.752387e+09  5.419020e-04        1.0  487.000000    3.000000   

         procname  process_details  kernel_details  
count  999.000000       999.000000      999.000000  
mean   108.572573       318.725726     1328.974975  
std     40.3

  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9
  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9


Anomalies in Synthetic Data: 0.00%
===== End of rf1 vs sf1_GPT_o4-mini =====



===== Fidelity Analysis: rf2 vs sf2 =====


  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9
  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9



--- Basic Statistical Comparison ---
Real:
           timestamp  time_elapsed  host_name   event_name       cpu_id  \
count  9.999000e+03  9.999000e+03     9999.0  9999.000000  9999.000000   
mean   1.752387e+09  5.940469e-06        1.0   292.304830     0.967897   
std    1.833356e-02  5.493544e-05        0.0   169.266571     1.075593   
min    1.752387e+09  0.000000e+00        1.0     2.000000     0.000000   
25%    1.752387e+09  2.080000e-07        1.0   130.000000     0.000000   
50%    1.752387e+09  5.410000e-07        1.0   307.000000     1.000000   
75%    1.752387e+09  1.708000e-06        1.0   443.000000     2.000000   
max    1.752387e+09  2.517933e-03        1.0   589.000000     3.000000   

          procname  process_details  kernel_details  
count  9999.000000      9999.000000     9999.000000  
mean     99.310631       840.968397     9508.023302  
std      61.171774       444.573246     5221.525970  
min      12.000000       267.000000       13.000000  
25%      63.000000

  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9
  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9



--- Basic Statistical Comparison ---
Real:
           timestamp  time_elapsed  host_name   event_name       cpu_id  \
count  9.999000e+03  9.999000e+03     9999.0  9999.000000  9999.000000   
mean   1.752387e+09  5.940469e-06        1.0   292.304830     0.967897   
std    1.833356e-02  5.493544e-05        0.0   169.266571     1.075593   
min    1.752387e+09  0.000000e+00        1.0     2.000000     0.000000   
25%    1.752387e+09  2.080000e-07        1.0   130.000000     0.000000   
50%    1.752387e+09  5.410000e-07        1.0   307.000000     1.000000   
75%    1.752387e+09  1.708000e-06        1.0   443.000000     2.000000   
max    1.752387e+09  2.517933e-03        1.0   589.000000     3.000000   

          procname  process_details  kernel_details  
count  9999.000000      9999.000000     9999.000000  
mean     99.310631       840.968397     9508.023302  
std      61.171774       444.573246     5221.525970  
min      12.000000       267.000000       13.000000  
25%      63.000000

  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9
  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9



--- Basic Statistical Comparison ---
Real:
           timestamp  time_elapsed  host_name   event_name       cpu_id  \
count  9.999000e+03  9.999000e+03     9999.0  9999.000000  9999.000000   
mean   1.752387e+09  5.940469e-06        1.0   292.304830     0.967897   
std    1.833356e-02  5.493544e-05        0.0   169.266571     1.075593   
min    1.752387e+09  0.000000e+00        1.0     2.000000     0.000000   
25%    1.752387e+09  2.080000e-07        1.0   130.000000     0.000000   
50%    1.752387e+09  5.410000e-07        1.0   307.000000     1.000000   
75%    1.752387e+09  1.708000e-06        1.0   443.000000     2.000000   
max    1.752387e+09  2.517933e-03        1.0   589.000000     3.000000   

          procname  process_details  kernel_details  
count  9999.000000      9999.000000     9999.000000  
mean     99.310631       840.968397     9508.023302  
std      61.171774       444.573246     5221.525970  
min      12.000000       267.000000       13.000000  
25%      63.000000

  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9
  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9



--- Basic Statistical Comparison ---
Real:
           timestamp  time_elapsed  host_name   event_name       cpu_id  \
count  9.999000e+03  9.999000e+03     9999.0  9999.000000  9999.000000   
mean   1.752387e+09  5.940469e-06        1.0   292.304830     0.967897   
std    1.833356e-02  5.493544e-05        0.0   169.266571     1.075593   
min    1.752387e+09  0.000000e+00        1.0     2.000000     0.000000   
25%    1.752387e+09  2.080000e-07        1.0   130.000000     0.000000   
50%    1.752387e+09  5.410000e-07        1.0   307.000000     1.000000   
75%    1.752387e+09  1.708000e-06        1.0   443.000000     2.000000   
max    1.752387e+09  2.517933e-03        1.0   589.000000     3.000000   

          procname  process_details  kernel_details  
count  9999.000000      9999.000000     9999.000000  
mean     99.310631       840.968397     9508.023302  
std      61.171774       444.573246     5221.525970  
min      12.000000       267.000000       13.000000  
25%      63.000000

  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9
  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9



--- Basic Statistical Comparison ---
Real:
           timestamp  time_elapsed  host_name   event_name       cpu_id  \
count  9.999000e+03  9.999000e+03     9999.0  9999.000000  9999.000000   
mean   1.752387e+09  5.940469e-06        1.0   292.304830     0.967897   
std    1.833356e-02  5.493544e-05        0.0   169.266571     1.075593   
min    1.752387e+09  0.000000e+00        1.0     2.000000     0.000000   
25%    1.752387e+09  2.080000e-07        1.0   130.000000     0.000000   
50%    1.752387e+09  5.410000e-07        1.0   307.000000     1.000000   
75%    1.752387e+09  1.708000e-06        1.0   443.000000     2.000000   
max    1.752387e+09  2.517933e-03        1.0   589.000000     3.000000   

          procname  process_details  kernel_details  
count  9999.000000      9999.000000     9999.000000  
mean     99.310631       840.968397     9508.023302  
std      61.171774       444.573246     5221.525970  
min      12.000000       267.000000       13.000000  
25%      63.000000

  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9
  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9



--- Basic Statistical Comparison ---
Real:
           timestamp  time_elapsed  host_name   event_name       cpu_id  \
count  9.999000e+03  9.999000e+03     9999.0  9999.000000  9999.000000   
mean   1.752387e+09  5.940469e-06        1.0   292.304830     0.967897   
std    1.833356e-02  5.493544e-05        0.0   169.266571     1.075593   
min    1.752387e+09  0.000000e+00        1.0     2.000000     0.000000   
25%    1.752387e+09  2.080000e-07        1.0   130.000000     0.000000   
50%    1.752387e+09  5.410000e-07        1.0   307.000000     1.000000   
75%    1.752387e+09  1.708000e-06        1.0   443.000000     2.000000   
max    1.752387e+09  2.517933e-03        1.0   589.000000     3.000000   

          procname  process_details  kernel_details  
count  9999.000000      9999.000000     9999.000000  
mean     99.310631       840.968397     9508.023302  
std      61.171774       444.573246     5221.525970  
min      12.000000       267.000000       13.000000  
25%      63.000000

  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9
  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9



--- Basic Statistical Comparison ---
Real:
           timestamp  time_elapsed  host_name    event_name        cpu_id  \
count  9.999900e+04  9.999900e+04    99999.0  99999.000000  99999.000000   
mean   1.752387e+09  7.094670e-05        1.0     75.367344      1.455955   
std    2.515240e+00  6.499002e-04        0.0     44.189452      1.134885   
min    1.752387e+09  0.000000e+00        1.0      1.000000      0.000000   
25%    1.752387e+09  1.660000e-07        1.0     34.000000      0.000000   
50%    1.752387e+09  4.160000e-07        1.0     75.000000      1.000000   
75%    1.752387e+09  1.291000e-06        1.0    109.000000      3.000000   
max    1.752387e+09  2.487054e-02        1.0    159.000000      3.000000   

           procname  process_details  kernel_details  
count  99999.000000     99999.000000    99999.000000  
mean      37.947939       277.174742    18231.572006  
std       17.391438        74.407200     6474.721498  
min        1.000000         2.000000        2.0000

  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9
  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9



--- Basic Statistical Comparison ---
Real:
           timestamp  time_elapsed  host_name    event_name        cpu_id  \
count  9.999900e+04  9.999900e+04    99999.0  99999.000000  99999.000000   
mean   1.752387e+09  7.094670e-05        1.0     75.367344      1.455955   
std    2.515240e+00  6.499002e-04        0.0     44.189452      1.134885   
min    1.752387e+09  0.000000e+00        1.0      1.000000      0.000000   
25%    1.752387e+09  1.660000e-07        1.0     34.000000      0.000000   
50%    1.752387e+09  4.160000e-07        1.0     75.000000      1.000000   
75%    1.752387e+09  1.291000e-06        1.0    109.000000      3.000000   
max    1.752387e+09  2.487054e-02        1.0    159.000000      3.000000   

           procname  process_details  kernel_details  
count  99999.000000     99999.000000    99999.000000  
mean      37.947939       277.174742    18231.572006  
std       17.391438        74.407200     6474.721498  
min        1.000000         2.000000        2.0000

  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9
  df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce').astype(np.int64) / 1e9



--- Basic Statistical Comparison ---
Real:
           timestamp  time_elapsed  host_name    event_name        cpu_id  \
count  9.999900e+04  9.999900e+04    99999.0  99999.000000  99999.000000   
mean   1.752387e+09  7.094670e-05        1.0     75.367344      1.455955   
std    2.515240e+00  6.499002e-04        0.0     44.189452      1.134885   
min    1.752387e+09  0.000000e+00        1.0      1.000000      0.000000   
25%    1.752387e+09  1.660000e-07        1.0     34.000000      0.000000   
50%    1.752387e+09  4.160000e-07        1.0     75.000000      1.000000   
75%    1.752387e+09  1.291000e-06        1.0    109.000000      3.000000   
max    1.752387e+09  2.487054e-02        1.0    159.000000      3.000000   

           procname  process_details  kernel_details  
count  99999.000000     99999.000000    99999.000000  
mean      37.947939       277.174742    18231.572006  
std       17.391438        74.407200     6474.721498  
min        1.000000         2.000000        2.0000