In [None]:
import re

def parse_log_line(line: str) -> dict:
    """Turn one raw trace line into a single-key ``{"example": <text>}`` dict.

    The fields are located positionally / by delimiter:

    * timestamp      -- text inside the first ``[...]`` pair
    * time elapsed   -- text inside the first ``(...)`` pair
    * host name      -- third whitespace-separated token
    * event name     -- fourth whitespace-separated token, trailing ``:`` removed
    * cpu id         -- ``cpu_id = N`` inside the first ``{...}`` group ("0" if absent)
    * process / kernel details -- contents of the second / third ``{...}`` group
      with single and double quotes removed

    Missing pieces fall back to ``""`` (``"0"`` for the cpu id); nothing raises
    on a short or malformed line.

    Limitation: ``{...}`` groups are matched non-greedily, so a group that
    itself contains nested braces is cut at the first closing brace.

    Args:
        line: One line of trace text.

    Returns:
        ``{"example": text}`` where ``text`` is the flattened, comma-separated
        summary with every ``{`` / ``}`` doubled so it can be embedded in a
        format-style prompt template.
    """

    def _unquote(group: str) -> str:
        # Quotes around string values only add noise to the prompt example.
        return group.replace('"', '').replace("'", "").strip()

    # Extract timestamp
    timestamp_match = re.search(r"\[(.*?)\]", line)
    timestamp = timestamp_match.group(1) if timestamp_match else ""

    # Extract time elapsed. The raw value already carries its own leading "+"
    # (e.g. "+0.000003750"); drop it here because the output format below
    # supplies the sign itself -- otherwise the result reads "(++0.000003750)".
    time_elapsed_match = re.search(r"\((.*?)\)", line)
    time_elapsed = time_elapsed_match.group(1).lstrip("+") if time_elapsed_match else ""

    # Extract host name and event name
    parts = line.split()
    host_name = parts[2] if len(parts) > 2 else ""
    event_name = parts[3].rstrip(":") if len(parts) > 3 else ""

    # Extract all {...} blocks (non-greedy: each stops at the first "}")
    curly_groups = re.findall(r"\{(.*?)\}", line)

    # Extract CPU ID from first group (if present)
    cpu_id = "0"
    if len(curly_groups) >= 1:
        cpu_match = re.search(r"cpu_id\s*=\s*(\d+)", curly_groups[0])
        if cpu_match:
            cpu_id = cpu_match.group(1)

    # Clean and assign process/kernel details (remove quotes)
    process_details = _unquote(curly_groups[1]) if len(curly_groups) >= 2 else ""
    kernel_details = _unquote(curly_groups[2]) if len(curly_groups) >= 3 else ""

    # Build formatted example string
    formatted = (
        f"timestamp:{timestamp}, time elapsed: (+{time_elapsed}), Host Name: {host_name}, "
        f"Event Name: {event_name}, Cpu Id: {cpu_id}, process_details: {process_details}, "
        f"kernel_details: {kernel_details}"
    )

    # Escape all braces for safe PromptTemplate use
    escaped_formatted = formatted.replace("{", "{{").replace("}", "}}")

    return {"example": escaped_formatted}

In [None]:
# Parse every non-blank line of the trace file into an {"example": ...} record.
logs = []
with open("kernel_trace_1K.txt", "r") as file:
    for line in file:
        stripped = line.strip()
        if not stripped:
            continue  # nothing to parse on a blank / whitespace-only line
        try:
            parsed = parse_log_line(stripped)
            logs.append(parsed)
        except Exception as e:
            # Report the offending raw line and carry on with the rest of the file.
            print(f"Error parsing line: {line}\n{e}")

In [None]:
def extract_unique_event_examples(examples):
    """Keep the first example seen for each distinct event name.

    The event name is the text that follows the first ``"Event Name:"`` marker
    in ``item["example"]``, up to the next comma, with surrounding whitespace
    removed. Items without that marker (or without an ``"example"`` key) are
    dropped.

    Args:
        examples: Iterable of dicts, each normally holding an ``"example"`` string.

    Returns:
        A new list containing the retained dicts (same objects, original order).
    """
    marker = "Event Name:"
    first_by_event = {}  # insertion-ordered: event name -> first item carrying it

    for record in examples:
        pieces = record.get("example", "").split(marker)
        if len(pieces) < 2:
            # Marker not present: treat as malformed and skip.
            continue

        name = pieces[1].split(",")[0].strip()
        first_by_event.setdefault(name, record)

    return list(first_by_event.values())

In [None]:
# Keep only the first parsed example seen for each distinct event name.
filtered_examples = extract_unique_event_examples(logs)

# Report how many examples were kept, then print each retained example dict in full.
print(f"Total unique events: {len(filtered_examples)}")
for ex in filtered_examples:
    print(ex)

Total unique events: 48
{'example': 'timestamp:06:13:02.227908688, time elapsed: (++?.?????????), Host Name: mendax, Event Name: syscall_entry_accept, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: fd = 553, upeer_addrlen = 246916502706640'}
{'example': 'timestamp:06:13:02.227912438, time elapsed: (++0.000003750), Host Name: mendax, Event Name: syscall_exit_accept, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: ret = 576, upeer_sockaddr = 246916502706648, upeer_addrlen = 246916502706640'}
{'example': 'timestamp:06:13:02.227914646, time elapsed: (++0.000002208), Host Name: mendax, Event Name: syscall_entry_fcntl, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: fd = 576, cmd = 3, arg = 50'}
{'example': 'timestamp:06:13:02.227914980, time elapsed: (++0.000000334), Host Name: mendax, Event Name: syscall_exit_fcntl, Cpu Id: 2, process_detail

In [None]:
def escape_curly_braces(examples):
    """Return new example dicts whose text has every curly brace doubled.

    Doubling (``{`` -> ``{{``, ``}`` -> ``}}``) is the escape form used by
    format-style templates. The operation is idempotent: braces that are
    already doubled are treated as escaped and left as they are, so text that
    was escaped earlier is not escaped a second time.

    Args:
        examples: Iterable of dicts, each with an ``"example"`` string.

    Returns:
        A new list of ``{"example": escaped_text}`` dicts; the inputs are not
        modified.

    Raises:
        KeyError: If an item has no ``"example"`` key.
    """

    def _escape(text):
        # Collapse already-doubled braces back to single ones first, so the
        # doubling below yields exactly one level of escaping either way.
        raw = text.replace("{{", "{").replace("}}", "}")
        return raw.replace("{", "{{").replace("}", "}}")

    return [{"example": _escape(ex["example"])} for ex in examples]

# Rebind filtered_examples to the new list of dicts returned by escape_curly_braces.
filtered_examples = escape_curly_braces(filtered_examples)

In [None]:
import re

def remove_high_risk_examples(examples):
    """Filter out examples whose text contains a pattern flagged as problematic.

    An example is dropped when its ``"example"`` string has any of:

    * a ``{`` followed later on the same line by a ``}``
    * an indexed assignment such as ``[0] =``
    * the substring ``"fds ="``
    * both ``"events ="`` and ``"POLL"``

    Args:
        examples: Iterable of dicts, each with an ``"example"`` string.

    Returns:
        A new list holding the dicts that matched none of the patterns, in
        their original order.
    """

    def _is_high_risk(text):
        # Brace pair still present in the text
        if re.search(r"\{.*?\}", text):
            return True
        # Patterns like [0] =
        if re.search(r"\[\s*\d+\s*\]\s*=", text):
            return True
        # Explicit fds structures
        if "fds =" in text:
            return True
        # POLLIN/POLLOUT structures
        return "events =" in text and "POLL" in text

    return [entry for entry in examples if not _is_high_risk(entry["example"])]

In [None]:
# Drop the examples that remove_high_risk_examples flags, then report how many are left.
filtered_examples = remove_high_risk_examples(filtered_examples)
print(f"Remaining safe examples: {len(filtered_examples)}")

Remaining safe examples: 45


In [None]:
# Discard the first remaining example.
# NOTE(review): this rebinding is not idempotent -- each re-run of the cell drops one more
# element. Presumably the target is the first trace record, whose elapsed time printed as
# "?.?????????" in the output above -- confirm.
filtered_examples = filtered_examples[1:]

In [None]:
# Keep only the examples whose text does not stop right after the "kernel_details:" label
# (trailing whitespace ignored).
filtered_examples = [
    entry
    for entry in filtered_examples
    if not entry["example"].rstrip().endswith("kernel_details:")
]

In [None]:
# Pull the bare example strings out of the dicts and print them one per line.
log_lines = [ex['example'] for ex in filtered_examples]
for line in log_lines:
    print(line)

timestamp:06:13:02.227912438, time elapsed: (++0.000003750), Host Name: mendax, Event Name: syscall_exit_accept, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: ret = 576, upeer_sockaddr = 246916502706648, upeer_addrlen = 246916502706640
timestamp:06:13:02.227914646, time elapsed: (++0.000002208), Host Name: mendax, Event Name: syscall_entry_fcntl, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: fd = 576, cmd = 3, arg = 50
timestamp:06:13:02.227914980, time elapsed: (++0.000000334), Host Name: mendax, Event Name: syscall_exit_fcntl, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: ret = 2, arg = 50
timestamp:06:13:02.227916313, time elapsed: (++0.000001333), Host Name: mendax, Event Name: syscall_entry_getsockname, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: fd = 576, usockaddr_len = 24

In [None]:
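# Examples extracted from the 1K trace sample dataset.
# NOTE: the 10K cell below assigns the same name, so under Run All the 10K examples replace these.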
example_logs = """Examples:
timestamp:06:13:02.227912438, time elapsed: (++0.000003750), Host Name: mendax, Event Name: syscall_exit_accept, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: ret = 576, upeer_sockaddr = 246916502706648, upeer_addrlen = 246916502706640
timestamp:06:13:02.227914646, time elapsed: (++0.000002208), Host Name: mendax, Event Name: syscall_entry_fcntl, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: fd = 576, cmd = 3, arg = 50
timestamp:06:13:02.227914980, time elapsed: (++0.000000334), Host Name: mendax, Event Name: syscall_exit_fcntl, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: ret = 2, arg = 50
timestamp:06:13:02.227916313, time elapsed: (++0.000001333), Host Name: mendax, Event Name: syscall_entry_getsockname, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: fd = 576, usockaddr_len = 246916502706368
timestamp:06:13:02.227916771, time elapsed: (++0.000000458), Host Name: mendax, Event Name: syscall_exit_getsockname, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: ret = 0, usockaddr = 246916502706376, usockaddr_len = 246916502706368
timestamp:06:13:02.227927646, time elapsed: (++0.000003292), Host Name: mendax, Event Name: syscall_entry_setsockopt, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: fd = 576, level = 6, optname = 1, optval = 246916502705868, optlen = 4
timestamp:06:13:02.227928063, time elapsed: (++0.000000417), Host Name: mendax, Event Name: syscall_exit_setsockopt, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: ret = 0
timestamp:06:13:02.227928771, time elapsed: (++0.000000708), Host Name: mendax, Event Name: syscall_entry_getsockopt, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: fd = 576, level = 1, optname = 7, optlen = 246916502705996
timestamp:06:13:02.227929146, time elapsed: (++0.000000375), Host Name: mendax, Event Name: syscall_exit_getsockopt, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: ret = 0, optval = 246916502706080, optlen = 246916502705996
timestamp:06:13:02.227938146, time elapsed: (++0.000002417), Host Name: mendax, Event Name: syscall_entry_write, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: fd = 549, buf = 246916502706248, count = 8
timestamp:06:13:02.227940521, time elapsed: (++0.000002375), Host Name: mendax, Event Name: syscall_exit_write, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: ret = 8
timestamp:06:13:02.227944020, time elapsed: (++0.000003499), Host Name: mendax, Event Name: syscall_entry_epoll_pwait, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: epfd = 542, maxevents = 1024, timeout = -1, sigmask = 0x0, sigsetsize = 8
timestamp:06:13:02.227946729, time elapsed: (++0.000002709), Host Name: mendax, Event Name: syscall_exit_connect, Cpu Id: 2, process_details: procname = curl, pid = 14345, tid = 14345, kernel_details: ret = -115
timestamp:06:13:02.227949604, time elapsed: (++0.000001542), Host Name: mendax, Event Name: syscall_entry_rt_sigaction, Cpu Id: 2, process_details: procname = curl, pid = 14345, tid = 14345, kernel_details: sig = 13, act = 281474913773656, sigsetsize = 8
timestamp:06:13:02.227949979, time elapsed: (++0.000000375), Host Name: mendax, Event Name: syscall_exit_rt_sigaction, Cpu Id: 2, process_details: procname = curl, pid = 14345, tid = 14345, kernel_details: ret = 0, oact = 0
timestamp:06:13:02.227959145, time elapsed: (++0.000003125), Host Name: mendax, Event Name: syscall_entry_futex, Cpu Id: 2, process_details: procname = curl, pid = 14345, tid = 14345, kernel_details: uaddr = 267960194247760, op = 129, val = 2147483647, utime = 0, uaddr2 = 203053665022808, val3 = 496169800
timestamp:06:13:02.227961395, time elapsed: (++0.000002250), Host Name: mendax, Event Name: syscall_exit_futex, Cpu Id: 2, process_details: procname = curl, pid = 14345, tid = 14345, kernel_details: ret = 0, uaddr = 267960194247760, uaddr2 = 203053665022808
timestamp:06:13:02.228005269, time elapsed: (++0.000001375), Host Name: mendax, Event Name: syscall_exit_getpid, Cpu Id: 3, process_details: procname = curl, pid = 14344, tid = 14344, kernel_details: ret = 14344
timestamp:06:13:02.228012394, time elapsed: (++0.000007125), Host Name: mendax, Event Name: syscall_entry_brk, Cpu Id: 2, process_details: procname = curl, pid = 14345, tid = 14345, kernel_details: brk = 203053665202176
timestamp:06:13:02.228013435, time elapsed: (++0.000001041), Host Name: mendax, Event Name: syscall_exit_brk, Cpu Id: 2, process_details: procname = curl, pid = 14345, tid = 14345, kernel_details: ret = 203053665202176
timestamp:06:13:02.228181389, time elapsed: (++0.000006833), Host Name: mendax, Event Name: syscall_entry_getrandom, Cpu Id: 3, process_details: procname = curl, pid = 14344, tid = 14344, kernel_details: buf = 222501009261744, count = 48, flags = 0
timestamp:06:13:02.228182306, time elapsed: (++0.000000917), Host Name: mendax, Event Name: syscall_exit_getrandom, Cpu Id: 3, process_details: procname = curl, pid = 14344, tid = 14344, kernel_details: ret = 48, buf = 222501009261744, count = 48, flags = 0
timestamp:06:13:02.228694417, time elapsed: (++0.000019583), Host Name: mendax, Event Name: syscall_entry_munmap, Cpu Id: 3, process_details: procname = gnome-shell, pid = 3154, tid = 3154, kernel_details: addr = 248270898528256, len = 7602176
timestamp:06:13:02.228704875, time elapsed: (++0.000010458), Host Name: mendax, Event Name: syscall_exit_munmap, Cpu Id: 3, process_details: procname = gnome-shell, pid = 3154, tid = 3154, kernel_details: ret = 0
timestamp:06:13:02.228708583, time elapsed: (++0.000003708), Host Name: mendax, Event Name: syscall_entry_sendto, Cpu Id: 2, process_details: procname = curl, pid = 14345, tid = 14345, kernel_details: fd = 4, buff = 203053665469200, len = 517, flags = 16384, addr = 0, addr_len = 0
timestamp:06:13:02.228715167, time elapsed: (++0.000006584), Host Name: mendax, Event Name: syscall_exit_sendto, Cpu Id: 2, process_details: procname = curl, pid = 14345, tid = 14345, kernel_details: ret = 517
timestamp:06:13:02.228717458, time elapsed: (++0.000002291), Host Name: mendax, Event Name: syscall_entry_recvfrom, Cpu Id: 2, process_details: procname = curl, pid = 14345, tid = 14345, kernel_details: fd = 4, size = 5, flags = 0, addr_len = 0
timestamp:06:13:02.228718250, time elapsed: (++0.000000792), Host Name: mendax, Event Name: syscall_exit_recvfrom, Cpu Id: 2, process_details: procname = curl, pid = 14345, tid = 14345, kernel_details: ret = -11, ubuf = 203053665491603, addr = 0, addr_len = 0
timestamp:06:13:02.228729249, time elapsed: (++0.000002291), Host Name: mendax, Event Name: syscall_entry_read, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11867, kernel_details: fd = 547, count = 16
timestamp:06:13:02.228730083, time elapsed: (++0.000000834), Host Name: mendax, Event Name: syscall_exit_read, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11867, kernel_details: ret = 8, buf = 246916494319000
timestamp:06:13:02.228911745, time elapsed: (++0.000001292), Host Name: mendax, Event Name: syscall_entry_epoll_ctl, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11867, kernel_details: epfd = 546, op_enum = ( EPOLL_CTL_ADD : container = 1 ), fd = 573, event =  data_union =  u64 = 0x70000023D, fd = 573
timestamp:06:13:02.228913536, time elapsed: (++0.000000916), Host Name: mendax, Event Name: syscall_exit_epoll_ctl, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11867, kernel_details: ret = 0
timestamp:06:13:02.228922411, time elapsed: (++0.000000708), Host Name: mendax, Event Name: syscall_entry_ioctl, Cpu Id: 3, process_details: procname = llvmpipe-0, pid = 3154, tid = 3213, kernel_details: fd = 13, cmd = 3222299827, arg = 248272618772584
timestamp:06:13:02.228925369, time elapsed: (++0.000000000), Host Name: mendax, Event Name: syscall_exit_ioctl, Cpu Id: 3, process_details: procname = llvmpipe-0, pid = 3154, tid = 3213, kernel_details: ret = 0, arg = 248272618772584
timestamp:06:13:02.229356066, time elapsed: (++0.000191495), Host Name: mendax, Event Name: syscall_entry_close, Cpu Id: 0, process_details: procname = lttng-runas, pid = 14324, tid = 14324, kernel_details: fd = 4
timestamp:06:13:02.229357316, time elapsed: (++0.000001250), Host Name: mendax, Event Name: syscall_exit_close, Cpu Id: 0, process_details: procname = lttng-runas, pid = 14324, tid = 14324, kernel_details: ret = 0
timestamp:06:13:02.229359066, time elapsed: (++0.000001750), Host Name: mendax, Event Name: syscall_entry_recvmsg, Cpu Id: 0, process_details: procname = lttng-runas, pid = 14324, tid = 14324, kernel_details: fd = 6, msg = 281474329849072, flags = 16384
timestamp:06:13:02.229393523, time elapsed: (++0.000000833), Host Name: mendax, Event Name: syscall_entry_sync_file_range, Cpu Id: 0, process_details: procname = lttng-consumerd, pid = 14323, tid = 14330, kernel_details: fd = 35, offset = 0, nbytes = 4096, flags = 2
timestamp:06:13:02.229429022, time elapsed: (++0.000035499), Host Name: mendax, Event Name: syscall_exit_sync_file_range, Cpu Id: 0, process_details: procname = lttng-consumerd, pid = 14323, tid = 14330, kernel_details: ret = 0
timestamp:06:13:02.229865552, time elapsed: (++0.000001458), Host Name: mendax, Event Name: syscall_entry_fadvise64_64, Cpu Id: 3, process_details: procname = lttng-consumerd, pid = 14323, tid = 14330, kernel_details: fd = 35, offset = 0, len = 4096, advice = 4
timestamp:06:13:02.229868636, time elapsed: (++0.000003084), Host Name: mendax, Event Name: syscall_exit_fadvise64_64, Cpu Id: 3, process_details: procname = lttng-consumerd, pid = 14323, tid = 14330, kernel_details: ret = 0, fd = 35, offset = 0, len = 4096, advice = 4
timestamp:06:13:02.229904385, time elapsed: (++0.000001125), Host Name: mendax, Event Name: syscall_entry_timerfd_settime, Cpu Id: 1, process_details: procname = data-loop.0, pid = 2932, tid = 2972, kernel_details: ufd = 67, flags = 1, utmr = 281458146600112
timestamp:06:13:02.229905385, time elapsed: (++0.000001000), Host Name: mendax, Event Name: syscall_exit_timerfd_settime, Cpu Id: 1, process_details: procname = data-loop.0, pid = 2932, tid = 2972, kernel_details: ret = 0, otmr = 0
"""

In [None]:
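# Examples extracted from the 10K trace sample dataset.
# NOTE: this rebinds example_logs (also assigned in the 1K cell above); the list begins with the same entries as the 1K examples.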
example_logs = """Examples:
timestamp:06:13:02.227912438, time elapsed: (++0.000003750), Host Name: mendax, Event Name: syscall_exit_accept, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: ret = 576, upeer_sockaddr = 246916502706648, upeer_addrlen = 246916502706640
timestamp:06:13:02.227914646, time elapsed: (++0.000002208), Host Name: mendax, Event Name: syscall_entry_fcntl, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: fd = 576, cmd = 3, arg = 50
timestamp:06:13:02.227914980, time elapsed: (++0.000000334), Host Name: mendax, Event Name: syscall_exit_fcntl, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: ret = 2, arg = 50
timestamp:06:13:02.227916313, time elapsed: (++0.000001333), Host Name: mendax, Event Name: syscall_entry_getsockname, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: fd = 576, usockaddr_len = 246916502706368
timestamp:06:13:02.227916771, time elapsed: (++0.000000458), Host Name: mendax, Event Name: syscall_exit_getsockname, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: ret = 0, usockaddr = 246916502706376, usockaddr_len = 246916502706368
timestamp:06:13:02.227927646, time elapsed: (++0.000003292), Host Name: mendax, Event Name: syscall_entry_setsockopt, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: fd = 576, level = 6, optname = 1, optval = 246916502705868, optlen = 4
timestamp:06:13:02.227928063, time elapsed: (++0.000000417), Host Name: mendax, Event Name: syscall_exit_setsockopt, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: ret = 0
timestamp:06:13:02.227928771, time elapsed: (++0.000000708), Host Name: mendax, Event Name: syscall_entry_getsockopt, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: fd = 576, level = 1, optname = 7, optlen = 246916502705996
timestamp:06:13:02.227929146, time elapsed: (++0.000000375), Host Name: mendax, Event Name: syscall_exit_getsockopt, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: ret = 0, optval = 246916502706080, optlen = 246916502705996
timestamp:06:13:02.227938146, time elapsed: (++0.000002417), Host Name: mendax, Event Name: syscall_entry_write, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: fd = 549, buf = 246916502706248, count = 8
timestamp:06:13:02.227940521, time elapsed: (++0.000002375), Host Name: mendax, Event Name: syscall_exit_write, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: ret = 8
timestamp:06:13:02.227944020, time elapsed: (++0.000003499), Host Name: mendax, Event Name: syscall_entry_epoll_pwait, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: epfd = 542, maxevents = 1024, timeout = -1, sigmask = 0x0, sigsetsize = 8
timestamp:06:13:02.227946729, time elapsed: (++0.000002709), Host Name: mendax, Event Name: syscall_exit_connect, Cpu Id: 2, process_details: procname = curl, pid = 14345, tid = 14345, kernel_details: ret = -115
timestamp:06:13:02.227949604, time elapsed: (++0.000001542), Host Name: mendax, Event Name: syscall_entry_rt_sigaction, Cpu Id: 2, process_details: procname = curl, pid = 14345, tid = 14345, kernel_details: sig = 13, act = 281474913773656, sigsetsize = 8
timestamp:06:13:02.227949979, time elapsed: (++0.000000375), Host Name: mendax, Event Name: syscall_exit_rt_sigaction, Cpu Id: 2, process_details: procname = curl, pid = 14345, tid = 14345, kernel_details: ret = 0, oact = 0
timestamp:06:13:02.227959145, time elapsed: (++0.000003125), Host Name: mendax, Event Name: syscall_entry_futex, Cpu Id: 2, process_details: procname = curl, pid = 14345, tid = 14345, kernel_details: uaddr = 267960194247760, op = 129, val = 2147483647, utime = 0, uaddr2 = 203053665022808, val3 = 496169800
timestamp:06:13:02.227961395, time elapsed: (++0.000002250), Host Name: mendax, Event Name: syscall_exit_futex, Cpu Id: 2, process_details: procname = curl, pid = 14345, tid = 14345, kernel_details: ret = 0, uaddr = 267960194247760, uaddr2 = 203053665022808
timestamp:06:13:02.228005269, time elapsed: (++0.000001375), Host Name: mendax, Event Name: syscall_exit_getpid, Cpu Id: 3, process_details: procname = curl, pid = 14344, tid = 14344, kernel_details: ret = 14344
timestamp:06:13:02.228012394, time elapsed: (++0.000007125), Host Name: mendax, Event Name: syscall_entry_brk, Cpu Id: 2, process_details: procname = curl, pid = 14345, tid = 14345, kernel_details: brk = 203053665202176
timestamp:06:13:02.228013435, time elapsed: (++0.000001041), Host Name: mendax, Event Name: syscall_exit_brk, Cpu Id: 2, process_details: procname = curl, pid = 14345, tid = 14345, kernel_details: ret = 203053665202176
timestamp:06:13:02.228181389, time elapsed: (++0.000006833), Host Name: mendax, Event Name: syscall_entry_getrandom, Cpu Id: 3, process_details: procname = curl, pid = 14344, tid = 14344, kernel_details: buf = 222501009261744, count = 48, flags = 0
timestamp:06:13:02.228182306, time elapsed: (++0.000000917), Host Name: mendax, Event Name: syscall_exit_getrandom, Cpu Id: 3, process_details: procname = curl, pid = 14344, tid = 14344, kernel_details: ret = 48, buf = 222501009261744, count = 48, flags = 0
timestamp:06:13:02.228694417, time elapsed: (++0.000019583), Host Name: mendax, Event Name: syscall_entry_munmap, Cpu Id: 3, process_details: procname = gnome-shell, pid = 3154, tid = 3154, kernel_details: addr = 248270898528256, len = 7602176
timestamp:06:13:02.228704875, time elapsed: (++0.000010458), Host Name: mendax, Event Name: syscall_exit_munmap, Cpu Id: 3, process_details: procname = gnome-shell, pid = 3154, tid = 3154, kernel_details: ret = 0
timestamp:06:13:02.228708583, time elapsed: (++0.000003708), Host Name: mendax, Event Name: syscall_entry_sendto, Cpu Id: 2, process_details: procname = curl, pid = 14345, tid = 14345, kernel_details: fd = 4, buff = 203053665469200, len = 517, flags = 16384, addr = 0, addr_len = 0
timestamp:06:13:02.228715167, time elapsed: (++0.000006584), Host Name: mendax, Event Name: syscall_exit_sendto, Cpu Id: 2, process_details: procname = curl, pid = 14345, tid = 14345, kernel_details: ret = 517
timestamp:06:13:02.228717458, time elapsed: (++0.000002291), Host Name: mendax, Event Name: syscall_entry_recvfrom, Cpu Id: 2, process_details: procname = curl, pid = 14345, tid = 14345, kernel_details: fd = 4, size = 5, flags = 0, addr_len = 0
timestamp:06:13:02.228718250, time elapsed: (++0.000000792), Host Name: mendax, Event Name: syscall_exit_recvfrom, Cpu Id: 2, process_details: procname = curl, pid = 14345, tid = 14345, kernel_details: ret = -11, ubuf = 203053665491603, addr = 0, addr_len = 0
timestamp:06:13:02.228729249, time elapsed: (++0.000002291), Host Name: mendax, Event Name: syscall_entry_read, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11867, kernel_details: fd = 547, count = 16
timestamp:06:13:02.228730083, time elapsed: (++0.000000834), Host Name: mendax, Event Name: syscall_exit_read, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11867, kernel_details: ret = 8, buf = 246916494319000
timestamp:06:13:02.228911745, time elapsed: (++0.000001292), Host Name: mendax, Event Name: syscall_entry_epoll_ctl, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11867, kernel_details: epfd = 546, op_enum = ( EPOLL_CTL_ADD : container = 1 ), fd = 573, event =  data_union =  u64 = 0x70000023D, fd = 573
timestamp:06:13:02.228913536, time elapsed: (++0.000000916), Host Name: mendax, Event Name: syscall_exit_epoll_ctl, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11867, kernel_details: ret = 0
timestamp:06:13:02.228922411, time elapsed: (++0.000000708), Host Name: mendax, Event Name: syscall_entry_ioctl, Cpu Id: 3, process_details: procname = llvmpipe-0, pid = 3154, tid = 3213, kernel_details: fd = 13, cmd = 3222299827, arg = 248272618772584
timestamp:06:13:02.228925369, time elapsed: (++0.000000000), Host Name: mendax, Event Name: syscall_exit_ioctl, Cpu Id: 3, process_details: procname = llvmpipe-0, pid = 3154, tid = 3213, kernel_details: ret = 0, arg = 248272618772584
timestamp:06:13:02.229356066, time elapsed: (++0.000191495), Host Name: mendax, Event Name: syscall_entry_close, Cpu Id: 0, process_details: procname = lttng-runas, pid = 14324, tid = 14324, kernel_details: fd = 4
timestamp:06:13:02.229357316, time elapsed: (++0.000001250), Host Name: mendax, Event Name: syscall_exit_close, Cpu Id: 0, process_details: procname = lttng-runas, pid = 14324, tid = 14324, kernel_details: ret = 0
timestamp:06:13:02.229359066, time elapsed: (++0.000001750), Host Name: mendax, Event Name: syscall_entry_recvmsg, Cpu Id: 0, process_details: procname = lttng-runas, pid = 14324, tid = 14324, kernel_details: fd = 6, msg = 281474329849072, flags = 16384
timestamp:06:13:02.229393523, time elapsed: (++0.000000833), Host Name: mendax, Event Name: syscall_entry_sync_file_range, Cpu Id: 0, process_details: procname = lttng-consumerd, pid = 14323, tid = 14330, kernel_details: fd = 35, offset = 0, nbytes = 4096, flags = 2
timestamp:06:13:02.229429022, time elapsed: (++0.000035499), Host Name: mendax, Event Name: syscall_exit_sync_file_range, Cpu Id: 0, process_details: procname = lttng-consumerd, pid = 14323, tid = 14330, kernel_details: ret = 0
timestamp:06:13:02.229865552, time elapsed: (++0.000001458), Host Name: mendax, Event Name: syscall_entry_fadvise64_64, Cpu Id: 3, process_details: procname = lttng-consumerd, pid = 14323, tid = 14330, kernel_details: fd = 35, offset = 0, len = 4096, advice = 4
timestamp:06:13:02.229868636, time elapsed: (++0.000003084), Host Name: mendax, Event Name: syscall_exit_fadvise64_64, Cpu Id: 3, process_details: procname = lttng-consumerd, pid = 14323, tid = 14330, kernel_details: ret = 0, fd = 35, offset = 0, len = 4096, advice = 4
timestamp:06:13:02.229904385, time elapsed: (++0.000001125), Host Name: mendax, Event Name: syscall_entry_timerfd_settime, Cpu Id: 1, process_details: procname = data-loop.0, pid = 2932, tid = 2972, kernel_details: ufd = 67, flags = 1, utmr = 281458146600112
timestamp:06:13:02.229905385, time elapsed: (++0.000001000), Host Name: mendax, Event Name: syscall_exit_timerfd_settime, Cpu Id: 1, process_details: procname = data-loop.0, pid = 2932, tid = 2972, kernel_details: ret = 0, otmr = 0
timestamp:06:13:02.232522065, time elapsed: (++0.000001083), Host Name: mendax, Event Name: syscall_exit_recvmsg, Cpu Id: 3, process_details: procname = gnome-shell, pid = 3154, tid = 3154, kernel_details: ret = -11, msg = 281474090966720
timestamp:06:13:02.232526315, time elapsed: (++0.000004250), Host Name: mendax, Event Name: syscall_entry_sendmsg, Cpu Id: 3, process_details: procname = gnome-shell, pid = 3154, tid = 3154, kernel_details: fd = 45, msg = 281474090966848, flags = 16448
timestamp:06:13:02.232542856, time elapsed: (++0.000016541), Host Name: mendax, Event Name: syscall_exit_sendmsg, Cpu Id: 3, process_details: procname = gnome-shell, pid = 3154, tid = 3154, kernel_details: ret = 24
timestamp:06:13:02.235858809, time elapsed: (++0.000003625), Host Name: mendax, Event Name: syscall_entry_shutdown, Cpu Id: 0, process_details: procname = lttng, pid = 14341, tid = 14341, kernel_details: fd = 3, how = 2
timestamp:06:13:02.235860726, time elapsed: (++0.000001917), Host Name: mendax, Event Name: syscall_exit_shutdown, Cpu Id: 0, process_details: procname = lttng, pid = 14341, tid = 14341, kernel_details: ret = 0
timestamp:06:13:02.235868850, time elapsed: (++0.000003708), Host Name: mendax, Event Name: syscall_entry_newfstat, Cpu Id: 0, process_details: procname = lttng, pid = 14341, tid = 14341, kernel_details: fd = 1
timestamp:06:13:02.235870600, time elapsed: (++0.000001750), Host Name: mendax, Event Name: syscall_exit_newfstat, Cpu Id: 0, process_details: procname = lttng, pid = 14341, tid = 14341, kernel_details: ret = 0, statbuf = 281474453361816
timestamp:06:13:02.235909308, time elapsed: (++0.000001042), Host Name: mendax, Event Name: syscall_entry_rt_sigprocmask, Cpu Id: 3, process_details: procname = sudo, pid = 14314, tid = 14314, kernel_details: how = 0, nset = 200919500199056, sigsetsize = 8
timestamp:06:13:02.235910141, time elapsed: (++0.000000833), Host Name: mendax, Event Name: syscall_exit_rt_sigprocmask, Cpu Id: 3, process_details: procname = sudo, pid = 14314, tid = 14314, kernel_details: ret = 0, oset = 281474232194616
timestamp:06:13:02.235914599, time elapsed: (++0.000003041), Host Name: mendax, Event Name: syscall_entry_exit_group, Cpu Id: 0, process_details: procname = lttng, pid = 14341, tid = 14341, kernel_details: error_code = 0
timestamp:06:13:02.236028596, time elapsed: (++0.000113997), Host Name: mendax, Event Name: syscall_exit_wait4, Cpu Id: 0, process_details: procname = bash, pid = 14316, tid = 14316, kernel_details: ret = 14341, stat_addr = 281474356452576, ru = 0
timestamp:06:13:02.236062845, time elapsed: (++0.000006958), Host Name: mendax, Event Name: syscall_entry_wait4, Cpu Id: 0, process_details: procname = bash, pid = 14316, tid = 14316, kernel_details: upid = -1, options = 1
timestamp:06:13:02.236089386, time elapsed: (++0.000016124), Host Name: mendax, Event Name: syscall_entry_newfstatat, Cpu Id: 0, process_details: procname = bash, pid = 14316, tid = 14316, kernel_details: dfd = -100, filename = ., flag = 0
timestamp:06:13:02.236094011, time elapsed: (++0.000004625), Host Name: mendax, Event Name: syscall_exit_newfstatat, Cpu Id: 0, process_details: procname = bash, pid = 14316, tid = 14316, kernel_details: ret = 0, statbuf = 281474356452384
timestamp:06:13:02.236104094, time elapsed: (++0.000000583), Host Name: mendax, Event Name: syscall_exit_geteuid, Cpu Id: 0, process_details: procname = bash, pid = 14316, tid = 14316, kernel_details: ret = 0
timestamp:06:13:02.236104761, time elapsed: (++0.000000334), Host Name: mendax, Event Name: syscall_exit_getegid, Cpu Id: 0, process_details: procname = bash, pid = 14316, tid = 14316, kernel_details: ret = 0
timestamp:06:13:02.236105719, time elapsed: (++0.000000667), Host Name: mendax, Event Name: syscall_exit_getuid, Cpu Id: 0, process_details: procname = bash, pid = 14316, tid = 14316, kernel_details: ret = 0
timestamp:06:13:02.236106386, time elapsed: (++0.000000250), Host Name: mendax, Event Name: syscall_exit_getgid, Cpu Id: 0, process_details: procname = bash, pid = 14316, tid = 14316, kernel_details: ret = 0
timestamp:06:13:02.236107011, time elapsed: (++0.000000625), Host Name: mendax, Event Name: syscall_entry_faccessat, Cpu Id: 0, process_details: procname = bash, pid = 14316, tid = 14316, kernel_details: dfd = -100, filename = /usr/bin/sleep, mode = 1
timestamp:06:13:02.236108261, time elapsed: (++0.000001250), Host Name: mendax, Event Name: syscall_exit_faccessat, Cpu Id: 0, process_details: procname = bash, pid = 14316, tid = 14316, kernel_details: ret = 0
timestamp:06:13:02.236113427, time elapsed: (++0.000000541), Host Name: mendax, Event Name: syscall_entry_lseek, Cpu Id: 0, process_details: procname = bash, pid = 14316, tid = 14316, kernel_details: fd = 255, offset = -528, whence = 1
timestamp:06:13:02.236113886, time elapsed: (++0.000000459), Host Name: mendax, Event Name: syscall_exit_lseek, Cpu Id: 0, process_details: procname = bash, pid = 14316, tid = 14316, kernel_details: ret = 728
timestamp:06:13:02.236114927, time elapsed: (++0.000000416), Host Name: mendax, Event Name: syscall_entry_clone, Cpu Id: 0, process_details: procname = bash, pid = 14316, tid = 14316, kernel_details: flags = 0x1200011, newsp = 0x0, parent_tid = 0x0, child_tid = 0x0
timestamp:06:13:02.236165717, time elapsed: (++0.000013916), Host Name: mendax, Event Name: syscall_exit_clone, Cpu Id: 0, process_details: procname = bash, pid = 14316, tid = 14316, kernel_details: ret = 14346
timestamp:06:13:02.236179092, time elapsed: (++0.000002500), Host Name: mendax, Event Name: syscall_entry_set_robust_list, Cpu Id: 3, process_details: procname = bash, pid = 14346, tid = 14346, kernel_details: head = 269302286520576, len = 24
timestamp:06:13:02.236179675, time elapsed: (++0.000000583), Host Name: mendax, Event Name: syscall_exit_set_robust_list, Cpu Id: 3, process_details: procname = bash, pid = 14346, tid = 14346, kernel_details: ret = 0
timestamp:06:13:02.236220758, time elapsed: (++0.000005000), Host Name: mendax, Event Name: syscall_entry_execve, Cpu Id: 3, process_details: procname = bash, pid = 14346, tid = 14346, kernel_details: filename = /usr/bin/sleep, argv = 0xB2BBD08663C0, envp = 0xB2BBD0860F30
timestamp:06:13:02.236347546, time elapsed: (++0.000000750), Host Name: mendax, Event Name: syscall_exit_execve, Cpu Id: 3, process_details: procname = sleep, pid = 14346, tid = 14346, kernel_details: ret = 0
timestamp:06:13:02.236369962, time elapsed: (++0.000007583), Host Name: mendax, Event Name: syscall_entry_mmap, Cpu Id: 3, process_details: procname = sleep, pid = 14346, tid = 14346, kernel_details: addr = 0x0, len = 8192, prot = 3, flags = 34, fd = -1, offset = 0
timestamp:06:13:02.236371420, time elapsed: (++0.000001458), Host Name: mendax, Event Name: syscall_exit_mmap, Cpu Id: 3, process_details: procname = sleep, pid = 14346, tid = 14346, kernel_details: ret = 0xFAB8A5FF6000
timestamp:06:13:02.236376879, time elapsed: (++0.000002167), Host Name: mendax, Event Name: syscall_entry_openat, Cpu Id: 3, process_details: procname = sleep, pid = 14346, tid = 14346, kernel_details: dfd = -100, filename = /etc/ld.so.cache, flags = 524288, mode = 0
timestamp:06:13:02.236379212, time elapsed: (++0.000002333), Host Name: mendax, Event Name: syscall_exit_openat, Cpu Id: 3, process_details: procname = sleep, pid = 14346, tid = 14346, kernel_details: ret = 3
timestamp:06:13:02.236400128, time elapsed: (++0.000000375), Host Name: mendax, Event Name: syscall_entry_mprotect, Cpu Id: 3, process_details: procname = sleep, pid = 14346, tid = 14346, kernel_details: start = 275670965395456, len = 77824, prot = 0
timestamp:06:13:02.236405211, time elapsed: (++0.000005083), Host Name: mendax, Event Name: syscall_exit_mprotect, Cpu Id: 3, process_details: procname = sleep, pid = 14346, tid = 14346, kernel_details: ret = 0
timestamp:06:13:02.236420669, time elapsed: (++0.000005750), Host Name: mendax, Event Name: syscall_entry_set_tid_address, Cpu Id: 3, process_details: procname = sleep, pid = 14346, tid = 14346, kernel_details: tidptr = 275670965907440
timestamp:06:13:02.236421252, time elapsed: (++0.000000583), Host Name: mendax, Event Name: syscall_exit_set_tid_address, Cpu Id: 3, process_details: procname = sleep, pid = 14346, tid = 14346, kernel_details: ret = 14346
timestamp:06:13:02.236460335, time elapsed: (++0.000003834), Host Name: mendax, Event Name: syscall_entry_prlimit64, Cpu Id: 3, process_details: procname = sleep, pid = 14346, tid = 14346, kernel_details: pid = 0, resource = 3, new_rlim = 0
timestamp:06:13:02.236461585, time elapsed: (++0.000001250), Host Name: mendax, Event Name: syscall_exit_prlimit64, Cpu Id: 3, process_details: procname = sleep, pid = 14346, tid = 14346, kernel_details: ret = 0, old_rlim = 281474187421944
timestamp:06:13:02.236558540, time elapsed: (++0.000007666), Host Name: mendax, Event Name: syscall_entry_clock_nanosleep, Cpu Id: 3, process_details: procname = sleep, pid = 14346, tid = 14346, kernel_details: which_clock = 0, flags = 0, rqtp = 281474187422616
timestamp:06:13:02.240673139, time elapsed: (++0.000001042), Host Name: mendax, Event Name: syscall_entry_getdents64, Cpu Id: 0, process_details: procname = bash, pid = 14339, tid = 14339, kernel_details: fd = 3, count = 32768
timestamp:06:13:02.240684847, time elapsed: (++0.000011708), Host Name: mendax, Event Name: syscall_exit_getdents64, Cpu Id: 0, process_details: procname = bash, pid = 14339, tid = 14339, kernel_details: ret = 192, dirent = 192476908919456
timestamp:06:13:02.242957703, time elapsed: (++0.000004875), Host Name: mendax, Event Name: syscall_entry_socket, Cpu Id: 1, process_details: procname = curl, pid = 14348, tid = 14348, kernel_details: family = 1, type = 526337, protocol = 0
timestamp:06:13:02.242962369, time elapsed: (++0.000004666), Host Name: mendax, Event Name: syscall_exit_socket, Cpu Id: 1, process_details: procname = curl, pid = 14348, tid = 14348, kernel_details: ret = 3
timestamp:06:13:02.242962911, time elapsed: (++0.000000542), Host Name: mendax, Event Name: syscall_entry_connect, Cpu Id: 1, process_details: procname = curl, pid = 14348, tid = 14348, kernel_details: fd = 3, uservaddr = 281473988939624, addrlen = 110
timestamp:06:13:02.243022243, time elapsed: (++0.000000042), Host Name: mendax, Event Name: syscall_entry_eventfd2, Cpu Id: 1, process_details: procname = curl, pid = 14348, tid = 14348, kernel_details: count = 0, flags = 526336
timestamp:06:13:02.243024326, time elapsed: (++0.000002083), Host Name: mendax, Event Name: syscall_exit_eventfd2, Cpu Id: 1, process_details: procname = curl, pid = 14348, tid = 14348, kernel_details: ret = 3
"""

In [None]:
import os
from openai import OpenAI

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def generate_linux_logs(batch_size=50, model="o4-mini"):
    """Ask the chat model for one batch of synthetic LTTng-style log lines.

    The prompt embeds the module-level ``example_logs`` string and the request
    is sent through the module-level ``client``.

    Args:
        batch_size: Number of log lines requested in the prompt. The model is
            only *asked* for this many; the reply is not validated here.
        model: Name of the chat model to call.

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string when the reply carries no text content.
    """
    messages = [
        {
            "role": "user",
            "content": f"""
You are a helpful assistant designed to generate realistic Linux LTTng logs in structured format. You will be given a schema and a few examples. Generate synthetic logs using the same structure.

Each log must follow this format:
timestamp:<TIME>, time elapsed: <ELAPSED>, Host Name: <HOST>, Event Name: <EVENT>, Cpu Id: <CPU>, process_details: procname = <PROCNAME>, pid = <PID>, tid = <TID>, kernel_details: <KEY1> = <VAL1>, <KEY2> = <VAL2>, ...

Rules:
- 'timestamp' should be in format HH:MM:SS.NNNNNNNNN
- 'time elapsed' should be formatted like (++0.000000417)
- 'Host Name' must always be: synthetic-host
- 'Event Name' should reflect valid Linux syscalls like syscall_entry_read, syscall_exit_write, etc.
- 'Cpu Id' must be an integer (0–7)
- 'procname' must be a plausible Linux process (e.g., nginx, systemd, sshd, elasticsearch, curl, wget, etc.)
- 'pid' and 'tid' should be plausible integers, usually in range 1000–20000
- 'kernel_details' must include 1 to n number syscall-related fields such as fd, ret, buf, addrlen, size, etc., with realistic related values to the event name and process name

{example_logs}

Generate exactly {batch_size} Linux logs.
Return only the log lines. Do not return explanations, markdown or any formatting block like ```.

Start generating logs now.
"""
        }
    ]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
    )

    # `content` can be None (e.g. a refusal or an empty completion); treat that
    # as "no log lines" instead of failing with AttributeError on `.strip()`.
    output = response.choices[0].message.content or ""
    return output.strip()


In [None]:
import time

# Batch generation function
def generate_1000_logs(total_logs=1000, batch_size=100):
    """Generate synthetic logs in batches via ``generate_linux_logs``.

    Args:
        total_logs: Number of log lines to request overall (default 1000).
        batch_size: Maximum number of log lines to request per API call.

    Returns:
        A list of non-blank log lines in the order they were returned. The
        model is only asked for the requested counts, so the list length is
        not guaranteed to equal ``total_logs``.

    Raises:
        ValueError: If ``total_logs`` is negative or ``batch_size`` is not
            positive.
    """
    if total_logs < 0:
        raise ValueError("total_logs must be non-negative")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    all_logs = []

    for start in range(0, total_logs, batch_size):
        # The final batch is smaller when total_logs is not a multiple of batch_size.
        current_batch = min(batch_size, total_logs - start)
        print(f"Generating logs {start + 1} to {start + current_batch}...")
        batch_output = generate_linux_logs(batch_size=current_batch)
        # Drop blank separator lines so they are not counted or saved as logs.
        all_logs.extend(
            line for line in batch_output.strip().splitlines() if line.strip()
        )
        # Pause only between requests; sleeping after the last one is wasted time.
        if start + current_batch < total_logs:
            time.sleep(1)  # Respect RPM limits

    return all_logs

# Save logs to text file
def save_logs_to_file(logs, filename="synthetic_linux_logs_1K.txt", encoding="utf-8"):
    """Write each log entry to ``filename``, one stripped entry per line.

    Args:
        logs: Sized collection of log strings.
        filename: Destination path; an existing file is overwritten.
        encoding: Text encoding for the output file. Defaults to UTF-8 so the
            result does not depend on the platform's locale encoding and
            non-ASCII characters in the logs can always be written.
    """
    with open(filename, "w", encoding=encoding) as f:
        for log in logs:
            f.write(log.strip() + "\n")
    print(f"{len(logs)} logs written to {filename}")

# Run everything: request the synthetic logs in batches, then write them to disk.
# `generated_data` is deliberately bound at module level because the following
# cells display and parse it.
if __name__ == "__main__":
    generated_data = generate_1000_logs()
    save_logs_to_file(generated_data)

Generating logs 1 to 100...
Generating logs 101 to 200...
Generating logs 201 to 300...
Generating logs 301 to 400...
Generating logs 401 to 500...
Generating logs 501 to 600...
Generating logs 601 to 700...
Generating logs 701 to 800...
Generating logs 801 to 900...
Generating logs 901 to 1000...
1000 logs written to synthetic_linux_logs_1K.txt


In [None]:
# Preview only the first few generated lines; dumping the whole list floods the cell output.
generated_data[:5]

['timestamp:12:00:00.100000000, time elapsed: (++0.000001234), Host Name: synthetic-host, Event Name: syscall_entry_open, Cpu Id: 1, process_details: procname = nginx, pid = 4567, tid = 4567, kernel_details: filename = 140735903668640, flags = 0, mode = 0644',
 'timestamp:12:00:00.100001500, time elapsed: (++0.000000789), Host Name: synthetic-host, Event Name: syscall_exit_open, Cpu Id: 1, process_details: procname = nginx, pid = 4567, tid = 4567, kernel_details: ret = 3',
 'timestamp:12:00:00.100002500, time elapsed: (++0.000000900), Host Name: synthetic-host, Event Name: syscall_entry_read, Cpu Id: 1, process_details: procname = nginx, pid = 4567, tid = 4567, kernel_details: fd = 3, count = 1024',
 'timestamp:12:00:00.100003300, time elapsed: (++0.000000800), Host Name: synthetic-host, Event Name: syscall_exit_read, Cpu Id: 1, process_details: procname = nginx, pid = 4567, tid = 4567, kernel_details: ret = 512, buf = 140735903668704',
 'timestamp:12:00:00.100004100, time elapsed: (++

In [None]:
import pandas as pd
import re

# Patterns for the fields of one generated log line. The process-level patterns
# carry a word boundary so they cannot match inside longer names such as
# `upid` or `parent_tid`.
TIMESTAMP_RE = re.compile(r'timestamp:(\d{2}:\d{2}:\d{2}\.\d{9})')
TIME_ELAPSED_RE = re.compile(r'time elapsed:\s*\(\+*([0-9.]+)\)')
HOST_NAME_RE = re.compile(r'Host Name:\s*([^,]+)')
EVENT_NAME_RE = re.compile(r'Event Name:\s*([^,]+)')
CPU_ID_RE = re.compile(r'Cpu Id:\s*(\d+)')
PROCNAME_RE = re.compile(r'\bprocname\s*=\s*([^,]+)')
PID_RE = re.compile(r'\bpid\s*=\s*([^,]+)')
TID_RE = re.compile(r'\btid\s*=\s*([^,]+)')
KERNEL_DETAILS_RE = re.compile(r'kernel_details:\s*(.*)')


def _parse_synthetic_log(log):
    """Parse one generated log line into a record dict.

    Returns None when any required field (everything except kernel_details)
    is missing or malformed.
    """
    kernel_details = KERNEL_DETAILS_RE.search(log)
    # Required fields are matched only in the text before `kernel_details:`.
    # kernel_details may itself contain keys such as `pid = 0` or `upid = -1`,
    # which must not stand in for a missing process-level pid/tid.
    header = log[:kernel_details.start()] if kernel_details else log

    timestamp = TIMESTAMP_RE.search(header)
    time_elapsed = TIME_ELAPSED_RE.search(header)
    host_name = HOST_NAME_RE.search(header)
    event_name = EVENT_NAME_RE.search(header)
    cpu_id = CPU_ID_RE.search(header)
    procname = PROCNAME_RE.search(header)
    pid = PID_RE.search(header)
    tid = TID_RE.search(header)

    # Ensure all required fields are found
    if not all([timestamp, time_elapsed, host_name, event_name, cpu_id, procname, pid, tid]):
        return None

    return {
        'timestamp': timestamp.group(1),
        'time_elapsed': time_elapsed.group(1),
        'host_name': host_name.group(1).strip(),
        'event_name': event_name.group(1).strip(),
        'cpu_id': int(cpu_id.group(1)),
        'procname': procname.group(1).strip(),
        'process_details': f"pid = {pid.group(1).strip()}, tid = {tid.group(1).strip()}",
        'kernel_details': kernel_details.group(1).strip() if kernel_details else None
    }


synthetic_data = []
skipped = 0

for i, log in enumerate(generated_data):
    try:
        record = _parse_synthetic_log(log)
    except Exception as e:
        # e.g. a non-string entry; report it and keep processing the rest.
        print(f"Skipping line {i} due to error: {e}")
        skipped += 1
        continue

    if record is None:
        skipped += 1
        continue

    synthetic_data.append(record)

# Create DataFrame
df = pd.DataFrame(synthetic_data)

print(f"Parsed {len(df)} logs successfully.")
print(f"Skipped {skipped} logs due to missing or malformed fields.")

Parsed 1000 logs successfully.
Skipped 0 logs due to missing or malformed fields.


In [None]:
# Cap the frame at the first 10,000 rows. Take an explicit copy so the column
# assignments made on `df` afterwards operate on an independent frame instead of
# a slice of the original (which triggers pandas' SettingWithCopyWarning).
df = df.iloc[:10000].copy()

In [None]:
# Display the parsed DataFrame as this cell's output.
df

Unnamed: 0,timestamp,time_elapsed,host_name,event_name,cpu_id,procname,process_details,kernel_details
0,12:00:00.100000000,0.000001234,synthetic-host,syscall_entry_open,1,nginx,"pid = 4567, tid = 4567","filename = 140735903668640, flags = 0, mode = ..."
1,12:00:00.100001500,0.000000789,synthetic-host,syscall_exit_open,1,nginx,"pid = 4567, tid = 4567",ret = 3
2,12:00:00.100002500,0.000000900,synthetic-host,syscall_entry_read,1,nginx,"pid = 4567, tid = 4567","fd = 3, count = 1024"
3,12:00:00.100003300,0.000000800,synthetic-host,syscall_exit_read,1,nginx,"pid = 4567, tid = 4567","ret = 512, buf = 140735903668704"
4,12:00:00.100004100,0.000001200,synthetic-host,syscall_entry_write,2,systemd,"pid = 1001, tid = 1001","fd = 1, buf = 140735903669000, count = 128"
...,...,...,...,...,...,...,...,...
995,13:45:10.123552924,0.000001011,synthetic-host,syscall_exit_fadvise64,1,data-loop.0,"pid = 2932, tid = 2972",ret = 0
996,13:45:10.123553935,0.000001011,synthetic-host,syscall_entry_timerfd_settime,1,data-loop.0,"pid = 2932, tid = 2972","ufd = 67, flags = 0, new_value = 0x7fffabc02000"
997,13:45:10.123554946,0.000001011,synthetic-host,syscall_exit_timerfd_settime,1,data-loop.0,"pid = 2932, tid = 2972",ret = 0
998,13:45:10.123555957,0.000001011,synthetic-host,syscall_entry_ioctl,3,llvmpipe,"pid = 3154, tid = 3213","fd = 13, cmd = 0x1234, arg = 0x7fffabc03000"


In [None]:
import csv

# Store timestamp and time_elapsed as text with a leading apostrophe so that
# Excel does not reformat them when the CSV is opened.
for column in ('timestamp', 'time_elapsed'):
    as_text = df[column].astype(str)
    # Add the apostrophe only where it is missing, so re-running this cell does
    # not stack additional apostrophes onto already-prefixed values.
    df[column] = as_text.where(as_text.str.startswith("'"), "'" + as_text)

# Export with quoted string fields
df.to_csv('synthetic_linux_logs_1K_o4-mini.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)