In [None]:
import os
import pandas as pd
import numpy as np
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
!pip install --upgrade --quiet pip
%pip install --upgrade --quiet  langchain langchain_experimental langchain-openai pydantic==2.9.2 openai

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m66.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m60.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m755.6/755.6 kB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13/13[0m [langchain_experimental]
[1A[2K

In [None]:
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from pydantic import BaseModel
from langchain_experimental.tabular_synthetic_data.openai import create_openai_data_generator
from langchain_experimental.tabular_synthetic_data.prompts import (
    SYNTHETIC_FEW_SHOT_PREFIX,
    SYNTHETIC_FEW_SHOT_SUFFIX,
)
from langchain_openai import ChatOpenAI

In [None]:
from typing import Optional, Union, Dict

In [None]:
import re

def parse_log_line(line: str) -> dict:
    """Flatten one raw trace line into a single-key few-shot example dict.

    What is read from ``line``:
      * timestamp    - text inside the first ``[...]``
      * time elapsed - text inside the first ``(...)``
      * host / event - third and fourth whitespace-separated tokens (a trailing
        ``:`` is stripped from the event)
      * cpu id       - ``cpu_id = N`` inside the first ``{...}`` group
      * process / kernel details - contents of the second and third ``{...}``
        groups, with quote characters removed

    Anything that cannot be found falls back to ``""`` (``"0"`` for the cpu id).
    Groups are matched non-greedily, so a group that itself contains braces is
    cut at its first ``}``.

    Args:
        line: One line of trace text.

    Returns:
        ``{"example": text}`` where every ``{`` and ``}`` in ``text`` is doubled
        so the text can be embedded in a format-style prompt template.

    Note:
        The elapsed value is rendered as ``(+<captured text>)``; when the
        captured text already starts with ``+`` the output reads ``(++...)``.
    """

    def _first_group(regex: str) -> str:
        # Text of capture group 1 for the first hit in the line, or "" if no hit.
        hit = re.search(regex, line)
        return hit.group(1) if hit else ""

    def _unquote(text: str) -> str:
        # Drop both kinds of quote characters, then trim surrounding whitespace.
        return text.replace('"', '').replace("'", "").strip()

    stamp = _first_group(r"\[(.*?)\]")
    elapsed = _first_group(r"\((.*?)\)")

    # Host and event are positional: tokens 2 and 3 of the whitespace split.
    tokens = line.split()
    host = tokens[2] if len(tokens) > 2 else ""
    event = tokens[3].rstrip(":") if len(tokens) > 3 else ""

    # Contents of every {...} group, in order of appearance.
    groups = re.findall(r"\{(.*?)\}", line)

    cpu = "0"
    if groups:
        cpu_hit = re.search(r"cpu_id\s*=\s*(\d+)", groups[0])
        if cpu_hit:
            cpu = cpu_hit.group(1)

    process = _unquote(groups[1]) if len(groups) > 1 else ""
    kernel = _unquote(groups[2]) if len(groups) > 2 else ""

    rendered = ", ".join([
        f"timestamp:{stamp}",
        f"time elapsed: (+{elapsed})",
        f"Host Name: {host}",
        f"Event Name: {event}",
        f"Cpu Id: {cpu}",
        f"process_details: {process}",
        f"kernel_details: {kernel}",
    ])

    # Double every brace so the text survives format-style templating.
    return {"example": rendered.replace("{", "{{").replace("}", "}}")}

In [None]:
# Parse every non-blank line of the trace file into a few-shot example dict.
logs = []
with open("kernel_trace_1K.txt", "r") as file:
    for line in file:
        if not line.strip():
            # Blank or whitespace-only line: nothing to parse.
            continue
        try:
            parsed = parse_log_line(line.strip())
        except Exception as e:
            # Report the offending line and carry on with the rest of the file.
            print(f"Error parsing line: {line}\n{e}")
        else:
            logs.append(parsed)

In [None]:
# Display the parsed examples; each element is a dict holding one "example" string.
logs

[{'example': 'timestamp:06:13:02.227908688, time elapsed: (++?.?????????), Host Name: mendax, Event Name: syscall_entry_accept, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: fd = 553, upeer_addrlen = 246916502706640'},
 {'example': 'timestamp:06:13:02.227912438, time elapsed: (++0.000003750), Host Name: mendax, Event Name: syscall_exit_accept, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: ret = 576, upeer_sockaddr = 246916502706648, upeer_addrlen = 246916502706640'},
 {'example': 'timestamp:06:13:02.227914646, time elapsed: (++0.000002208), Host Name: mendax, Event Name: syscall_entry_fcntl, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: fd = 576, cmd = 3, arg = 50'},
 {'example': 'timestamp:06:13:02.227914980, time elapsed: (++0.000000334), Host Name: mendax, Event Name: syscall_exit_fcntl, Cpu Id: 2, process_details: procname = ela

In [None]:
def extract_unique_event_examples(examples):
    """Keep the first example seen for each distinct event name.

    The event name is the text that follows the first ``Event Name:`` marker in
    an item's ``"example"`` string, up to the next comma, with surrounding
    whitespace removed. Items without an ``"example"`` key, or whose text has no
    marker, are skipped.

    Args:
        examples: Iterable of dicts as produced by ``parse_log_line``.

    Returns:
        A list of the original dict objects, in first-seen order, one per
        event name.
    """
    marker = "Event Name:"
    first_by_event = {}

    for entry in examples:
        pieces = entry.get("example", "").split(marker)
        if len(pieces) < 2:
            # No marker in the text: malformed entry, ignore it.
            continue

        name = pieces[1].split(",")[0].strip()
        # setdefault leaves an existing entry alone, so the first one wins;
        # dicts preserve insertion order, which gives first-seen ordering.
        first_by_event.setdefault(name, entry)

    return list(first_by_event.values())

In [None]:
# Keep the first example seen for each event name.
filtered_examples = extract_unique_event_examples(logs)

# Print how many examples were kept, followed by each kept example dict.
print(f"Total unique events: {len(filtered_examples)}")
for ex in filtered_examples:
    print(ex)

Total unique events: 48
{'example': 'timestamp:06:13:02.227908688, time elapsed: (++?.?????????), Host Name: mendax, Event Name: syscall_entry_accept, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: fd = 553, upeer_addrlen = 246916502706640'}
{'example': 'timestamp:06:13:02.227912438, time elapsed: (++0.000003750), Host Name: mendax, Event Name: syscall_exit_accept, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: ret = 576, upeer_sockaddr = 246916502706648, upeer_addrlen = 246916502706640'}
{'example': 'timestamp:06:13:02.227914646, time elapsed: (++0.000002208), Host Name: mendax, Event Name: syscall_entry_fcntl, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: fd = 576, cmd = 3, arg = 50'}
{'example': 'timestamp:06:13:02.227914980, time elapsed: (++0.000000334), Host Name: mendax, Event Name: syscall_exit_fcntl, Cpu Id: 2, process_detail

In [None]:
def escape_curly_braces(examples):
    """Return copies of ``examples`` whose text has every brace doubled.

    Doubling ``{`` and ``}`` makes them literal characters when the text is
    later run through format-style templating. Already-doubled braces are
    collapsed first, so the function is idempotent: text that was escaped
    earlier (or a second run of the calling cell) comes out unchanged instead
    of gaining another layer of braces.

    Args:
        examples: Iterable of dicts, each with an ``"example"`` string.

    Returns:
        A new list of ``{"example": escaped_text}`` dicts; the inputs are not
        modified.

    Raises:
        KeyError: If an item has no ``"example"`` key.
    """

    def _escape(text):
        # Undo any existing doubling, then double every remaining brace. A
        # literal "{{" in raw text is therefore treated as one escaped brace.
        plain = text.replace("{{", "{").replace("}}", "}")
        return plain.replace("{", "{{").replace("}", "}}")

    return [{"example": _escape(ex["example"])} for ex in examples]

# Rebind filtered_examples to the list returned by escape_curly_braces.
filtered_examples = escape_curly_braces(filtered_examples)

In [None]:
import re

def remove_high_risk_examples(examples):
    """Filter out examples whose text contains structures treated as risky.

    An example is dropped when its ``"example"`` text has any of:
      * a ``{`` followed later on the same line by a ``}``
      * an indexed assignment such as ``[0] =``
      * the substring ``fds =``
      * both ``events =`` and ``POLL``

    Args:
        examples: Iterable of dicts, each with an ``"example"`` string.

    Returns:
        A list of the original dict objects that passed every check, in their
        original order.
    """

    def _is_risky(text):
        if re.search(r"\{.*?\}", text):
            return True  # brace pair present
        if re.search(r"\[\s*\d+\s*\]\s*=", text):
            return True  # patterns like [0] =
        if "fds =" in text:
            return True  # explicit fds structure
        # POLLIN/POLLOUT-style event structures
        return "events =" in text and "POLL" in text

    return [entry for entry in examples if not _is_risky(entry["example"])]

In [None]:
# Drop the examples rejected by remove_high_risk_examples and report how many are left.
filtered_examples = remove_high_risk_examples(filtered_examples)
print(f"Remaining safe examples: {len(filtered_examples)}")

Remaining safe examples: 45


In [None]:
# Discard the first remaining example. Judging by the recorded outputs this is the
# syscall_entry_accept entry whose elapsed time is the "?.?????????" placeholder.
# NOTE(review): not idempotent - every re-run of this cell drops one more example.
filtered_examples = filtered_examples[1:]

In [None]:
# Keep only the examples whose text does not end with a bare "kernel_details:",
# i.e. drop the ones where nothing follows that label.
filtered_examples = [
    row for row in filtered_examples
    if not row["example"].strip().endswith("kernel_details:")
]

In [None]:
# Print how many examples remain, followed by each remaining example dict.
print(f"Total unique events: {len(filtered_examples)}")
for ex in filtered_examples:
    print(ex)

Total unique events: 43
{'example': 'timestamp:06:13:02.227912438, time elapsed: (++0.000003750), Host Name: mendax, Event Name: syscall_exit_accept, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: ret = 576, upeer_sockaddr = 246916502706648, upeer_addrlen = 246916502706640'}
{'example': 'timestamp:06:13:02.227914646, time elapsed: (++0.000002208), Host Name: mendax, Event Name: syscall_entry_fcntl, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: fd = 576, cmd = 3, arg = 50'}
{'example': 'timestamp:06:13:02.227914980, time elapsed: (++0.000000334), Host Name: mendax, Event Name: syscall_exit_fcntl, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: ret = 2, arg = 50'}
{'example': 'timestamp:06:13:02.227916313, time elapsed: (++0.000001333), Host Name: mendax, Event Name: syscall_entry_getsockname, Cpu Id: 2, process_details: procname = elast

In [None]:
# Remove the example at position 17.
# NOTE(review): the index is hard-coded, so which example is removed depends on the
# list's current contents, and re-running this cell deletes another element.
del filtered_examples[17]

In [None]:
# Remove the example at position 30.
# NOTE(review): the index is hard-coded, so which example is removed depends on the
# list's current contents, and re-running this cell deletes another element.
del filtered_examples[30]

In [None]:
# Show how many examples remain, then list each one next to its list position.
print(f"Total unique events: {len(filtered_examples)}")
for i, ex in enumerate(filtered_examples):
    print(f"i: {i}", ex)

Total unique events: 42
i: 0 {'example': 'timestamp:06:13:02.227912438, time elapsed: (++0.000003750), Host Name: mendax, Event Name: syscall_exit_accept, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: ret = 576, upeer_sockaddr = 246916502706648, upeer_addrlen = 246916502706640'}
i: 1 {'example': 'timestamp:06:13:02.227914646, time elapsed: (++0.000002208), Host Name: mendax, Event Name: syscall_entry_fcntl, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: fd = 576, cmd = 3, arg = 50'}
i: 2 {'example': 'timestamp:06:13:02.227914980, time elapsed: (++0.000000334), Host Name: mendax, Event Name: syscall_exit_fcntl, Cpu Id: 2, process_details: procname = elasticsearch[m, pid = 11822, tid = 11859, kernel_details: ret = 2, arg = 50'}
i: 3 {'example': 'timestamp:06:13:02.227916313, time elapsed: (++0.000001333), Host Name: mendax, Event Name: syscall_entry_getsockname, Cpu Id: 2, process_detai

In [None]:
# Shape of one generated log record; passed as output_schema to the data generator.
# No field has a default, and the recorded JSON schema lists all of them under
# "required"; the Optional ones additionally accept null.
# NOTE(review): documentation is kept in comments because pydantic copies a class
# docstring into the schema's "description" - confirm before adding one.
class LinuxLog(BaseModel):
    timestamp: str  # Log timestamp, e.g. "06:13:02.227946563"
    time_elapsed: Optional[str]  # Time elapsed since the previous log, e.g. "(++0.000008417)"
    host_name: Optional[str]  # Host name
    event_name: Optional[str]  # Event name, e.g. "syscall_entry_read"
    cpu_id: Optional[int]  # CPU ID where the event occurred
    process_details: Optional[str]  # One string, e.g. "procname = nginx, pid = 12542, tid = 12543"
    kernel_details: Optional[str]  # One string, e.g. "fd = 34, buf = 314159265358979, count = 1024"

In [None]:
# JSON schema (a plain dict) that pydantic generates for the LinuxLog model.
schema = LinuxLog.model_json_schema()

In [None]:
# Display the generated schema for inspection.
schema

{'properties': {'timestamp': {'title': 'Timestamp', 'type': 'string'},
  'time_elapsed': {'anyOf': [{'type': 'string'}, {'type': 'null'}],
   'title': 'Time Elapsed'},
  'host_name': {'anyOf': [{'type': 'string'}, {'type': 'null'}],
   'title': 'Host Name'},
  'event_name': {'anyOf': [{'type': 'string'}, {'type': 'null'}],
   'title': 'Event Name'},
  'cpu_id': {'anyOf': [{'type': 'integer'}, {'type': 'null'}],
   'title': 'Cpu Id'},
  'process_details': {'anyOf': [{'type': 'string'}, {'type': 'null'}],
   'title': 'Process Details'},
  'kernel_details': {'anyOf': [{'type': 'string'}, {'type': 'null'}],
   'title': 'Kernel Details'}},
 'required': ['timestamp',
  'time_elapsed',
  'host_name',
  'event_name',
  'cpu_id',
  'process_details',
  'kernel_details'],
 'title': 'LinuxLog',
 'type': 'object'}

In [None]:
# Collects the generated records from every batch; re-running this cell rebinds
# the name to a new empty list.
all_synthetic_results = []

In [None]:
import time
from pydantic import ValidationError
from openai import OpenAIError

# --- Generation settings ---------------------------------------------------
total = 10                # number of batches
batch_size = 100          # runs requested from generate() per batch
example_batch_size = 10   # few-shot examples shown to the model per batch
num_examples = len(filtered_examples)
pause_seconds = 61        # pause between batches; presumably to respect a per-minute rate limit - confirm

if num_examples == 0:
    # Without this guard the modulo below would fail with ZeroDivisionError.
    raise ValueError("filtered_examples is empty; at least one few-shot example is required.")

# Metrics
# NOTE(review): total_requests counts batches (one generate() call each), not the
# individual runs inside a batch, and the token figure is a flat estimate.
total_requests = 0
total_tokens = 0

# Loop-invariant objects are built once instead of on every batch.
OPENAI_TEMPLATE = PromptTemplate(input_variables=["example"], template="{example}")

llm = ChatOpenAI(
    model="gpt-4.1-mini",
    model_kwargs={"top_p": 0.95},
    temperature=1,
    api_key=os.environ["OPENAI_API_KEY"]
)

for i in range(total):
    print(f"[Batch {i+1}/{total}] Generating...")

    # Pick a rotating window of examples for this batch. Indexing modulo
    # num_examples wraps past the end of the list, so every batch gets a full
    # window (a plain slice would come up short near the end).
    start = (i * example_batch_size) % num_examples
    current_examples = [
        filtered_examples[(start + offset) % num_examples]
        for offset in range(min(example_batch_size, num_examples))
    ]

    prompt_template = FewShotPromptTemplate(
        prefix=SYNTHETIC_FEW_SHOT_PREFIX,
        examples=current_examples,
        suffix=SYNTHETIC_FEW_SHOT_SUFFIX,
        input_variables=["subject", "extra"],
        example_prompt=OPENAI_TEMPLATE,
    )

    # A fresh generator per batch, because the few-shot examples change.
    synthetic_data_generator = create_openai_data_generator(
        output_schema=LinuxLog,
        llm=llm,
        prompt=prompt_template,
    )

    try:
        synthetic_results = synthetic_data_generator.generate(
          subject="Linux Logs using LTTng tool",
          extra="""You are an AI trained to generate valid realistic Linux LTTng logs in JSON format.
                   Each generated log must include the following fields: timestamp, time_elapsed, host_name, event_name, cpu_id, process_details, and kernel_details fields.
                    - process details should include only procname, pid, and tid fields with realistic values. No other field is permitted here. Example, process_details='procname = nginx, pid = 12542, tid = 12543'
                    - kernel_details should include relevant kernel parameters based on procname in LTTng format. Example kernel_details field should like, kernel_details='fd = 34, buf = 314159265358979, count = 1024'
                    - each process should have a relevant event_name, kernel parameters. Values must reflect realistic log content. Ensure all fields are filled and correctly typed.
                    - Omit any values that doesn't fit LTTng Linux log format. Do not generate any unrealistic values. Each field should have relevant values, timestamp values like 06:13:02.227946563, time_elapsed like (++0.000008417), host_name always synthetic-host,
                      event_name like syscall_entry_read, cpu_id like 1 containing cpu core numbers, process_details like procname = nginx, pid = 12542, tid = 12543, kernel_details like fd = 34, buf = 314159265358979, count = 1024
                    - Valid output format: timestamp='06:13:02.227946563', time_elapsed='(++0.000008417)', host_name='synthetic-host', event_name='syscall_entry_read', cpu_id=3, process_details='procname = nginx, pid = 12542, tid = 12543', kernel_details='fd = 34, buf = 314159265358979, count = 1024'
                    - Do not omit or leave any fields blank. The host_name value must always be synthetic-host
                """,
          runs=batch_size,
          # NOTE(review): intended to skip invalid output silently; confirm that
          # generate() actually acts on this keyword.
          safe=True,
        )
        all_synthetic_results.extend(synthetic_results)

        total_requests += 1
        estimated_tokens = batch_size * 1000
        total_tokens += estimated_tokens

        print(f"Batch {i+1}: {len(synthetic_results)} logs")
        print(f"Total Requests so far: {total_requests}")
        print(f"Estimated Tokens used so far: {total_tokens:,}")

    except ValidationError as e:
        # Nothing from this batch is kept; move straight on to the next one.
        print("Skipping invalid result due to validation error.")
        continue
    except OpenAIError as e:
        # API-level failure: stop generating, keep what has been collected.
        print(f"OpenAI Error: {e}")
        break

    # No need to wait once the final batch has finished.
    if i < total - 1:
        time.sleep(pause_seconds)

print(f"Total synthetic logs generated: {len(all_synthetic_results)}")
print(f"Final estimated usage: {total_tokens:,} tokens, {total_requests} API calls")

[Batch 1/1] Generating...


KeyboardInterrupt: 

In [None]:
# Display every record collected so far.
all_synthetic_results

In [None]:
# Number of records collected so far across all batches.
len(all_synthetic_results)

In [None]:
# Size of the list returned by the most recent generate() call.
# NOTE(review): synthetic_results is only bound once a generate() call has
# returned, so this cell raises NameError if none has.
len(synthetic_results)

In [None]:
# Save the string form of every collected record, one record per line.
with open("synthetic_logs_1K_openAI.txt", "w") as f:
    f.writelines(str(item) + "\n" for item in all_synthetic_results)

In [None]:
import re
import pandas as pd

# Flatten the entries into raw strings
raw_entries = [str(entry) for entry in all_synthetic_results]

# Column order of the resulting table; it matches the capture-group order below.
log_columns = [
    "timestamp",
    "time_elapsed",
    "host_name",
    "event_name",
    "cpu_id",
    "process_details",
    "kernel_details",
]

# Field values are expected as name='value' (cpu_id as bare digits). Fields may be
# separated by whitespace and/or commas, so both layouts are accepted.
# NOTE(review): entries with a missing/None field or a value that is not wrapped in
# single quotes do not match and are counted as skipped below.
pattern = re.compile(
    r"timestamp='([^']+)'[\s,]*"
    r"time_elapsed='([^']+)'[\s,]*"
    r"host_name='([^']+)'[\s,]*"
    r"event_name='([^']+)'[\s,]*"
    r"cpu_id=(\d+)[\s,]*"
    r"process_details='([^']+)'[\s,]*"
    r"kernel_details='([^']+)'"
)

# Extract matching entries, keeping track of the ones that could not be parsed
parsed_logs = []
unparsed_entries = []
for entry in raw_entries:
    match = pattern.search(entry)
    if match is None:
        unparsed_entries.append(entry)
        continue
    record = dict(zip(log_columns, match.groups()))
    record["cpu_id"] = int(record["cpu_id"])
    parsed_logs.append(record)

# Surface dropped entries instead of losing them silently.
if unparsed_entries:
    print(
        f"Skipped {len(unparsed_entries)} of {len(raw_entries)} entries "
        "that did not match the expected field layout."
    )

# Convert to DataFrame; explicit columns keep the layout stable even when nothing matched
df = pd.DataFrame(parsed_logs, columns=log_columns)

In [None]:
# Display the parsed records as a DataFrame.
df

In [None]:
# Save the parsed records as CSV without the index column.
# NOTE(review): the file name says "GPT-o4-mini" while the generation cell requests
# the "gpt-4.1-mini" model - confirm which one is intended.
df.to_csv("synthetic_linux_logs_1K_GPT-o4-mini.csv", index=False)