### Library imports

In [1]:
import json
import csv
import os
import re

### Extract the raw data to CSV files

In [2]:
def extract_syscalls_to_csv(input_file, output_file=None):
    """
    Extract syscall records from a line-oriented JSON log file into a CSV.

    The input is read line by line. Blank lines are ignored. A line that
    matches ``=== Run <N>/`` sets the run number attached to every record that
    follows it. Every other line is parsed as JSON and kept only when it is an
    object whose *first* key is ``comm`` and which also has the keys
    ``syscall``, ``ret`` and ``parameters``; anything else is counted as
    skipped.

    The input is decoded as UTF-8 and undecodable bytes are replaced with
    U+FFFD, so a stray binary byte in the log cannot abort the extraction.

    Args:
        input_file: Path to the input JSON log file.
        output_file: Path to the output CSV file. Optional; defaults to the
            input path with its extension replaced by ``_syscalls.csv``.

    Returns:
        Number of records extracted. When this is 0 no CSV file is written.
    """
    # Derive the output filename from the input path if none was provided
    if output_file is None:
        base_name = os.path.splitext(input_file)[0]
        output_file = base_name + '_syscalls.csv'

    required_keys = ('syscall', 'ret', 'parameters')
    records = []
    skipped_lines = 0
    current_run = None  # Run number of the most recent "=== Run N/" marker

    # errors='replace' keeps the best-effort contract: a line with bad bytes is
    # handled like any other line instead of raising UnicodeDecodeError.
    with open(input_file, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            line = line.strip()

            # Skip empty lines
            if not line:
                continue

            # A run marker only updates state; it is neither a record nor a skip
            run_match = re.match(r'=== Run (\d+)/', line)
            if run_match:
                current_run = int(run_match.group(1))
                continue

            # json.JSONDecodeError is a subclass of ValueError
            try:
                entry = json.loads(line)
            except ValueError:
                skipped_lines += 1
                continue

            # Keep only objects that start with 'comm' and carry all the
            # fields that are exported below.
            is_syscall_entry = (
                isinstance(entry, dict)
                and next(iter(entry), None) == 'comm'
                and all(key in entry for key in required_keys)
            )
            if not is_syscall_entry:
                skipped_lines += 1
                continue

            records.append({
                'run': current_run,
                'syscall': entry['syscall'],
                'Ret': entry['ret'],
                'parameters': entry['parameters']
            })

    if not records:
        print("No valid records found in the file")
        return 0

    # Write to CSV
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['run', 'syscall', 'Ret', 'parameters']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        writer.writerows(records)

    print(f"Successfully processed {len(records)} records")
    print(f"Skipped {skipped_lines} lines")
    print(f"Output written to: {output_file}")

    return len(records)

### Go through all the log files and extract each one to a CSV file

In [3]:
# Root directory that is walked recursively (os.walk) to find the .json log files
path = "../data"

In [4]:
# Index every .json file under `path`:
#   data[<grandparent directory name of the file>][<full file path>] = []
data = {}
for root, dirs, files in os.walk(path):
    for filename in files:
        if filename.lower().endswith('.json'):
            # The second-to-last component of the containing directory is the group key
            path_parts = os.path.normpath(root).split(os.sep)
            data.setdefault(path_parts[-2], {})[os.path.join(root, filename)] = []

In [5]:
# Convert every discovered log file, group by group, in discovery order
for log_paths in data.values():
    for log_path in log_paths:
        extract_syscalls_to_csv(log_path)

Successfully processed 43227 records
Skipped 580 lines
Output written to: ../data/func_invc_present/20251123_012402/101_logs_sentiment-analyzer-65c4b88fb5-wzsdm_syscalls.csv
Successfully processed 94311 records
Skipped 651 lines
Output written to: ../data/func_invc_present/20251123_012402/105_logs_kmeans-clustering-code-type-6769648db9-clcs7_syscalls.csv
Successfully processed 36455 records
Skipped 328 lines
Output written to: ../data/func_invc_present/20251123_012402/103_logs_time-series-forecaster-58db55d69-n58zr_syscalls.csv
Successfully processed 123624 records
Skipped 1063 lines
Output written to: ../data/func_invc_present/20251123_012402/107_logs_kmeans-clustering-fileop-type-69b75d769-qkq2d_syscalls.csv
Successfully processed 119054 records
Skipped 620 lines
Output written to: ../data/func_invc_present/20251123_012402/108_logs_kmeans-clustering-info-type-c49dcc8cf-48pmg_syscalls.csv
Successfully processed 70200 records
Skipped 300 lines
Output written to: ../data/func_invc_prese