# 🔍 SYSMON STRUCTURE CONSISTENCY ANALYZER

This notebook performs comprehensive structure consistency analysis on Windows Sysmon events stored in JSONL format from Elasticsearch. The analysis focuses on understanding XML structure patterns, field consistency, and schema variation across different EventIDs.

**Target File**: `-ds-logs-windows-sysmon_operational-default-2025-05-04-000001.jsonl`  
**Analysis Type**: 2B-SYSMON  
**Purpose**: Analyze structure consistency and schema patterns for robust CSV conversion

**Key Analysis Areas**:
- XML structure fingerprinting and pattern detection
- EventID-specific schema consistency analysis
- Field co-occurrence and dependency patterns
- Structure variation analysis and outlier detection
- Schema evolution and consistency metrics
- Processing pipeline recommendations

## 1. Import Required Libraries

In [9]:
import json
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import hashlib
import random
import os
from datetime import datetime
from collections import defaultdict, Counter
import re

## 2. Analysis Configuration and Logging Setup

In [10]:
# Analysis Configuration
ANALYSIS_TYPE = "2b-sysmon"
SAMPLE_SIZE = 200_000  # Number of samples to analyze
TARGET_FILE = "-ds-logs-windows-sysmon_operational-default-2025-05-04-000001.jsonl"

# Create organized output directory structure
outputs_base_dir = "outputs"
analysis_outputs_dir = f"{outputs_base_dir}/{ANALYSIS_TYPE}"
os.makedirs(analysis_outputs_dir, exist_ok=True)

# Setup logging
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_filename = f"{analysis_outputs_dir}/{ANALYSIS_TYPE}_structure_analysis_{timestamp}.log"
results_filename = f"{analysis_outputs_dir}/{ANALYSIS_TYPE}_structure_results_{timestamp}.json"

def log_print(message):
    """Print and log messages"""
    print(message)
    with open(log_filename, 'a', encoding='utf-8') as f:
        f.write(message + '\n')

# Initialize log file
log_print("SYSMON STRUCTURE CONSISTENCY ANALYSIS")
log_print(f"Analysis Type: {ANALYSIS_TYPE.upper()}")
log_print(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
log_print(f"Target File: {TARGET_FILE}")
log_print("=" * 80)
log_print("")

SYSMON STRUCTURE CONSISTENCY ANALYSIS
Analysis Type: 2B-SYSMON
Generated: 2025-06-29 11:36:37
Target File: -ds-logs-windows-sysmon_operational-default-2025-05-04-000001.jsonl



## 3. XML Parsing and Structure Analysis Utilities

In [11]:
def sanitize_xml(xml_str):
    """Clean invalid characters and repair XML structure"""
    try:
        # Remove non-printable characters
        cleaned = ''.join(c for c in xml_str if 31 < ord(c) < 127 or c in '\t\n\r')
        # Fix common XML issues using BeautifulSoup's parser
        return BeautifulSoup(cleaned, "xml").prettify()
    except:
        return xml_str  # Return original if cleaning fails

def generate_structure_fingerprint(event_id, fields_dict):
    """Generate a hash fingerprint for the structure pattern"""
    # Create structure signature: EventID + sorted field names + field types
    structure_elements = [str(event_id)]
    
    if fields_dict:
        # Add field names and their presence (not values)
        for field_name in sorted(fields_dict.keys()):
            field_value = fields_dict[field_name]
            field_type = "present" if field_value is not None else "null"
            structure_elements.append(f"{field_name}:{field_type}")
    
    # Create hash
    structure_string = "|".join(structure_elements)
    return hashlib.md5(structure_string.encode()).hexdigest()

def parse_sysmon_event_detailed(xml_str):
    """Parse XML with detailed structure analysis"""
    try:
        # Clean XML first
        clean_xml = sanitize_xml(xml_str)
        
        # Parse with explicit namespace
        namespaces = {'ns': 'http://schemas.microsoft.com/win/2004/08/events/event'}
        root = ET.fromstring(clean_xml)
        
        # System section
        system = root.find('ns:System', namespaces)
        if not system:
            return None, None, {}, None

        event_id_elem = system.find('ns:EventID', namespaces)
        computer_elem = system.find('ns:Computer', namespaces)
        
        event_id = int(event_id_elem.text) if event_id_elem is not None and event_id_elem.text else None
        computer = computer_elem.text if computer_elem is not None and computer_elem.text else None

        # EventData section - extract all fields with detailed info
        event_data = root.find('ns:EventData', namespaces)
        fields = {}
        if event_data:
            for data in event_data.findall('ns:Data', namespaces):
                name = data.get('Name')
                if name:
                    fields[name] = data.text if data.text else None

        # Generate structure fingerprint
        fingerprint = generate_structure_fingerprint(event_id, fields)
        
        return event_id, computer, fields, fingerprint

    except Exception as e:
        return None, None, {}, None

print("✅ XML parsing and structure analysis utilities loaded")

✅ XML parsing and structure analysis utilities loaded


## 4. Data Loading and Sampling

In [12]:
# Start section logging
log_print("\n" + "=" * 80)
log_print("SECTION 3: DATA LOADING AND SAMPLING")
log_print("=" * 80)
log_print("")

# Count total records
log_print(f"🔄 Loading samples from: {TARGET_FILE}")
log_print(f"📊 Counting total records...")

total_records = 0
with open(TARGET_FILE, 'r') as f:
    for line in f:
        total_records += 1

log_print(f"📈 Found {total_records:,} total records")

# Stratified sampling approach - sample every N records for better coverage
sample_interval = max(1, total_records // SAMPLE_SIZE)
log_print(f"🎯 Sampling every {sample_interval} records for stratified coverage")

# Collect samples
collected_samples = []
sample_count = 0

with open(TARGET_FILE, 'r') as f:
    for line_number, line in enumerate(f):
        if line_number % sample_interval == 0 and sample_count < SAMPLE_SIZE:
            try:
                event = json.loads(line)
                if 'event' in event and 'original' in event['event']:
                    collected_samples.append((line_number, event))
                    sample_count += 1
            except json.JSONDecodeError:
                continue

log_print(f"✅ Collected {len(collected_samples):,} samples for analysis")
log_print(f"📊 SAMPLING SUMMARY:")
log_print(f"   • Total file records: {total_records:,}")
log_print(f"   • Samples collected: {len(collected_samples):,}")
log_print(f"   • Coverage ratio: {(len(collected_samples)/total_records)*100:.2f}%")

# End section logging
log_print("\n" + "-" * 60 + " END SECTION " + "-" * 60)
log_print("")


SECTION 3: DATA LOADING AND SAMPLING

🔄 Loading samples from: -ds-logs-windows-sysmon_operational-default-2025-05-04-000001.jsonl
📊 Counting total records...
📈 Found 570,078 total records
🎯 Sampling every 2 records for stratified coverage
✅ Collected 200,000 samples for analysis
📊 SAMPLING SUMMARY:
   • Total file records: 570,078
   • Samples collected: 200,000
   • Coverage ratio: 35.08%

------------------------------------------------------------ END SECTION ------------------------------------------------------------



## 5. Structure Pattern Detection

In [13]:
# Start section logging
log_print("\n" + "=" * 80)
log_print("SECTION 4: STRUCTURE PATTERN DETECTION")
log_print("=" * 80)
log_print("")

log_print("🔍 GENERATING STRUCTURE FINGERPRINTS")
log_print("=" * 50)

# Structure analysis containers
structure_patterns = {}  # fingerprint -> pattern info
eventid_patterns = defaultdict(set)  # eventid -> set of fingerprints
pattern_samples = {}  # fingerprint -> sample record
parsing_stats = {'success': 0, 'errors': 0}

log_print(f"📋 Processing samples...")

# Process samples
for idx, (line_number, event) in enumerate(collected_samples):
    if idx % 1000 == 0:
        log_print(f"   Processed {idx} / {len(collected_samples)} records")
    
    try:
        xml_content = event['event']['original']
        event_id, computer, fields, fingerprint = parse_sysmon_event_detailed(xml_content)
        
        if event_id is not None and fingerprint:
            parsing_stats['success'] += 1
            
            # Track pattern
            if fingerprint not in structure_patterns:
                structure_patterns[fingerprint] = {
                    'count': 0,
                    'event_ids': set(),
                    'computers': set(),
                    'field_names': set(),
                    'field_count': len(fields)
                }
                pattern_samples[fingerprint] = {
                    'event_id': event_id,
                    'computer': computer,
                    'fields': fields,
                    '@timestamp': event.get('@timestamp', None)
                }
            
            # Update pattern info
            pattern = structure_patterns[fingerprint]
            pattern['count'] += 1
            pattern['event_ids'].add(event_id)
            if computer:
                pattern['computers'].add(computer)
            pattern['field_names'].update(fields.keys())
            
            # Track EventID associations
            eventid_patterns[event_id].add(fingerprint)
            
        else:
            parsing_stats['errors'] += 1
            
    except Exception:
        parsing_stats['errors'] += 1

log_print(f"✅ Fingerprinting complete!")
log_print(f"📊 STRUCTURE ANALYSIS RESULTS:")
log_print(f"   • Unique structure patterns found: {len(structure_patterns)}")
log_print(f"   • Records analyzed: {len(collected_samples):,}")
log_print(f"   • Parsing success: {parsing_stats['success']:,} ({(parsing_stats['success']/len(collected_samples))*100:.1f}%)")
log_print(f"   • Parsing errors: {parsing_stats['errors']:,} ({(parsing_stats['errors']/len(collected_samples))*100:.1f}%)")

# Pattern frequency analysis
pattern_frequencies = [(fp, info['count']) for fp, info in structure_patterns.items()]
pattern_frequencies.sort(key=lambda x: x[1], reverse=True)

if pattern_frequencies:
    most_common_count = pattern_frequencies[0][1]
    log_print(f"   • Most common pattern frequency: {most_common_count:,} records")

# End section logging
log_print("\n" + "-" * 60 + " END SECTION " + "-" * 60)
log_print("")


SECTION 4: STRUCTURE PATTERN DETECTION

🔍 GENERATING STRUCTURE FINGERPRINTS
📋 Processing samples...
   Processed 0 / 200000 records
   Processed 1000 / 200000 records
   Processed 2000 / 200000 records
   Processed 3000 / 200000 records
   Processed 4000 / 200000 records
   Processed 5000 / 200000 records
   Processed 6000 / 200000 records
   Processed 7000 / 200000 records
   Processed 8000 / 200000 records
   Processed 9000 / 200000 records
   Processed 10000 / 200000 records
   Processed 11000 / 200000 records
   Processed 12000 / 200000 records
   Processed 13000 / 200000 records
   Processed 14000 / 200000 records
   Processed 15000 / 200000 records
   Processed 16000 / 200000 records
   Processed 17000 / 200000 records
   Processed 18000 / 200000 records
   Processed 19000 / 200000 records
   Processed 20000 / 200000 records
   Processed 21000 / 200000 records
   Processed 22000 / 200000 records
   Processed 23000 / 200000 records
   Processed 24000 / 200000 records
   Processed

## 6. EventID-Specific Structure Analysis

In [14]:
# Start section logging
log_print("\n" + "=" * 80)
log_print("SECTION 5: EVENTID-SPECIFIC STRUCTURE ANALYSIS")
log_print("=" * 80)
log_print("")

log_print("📋 EVENTID-SPECIFIC STRUCTURE ANALYSIS")
log_print("=" * 60)

# Sysmon EventID descriptions
eventid_descriptions = {
    1: "Process Creation",
    2: "File Creation Time Changed", 
    3: "Network Connection",
    4: "Sysmon Service State Changed",
    5: "Process Terminated",
    6: "Driver Loaded",
    7: "Image/Library Loaded",
    8: "Create Remote Thread",
    9: "Raw Access Read",
    10: "Process Access",
    11: "File Create",
    12: "Registry Event (Object create/delete)",
    13: "Registry Event (Value Set)",
    15: "File Create Stream Hash",
    17: "Pipe Event (Pipe Created)",
    18: "Pipe Event (Pipe Connected)",
    22: "DNS Event (DNS query)",
    23: "File Delete (File Delete archived)",
    24: "Clipboard Change (New content in clipboard)",
    25: "Process Tampering (Process image change)"
}

# Analyze structure consistency per EventID
for event_id in sorted(eventid_patterns.keys()):
    patterns = eventid_patterns[event_id]
    description = eventid_descriptions.get(event_id, "Unknown EventID")
    
    log_print(f"\n🎯 EventID {event_id} - {description}")
    log_print("-" * 50)
    
    if len(patterns) == 1:
        # Single consistent pattern
        fingerprint = list(patterns)[0]
        pattern_info = structure_patterns[fingerprint]
        log_print(f"✅ CONSISTENT STRUCTURE - Single pattern detected")
        log_print(f"   • Records: {pattern_info['count']:,}")
        log_print(f"   • Field count: {pattern_info['field_count']}")
        log_print(f"   • Computers: {len(pattern_info['computers'])}")
        log_print(f"   • Fields: {sorted(list(pattern_info['field_names']))[:10]}{'...' if len(pattern_info['field_names']) > 10 else ''}")
    else:
        # Multiple patterns - inconsistent structure
        log_print(f"⚠️  INCONSISTENT STRUCTURE - {len(patterns)} patterns detected")
        
        # Sort patterns by frequency for this EventID
        eventid_pattern_freq = []
        for fp in patterns:
            # Count how many records of this EventID use this pattern
            eventid_count = sum(1 for sample_fp, sample_info in pattern_samples.items() 
                              if sample_fp == fp and sample_info['event_id'] == event_id)
            eventid_pattern_freq.append((fp, structure_patterns[fp]['count'], eventid_count))
        
        eventid_pattern_freq.sort(key=lambda x: x[2], reverse=True)
        
        for idx, (fp, total_count, eventid_count) in enumerate(eventid_pattern_freq[:3]):
            pattern_info = structure_patterns[fp]
            log_print(f"   Pattern {idx+1}: {eventid_count:,} records")
            log_print(f"     • Field count: {pattern_info['field_count']}")
            log_print(f"     • Total usage: {total_count:,} records")
            log_print(f"     • Fingerprint: {fp[:16]}...")
        
        if len(eventid_pattern_freq) > 3:
            log_print(f"   ... and {len(eventid_pattern_freq) - 3} more patterns")

# End section logging
log_print("\n" + "-" * 60 + " END SECTION " + "-" * 60)
log_print("")


SECTION 5: EVENTID-SPECIFIC STRUCTURE ANALYSIS

📋 EVENTID-SPECIFIC STRUCTURE ANALYSIS

🎯 EventID 1 - Process Creation
--------------------------------------------------
✅ CONSISTENT STRUCTURE - Single pattern detected
   • Records: 605
   • Field count: 23
   • Computers: 4
   • Fields: ['CommandLine', 'Company', 'CurrentDirectory', 'Description', 'FileVersion', 'Hashes', 'Image', 'IntegrityLevel', 'LogonGuid', 'LogonId']...

🎯 EventID 2 - File Creation Time Changed
--------------------------------------------------
✅ CONSISTENT STRUCTURE - Single pattern detected
   • Records: 30
   • Field count: 9
   • Computers: 4
   • Fields: ['CreationUtcTime', 'Image', 'PreviousCreationUtcTime', 'ProcessGuid', 'ProcessId', 'RuleName', 'TargetFilename', 'User', 'UtcTime']

🎯 EventID 3 - Network Connection
--------------------------------------------------
✅ CONSISTENT STRUCTURE - Single pattern detected
   • Records: 8,269
   • Field count: 18
   • Computers: 4
   • Fields: ['DestinationHostname

## 7. Detailed Pattern Analysis and Sample Display

In [15]:
# Start section logging
log_print("\n" + "=" * 80)
log_print("SECTION 6: DETAILED PATTERN ANALYSIS")
log_print("=" * 80)
log_print("")

log_print("📋 DETAILED PATTERN ANALYSIS")
log_print("=" * 60)

# Show top patterns with detailed analysis
top_patterns = pattern_frequencies[:5]  # Top 5 most common patterns

for idx, (fingerprint, count) in enumerate(top_patterns, 1):
    pattern_info = structure_patterns[fingerprint]
    sample = pattern_samples[fingerprint]
    
    percentage = (count / parsing_stats['success']) * 100
    
    log_print(f"\n🔍 PATTERN #{idx}")
    log_print("-" * 40)
    log_print(f"📊 Frequency: {count:,} records ({percentage:.2f}%)")
    log_print(f"🔑 Structure Hash: {fingerprint[:16]}...")
    log_print(f"🎯 EventID(s): {sorted(list(pattern_info['event_ids']))}")
    log_print(f"📏 Field count: {pattern_info['field_count']}")
    log_print(f"🖥️ Computer count: {len(pattern_info['computers'])}")
    
    # Show field names
    field_names = sorted(list(pattern_info['field_names']))
    log_print(f"📋 Fields: {field_names}")
    
    # Show sample record
    log_print(f"\n📄 Sample Record:")
    log_print(f"   EventID: {sample['event_id']}")
    log_print(f"   Computer: {sample['computer']}")
    log_print(f"   @timestamp: {sample['@timestamp']}")
    log_print(f"   Fields:")
    
    for field_name, field_value in sample['fields'].items():
        # Truncate long values for readability
        display_value = str(field_value)[:80] + "..." if field_value and len(str(field_value)) > 80 else field_value
        log_print(f"     • {field_name:20s}: {display_value}")

# Pattern classification
log_print(f"\n🏷️ PATTERN CLASSIFICATION:")
log_print("=" * 40)

# Classify patterns by frequency
total_patterns = len(structure_patterns)
common_threshold = parsing_stats['success'] * 0.05  # 5% threshold
rare_threshold = parsing_stats['success'] * 0.01    # 1% threshold

common_patterns = sum(1 for _, count in pattern_frequencies if count >= common_threshold)
uncommon_patterns = sum(1 for _, count in pattern_frequencies if rare_threshold <= count < common_threshold)
rare_patterns = sum(1 for _, count in pattern_frequencies if count < rare_threshold)

log_print(f"• COMMON patterns (≥5%): {common_patterns}")
log_print(f"• UNCOMMON patterns (1-5%): {uncommon_patterns}")
log_print(f"• RARE patterns (<1%): {rare_patterns}")
log_print(f"• Total patterns: {total_patterns}")

# End section logging
log_print("\n" + "-" * 60 + " END SECTION " + "-" * 60)
log_print("")


SECTION 6: DETAILED PATTERN ANALYSIS

📋 DETAILED PATTERN ANALYSIS

🔍 PATTERN #1
----------------------------------------
📊 Frequency: 89,455 records (44.73%)
🔑 Structure Hash: 4dc5ca65df001f87...
🎯 EventID(s): [12]
📏 Field count: 8
🖥️ Computer count: 4
📋 Fields: ['EventType', 'Image', 'ProcessGuid', 'ProcessId', 'RuleName', 'TargetObject', 'User', 'UtcTime']

📄 Sample Record:
   EventID: 12
   Computer: 
   diskjockey.boombox.local
  
   @timestamp: 2025-05-04T11:30:05.479Z
   Fields:
     • RuleName            : 
   -
  
     • EventType           : 
   CreateKey
  
     • UtcTime             : 
   2025-05-04 11:30:05.479
  
     • ProcessGuid         : 
   {acb80d05-4f9d-6817-3800-000000001600}
  
     • ProcessId           : 
   2716
  
     • Image               : 
   C:\Windows\Sysmon64.exe
  
     • TargetObject        : 
   HKU\.DEFAULT\Software\Microsoft\SystemCertificates\Disallowed
  
     • User                : 
   NT AUTHORITY\SYSTEM
  

🔍 PATTERN #2
---------------------

## 8. Field Co-occurrence Analysis

In [16]:
# Start section logging
log_print("\n" + "=" * 80)
log_print("SECTION 7: FIELD CO-OCCURRENCE ANALYSIS")
log_print("=" * 80)
log_print("")

log_print("🔗 FIELD CO-OCCURRENCE ANALYSIS")
log_print("=" * 50)

# Collect field statistics
field_counts = Counter()
field_combinations = Counter()
eventid_field_matrix = defaultdict(lambda: defaultdict(int))

# Process all samples for field statistics
for fingerprint, pattern_info in structure_patterns.items():
    field_set = pattern_info['field_names']
    record_count = pattern_info['count']
    event_ids = pattern_info['event_ids']
    
    # Count individual fields
    for field in field_set:
        field_counts[field] += record_count
    
    # Count field combinations (as frozenset for consistency)
    field_combination = frozenset(field_set)
    field_combinations[field_combination] += record_count
    
    # Build EventID-field matrix
    for event_id in event_ids:
        for field in field_set:
            eventid_field_matrix[event_id][field] += record_count

log_print(f"📊 Field presence analysis:")
log_print(f"   • Total unique field names: {len(field_counts)}")
log_print(f"   • Unique field combinations: {len(field_combinations)}")

# Most common fields
log_print(f"\n📈 Most common fields (top 15):")
for field, count in field_counts.most_common(15):
    percentage = (count / parsing_stats['success']) * 100
    log_print(f"   {field:30s} {count:8,} ({percentage:5.1f}%)")

# Field combinations
log_print(f"\n🎯 Most common field combinations (top 8):")
for field_combo, count in field_combinations.most_common(8):
    percentage = (count / parsing_stats['success']) * 100
    combo_size = len(field_combo)
    log_print(f"   Combination with {combo_size:2d} fields: {count:8,} records ({percentage:5.1f}%)")

# EventID-specific field analysis
log_print(f"\n📋 EventID-specific field patterns:")
for event_id in sorted(eventid_field_matrix.keys()):
    fields_for_eventid = eventid_field_matrix[event_id]
    total_records_eventid = sum(fields_for_eventid.values()) // len(fields_for_eventid) if fields_for_eventid else 0
    
    if total_records_eventid > 0:
        description = eventid_descriptions.get(event_id, "Unknown")
        log_print(f"\n   EventID {event_id} ({description}):")
        
        # Show top fields for this EventID
        eventid_field_list = [(field, count) for field, count in fields_for_eventid.items()]
        eventid_field_list.sort(key=lambda x: x[1], reverse=True)
        
        for field, count in eventid_field_list[:8]:  # Top 8 fields
            percentage = (count / total_records_eventid) * 100 if total_records_eventid > 0 else 0
            log_print(f"     • {field:25s}: {count:6,} ({percentage:5.1f}%)")
        
        if len(eventid_field_list) > 8:
            log_print(f"     ... and {len(eventid_field_list) - 8} more fields")

# End section logging
log_print("\n" + "-" * 60 + " END SECTION " + "-" * 60)
log_print("")


SECTION 7: FIELD CO-OCCURRENCE ANALYSIS

🔗 FIELD CO-OCCURRENCE ANALYSIS
📊 Field presence analysis:
   • Total unique field names: 68
   • Unique field combinations: 17

📈 Most common fields (top 15):
   UtcTime                         200,000 (100.0%)
   RuleName                        199,998 (100.0%)
   ProcessId                       172,566 ( 86.3%)
   Image                           172,566 ( 86.3%)
   ProcessGuid                     172,566 ( 86.3%)
   User                            172,566 ( 86.3%)
   EventType                       128,368 ( 64.2%)
   TargetObject                    127,428 ( 63.7%)
   Details                          37,973 ( 19.0%)
   Hashes                           33,043 ( 16.5%)
   Description                      31,867 ( 15.9%)
   FileVersion                      31,867 ( 15.9%)
   Product                          31,867 ( 15.9%)
   OriginalFileName                 31,867 ( 15.9%)
   Company                          31,867 ( 15.9%)

🎯 Most common fiel

## 9. Structure Consistency Report

In [17]:
# Start section logging
log_print("\n" + "=" * 80)
log_print("SECTION 8: STRUCTURE CONSISTENCY REPORT")
log_print("=" * 80)
log_print("")

log_print("📊 STRUCTURE CONSISTENCY REPORT")
log_print("=" * 60)

# Calculate consistency metrics
total_patterns = len(structure_patterns)
total_eventids = len(eventid_patterns)
total_records = parsing_stats['success']

# Coverage analysis
if pattern_frequencies:
    top_3_coverage = sum(count for _, count in pattern_frequencies[:3])
    top_5_coverage = sum(count for _, count in pattern_frequencies[:5])
    top_10_coverage = sum(count for _, count in pattern_frequencies[:min(10, len(pattern_frequencies))])
else:
    top_3_coverage = top_5_coverage = top_10_coverage = 0

# EventID consistency
consistent_eventids = sum(1 for eid, patterns in eventid_patterns.items() if len(patterns) == 1)
inconsistent_eventids = total_eventids - consistent_eventids

log_print(f"🔍 CONSISTENCY METRICS:")
log_print(f"   • Total unique structure patterns: {total_patterns}")
log_print(f"   • Total EventIDs analyzed: {total_eventids}")
log_print(f"   • Consistent EventIDs (single pattern): {consistent_eventids}")
log_print(f"   • Inconsistent EventIDs (multiple patterns): {inconsistent_eventids}")
log_print(f"   • Structure diversity ratio: {(total_patterns/total_records)*100:.3f}%")

log_print(f"\n📈 COVERAGE ANALYSIS:")
if total_records > 0:
    log_print(f"   • Top 3 patterns cover: {(top_3_coverage/total_records)*100:.1f}% of data")
    log_print(f"   • Top 5 patterns cover: {(top_5_coverage/total_records)*100:.1f}% of data")
    log_print(f"   • Top 10 patterns cover: {(top_10_coverage/total_records)*100:.1f}% of data")

# Determine overall assessment
consistency_ratio = consistent_eventids / total_eventids if total_eventids > 0 else 0
diversity_ratio = total_patterns / total_records if total_records > 0 else 1

if consistency_ratio >= 0.8 and diversity_ratio <= 0.01:
    assessment = "HIGHLY CONSISTENT"
    status_emoji = "🟢"
elif consistency_ratio >= 0.6 and diversity_ratio <= 0.05:
    assessment = "MODERATELY CONSISTENT"
    status_emoji = "🟡"
else:
    assessment = "INCONSISTENT - REQUIRES ATTENTION"
    status_emoji = "🔴"

log_print(f"\n{status_emoji} OVERALL ASSESSMENT: {assessment}")

# Processing recommendations
log_print(f"\n💡 PROCESSING RECOMMENDATIONS:")

if consistency_ratio >= 0.8:
    log_print(f"   🟢 High consistency - recommend EventID-specific processing")
    log_print(f"   🟢 Most EventIDs have single, stable structure patterns")
    log_print(f"   🟢 Standard field mapping approach will work well")
elif consistency_ratio >= 0.6:
    log_print(f"   🟡 Moderate consistency - recommend hybrid processing")
    log_print(f"   🟡 Primary pattern for each EventID with fallback handling")
    log_print(f"   🟡 Field validation and error handling required")
else:
    log_print(f"   🔴 Low consistency - recommend robust error handling")
    log_print(f"   🔴 Multiple processing pipelines may be needed")
    log_print(f"   🔴 Extensive field validation and schema flexibility required")

# Field handling strategies
always_present_fields = [field for field, count in field_counts.items() if count >= total_records * 0.95]
conditional_fields = [field for field, count in field_counts.items() if 0.5 <= (count/total_records) < 0.95]
rare_fields = [field for field, count in field_counts.items() if count < total_records * 0.5]

log_print(f"\n🛠️ FIELD HANDLING STRATEGIES:")
log_print(f"   • Always present fields ({len(always_present_fields)}): Standard extraction")
log_print(f"   • Conditional fields ({len(conditional_fields)}): Null handling required")
log_print(f"   • Rare fields ({len(rare_fields)}): Consider exclusion or special handling")

# Executive summary
log_print(f"\n🎯 EXECUTIVE SUMMARY:")
log_print(f"   Dataset shows {assessment.lower()} with {total_patterns} unique patterns.")
log_print(f"   {consistent_eventids}/{total_eventids} EventIDs have consistent structure.")
log_print(f"   Recommended approach: {'EventID-specific' if consistency_ratio >= 0.8 else 'Hybrid with validation' if consistency_ratio >= 0.6 else 'Robust multi-pattern'} processing.")

# End section logging
log_print("\n" + "-" * 60 + " END SECTION " + "-" * 60)
log_print("")


SECTION 8: STRUCTURE CONSISTENCY REPORT

📊 STRUCTURE CONSISTENCY REPORT
🔍 CONSISTENCY METRICS:
   • Total unique structure patterns: 20
   • Total EventIDs analyzed: 18
   • Consistent EventIDs (single pattern): 16
   • Inconsistent EventIDs (multiple patterns): 2
   • Structure diversity ratio: 0.010%

📈 COVERAGE ANALYSIS:
   • Top 3 patterns cover: 79.3% of data
   • Top 5 patterns cover: 97.1% of data
   • Top 10 patterns cover: 99.5% of data

🟢 OVERALL ASSESSMENT: HIGHLY CONSISTENT

💡 PROCESSING RECOMMENDATIONS:
   🟢 High consistency - recommend EventID-specific processing
   🟢 Most EventIDs have single, stable structure patterns
   🟢 Standard field mapping approach will work well

🛠️ FIELD HANDLING STRATEGIES:
   • Always present fields (2): Standard extraction
   • Conditional fields (6): Null handling required
   • Rare fields (60): Consider exclusion or special handling

🎯 EXECUTIVE SUMMARY:
   Dataset shows highly consistent with 20 unique patterns.
   16/18 EventIDs have con

## 10. Save Analysis Results

In [18]:
# Prepare results for JSON export
results = {
    'analysis_metadata': {
        'analysis_type': ANALYSIS_TYPE,
        'timestamp': datetime.now().isoformat(),
        'target_file': TARGET_FILE,
        'sample_size': len(collected_samples),
        'total_records': total_records
    },
    'parsing_statistics': parsing_stats,
    'structure_patterns': {
        'total_patterns': total_patterns,
        'pattern_frequencies': pattern_frequencies[:10],  # Top 10
        'pattern_classification': {
            'common': common_patterns,
            'uncommon': uncommon_patterns,
            'rare': rare_patterns
        }
    },
    'eventid_analysis': {
        'total_eventids': total_eventids,
        'consistent_eventids': consistent_eventids,
        'inconsistent_eventids': inconsistent_eventids,
        'eventid_pattern_counts': {str(eid): len(patterns) for eid, patterns in eventid_patterns.items()}
    },
    'field_analysis': {
        'total_unique_fields': len(field_counts),
        'always_present_fields': always_present_fields,
        'conditional_fields': conditional_fields[:20],  # Limit size
        'rare_fields': rare_fields[:20],  # Limit size
        'field_combinations': len(field_combinations)
    },
    'consistency_metrics': {
        'consistency_ratio': consistency_ratio,
        'diversity_ratio': diversity_ratio,
        'assessment': assessment,
        'coverage': {
            'top_3': (top_3_coverage/total_records)*100 if total_records > 0 else 0,
            'top_5': (top_5_coverage/total_records)*100 if total_records > 0 else 0,
            'top_10': (top_10_coverage/total_records)*100 if total_records > 0 else 0
        }
    }
}

# Save results
log_print(f"💾 Saving detailed analysis results...")
with open(results_filename, 'w') as f:
    json.dump(results, f, indent=2)

log_print(f"✅ Results saved to: {results_filename}")
log_print(f"📁 Output directory: {analysis_outputs_dir}")
log_print(f"🎉 Sysmon structure consistency analysis complete!")

print(f"\n📋 Analysis complete! Results saved to: {log_filename}")
print(f"📊 Detailed results saved to: {results_filename}")
print(f"📁 Output directory: {analysis_outputs_dir}")
print(f"🎉 Sysmon structure consistency analysis complete!")

💾 Saving detailed analysis results...
✅ Results saved to: outputs/2b-sysmon/2b-sysmon_structure_results_20250629_113637.json
📁 Output directory: outputs/2b-sysmon
🎉 Sysmon structure consistency analysis complete!

📋 Analysis complete! Results saved to: outputs/2b-sysmon/2b-sysmon_structure_analysis_20250629_113637.log
📊 Detailed results saved to: outputs/2b-sysmon/2b-sysmon_structure_results_20250629_113637.json
📁 Output directory: outputs/2b-sysmon
🎉 Sysmon structure consistency analysis complete!
