In [12]:
import re

# Example pair for trying out the extractor: a template in which every "<*>"
# marks a variable field, and a sample log line to run it against.
template = "pid=<*> uid=<*> old-auid=<*> auid=<*> tty=<*> old-ses=<*> ses=<*> res=<*>"
log = "pid=1234 uid=1000 old-auid=4294967295 auid=1000 tty=(pts/1) old-ses=4294967295 ses=2 res=success"

def extract_variables(log, template):
    """Extract the values a log line holds at the ``<*>`` wildcards of a template.

    Runs of whitespace are collapsed to a single space and surrounding
    whitespace is removed in both ``log`` and ``template`` before matching, so
    a difference in spacing alone never prevents a match.

    Args:
        log: Raw log line.
        template: Pattern in which each ``<*>`` marks a variable field; all
            other text is matched literally.

    Returns:
        A tuple with one captured string per ``<*>``, in order of appearance,
        or ``None`` when the log does not fit the template. A template without
        any ``<*>`` yields the empty tuple ``()`` on success, so callers should
        compare the result against ``None`` instead of relying on truthiness.
    """
    log = re.sub(r'\s+', ' ', log.strip())
    # The log is whitespace-normalized, so the template has to be normalized
    # the same way; otherwise a template containing a tab or a double space
    # could never match any log.
    template = re.sub(r'\s+', ' ', template.strip())

    # Escape the literal text between wildcards so characters such as "(" or
    # "." are matched verbatim.
    literal_parts = [re.escape(part) for part in template.split("<*>")]
    # Non-greedy groups: each wildcard captures the shortest text that still
    # lets the remainder of the template match.
    regex = "(.*?)".join(literal_parts)

    match = re.fullmatch(regex, log)
    return match.groups() if match else None
    
# Run the extractor on the sample pair; as the last expression of the cell the
# returned tuple is shown as the cell's output.
extract_variables(log, template)

('1234', '1000', '4294967295', '1000', '(pts/1)', '4294967295', '2', 'success')

In [13]:
def get_logs(path):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        path: Location of the file to read.

    Returns:
        A list with one string per line of the file, each stripped of leading
        and trailing whitespace (blank lines become empty strings).
    """
    with open(path, "r") as handle:
        return [raw_line.strip() for raw_line in handle]

# Load every line of the audit log into memory as a list of stripped strings.
# NOTE(review): this path spells "russelmitchell" (single "l" before "mitchell")
# while the other paths in this notebook use "russellmitchell" — confirm it
# matches the file name on disk.
logs = get_logs("audit/russelmitchell_intranet_server_audit.log")

In [14]:
import re
import pandas as pd

# Layout of one log line: each "<Name>" is a named field to extract, everything
# else is literal text that must appear in the line.
log_format = "<Type> msg=audit(<Time>): <Content>"

def match_templates(logs, templates, log_format):
    """Find, for every log line, the first template that matches it.

    Each ``<Name>`` placeholder of ``log_format`` is turned into the ``<*>``
    wildcard, every template is appended to that wildcarded format, and each
    log is tried against the resulting patterns in the order of ``templates``.

    Args:
        logs: Iterable of raw log lines.
        templates: Sequence of event templates that use ``<*>`` as wildcard.
        log_format: Line layout with ``<Name>`` placeholders.

    Returns:
        A list with one entry per log, in log order: the first template that
        matched that log.

    Raises:
        ValueError: If a log matches none of the templates.
    """
    # Replace whole "<Name>" placeholders. Substituting the bare names instead
    # would also rewrite literal text that happens to contain a field name,
    # break when one name is a prefix of another, and (with no placeholders at
    # all) insert "*" between every character because the pattern is empty.
    log_format_wildcarded = re.sub(r'<.*?>', '<*>', log_format)
    # NOTE(review): the template is appended right after the wildcarded format.
    # If log_format ends in a placeholder, that trailing wildcard can absorb
    # any leading part of the content, so a template only has to match the end
    # of the line — confirm this is intended.
    templates_incl_format = [log_format_wildcarded + template for template in templates]

    matched_templates_sorted = []
    for log in logs:
        for template, pattern in zip(templates, templates_incl_format):
            # Compare against None: a successful match without any wildcard
            # returns an empty (falsy) tuple.
            if extract_variables(log, pattern) is not None:
                matched_templates_sorted.append(template)
                break
        else:
            # No template matched this log.
            raise ValueError(f"Unmatched log: {log}")
    print("All matched!")
    return matched_templates_sorted

# Load the template table; its "EventID" and "EventTemplate" columns are used below.
df_templates = pd.read_csv("audit/russellmitchell_audit_templates.csv")
event_id = df_templates["EventID"].tolist()
templates = df_templates["EventTemplate"].tolist()
# Lookup from template text to event id (if a template text repeats, the last id wins).
templates_dict = dict(zip(templates, event_id))


# One matched template per log line, in log order; raises ValueError on the
# first log that fits no template.
matched_templates_sorted = match_templates(logs, templates, log_format)

All matched!


In [15]:
import pandas as pd

def match_log_format(logs, log_format):
    """Split every log line into the fields named in ``log_format``.

    Each ``<Name>`` placeholder of ``log_format`` is turned into the ``<*>``
    wildcard; the wildcarded format is printed and then matched against every
    log line.

    Args:
        logs: Iterable of raw log lines.
        log_format: Line layout with ``<Name>`` placeholders.

    Returns:
        A list of rows, one per log, each of the form
        ``[line_index, field_1, field_2, ...]`` with ``line_index`` starting
        at 0 and the fields in placeholder order.

    Raises:
        ValueError: If a log does not fit ``log_format``.
    """
    # Replace whole "<Name>" placeholders. Substituting the bare names instead
    # would also rewrite literal text that happens to contain a field name,
    # break when one name is a prefix of another, and (with no placeholders at
    # all) insert "*" between every character because the pattern is empty.
    log_format_wildcarded = re.sub(r'<.*?>', '<*>', log_format)
    print(log_format_wildcarded)

    rows = []
    for i, log in enumerate(logs):
        extraction = extract_variables(log, log_format_wildcarded)
        # Compare against None: a successful match without any wildcard
        # returns an empty (falsy) tuple.
        if extraction is None:
            raise ValueError(f"Unmatched log: {log}")
        rows.append([i, *extraction])
    return rows

# Split every log line into the fields of log_format; each row is
# [line index, field values...].
matched_log_format = match_log_format(logs, log_format)
# Last expression of the cell, so the full list is rendered as its output.
matched_log_format

<*> msg=audit(<*>): <*>


[[0,
  'type=USER_ACCT',
  '1642723741.072:375',
  'pid=10125 uid=0 auid=4294967295 ses=4294967295 msg=\'op=PAM:accounting acct="root" exe="/usr/sbin/cron" hostname=? addr=? terminal=cron res=success\''],
 [1,
  'type=CRED_ACQ',
  '1642723741.072:376',
  'pid=10125 uid=0 auid=4294967295 ses=4294967295 msg=\'op=PAM:setcred acct="root" exe="/usr/sbin/cron" hostname=? addr=? terminal=cron res=success\''],
 [2,
  'type=LOGIN',
  '1642723741.076:377',
  'pid=10125 uid=0 old-auid=4294967295 auid=0 tty=(none) old-ses=4294967295 ses=65 res=1'],
 [3,
  'type=USER_START',
  '1642723741.080:378',
  'pid=10125 uid=0 auid=0 ses=65 msg=\'op=PAM:session_open acct="root" exe="/usr/sbin/cron" hostname=? addr=? terminal=cron res=success\''],
 [4,
  'type=CRED_DISP',
  '1642723741.084:379',
  'pid=10125 uid=0 auid=0 ses=65 msg=\'op=PAM:setcred acct="root" exe="/usr/sbin/cron" hostname=? addr=? terminal=cron res=success\''],
 [5,
  'type=USER_END',
  '1642723741.084:380',
  'pid=10125 uid=0 auid=0 ses=65 

In [17]:
# Both lists must describe the same log lines in the same order; zip() would
# otherwise silently drop the surplus entries of the longer list.
if len(matched_log_format) != len(matched_templates_sorted):
    raise ValueError(
        f"Row count mismatch: {len(matched_log_format)} parsed logs vs "
        f"{len(matched_templates_sorted)} matched templates"
    )

# Each row: [LineId, <log_format fields...>, EventId, EventTemplate].
templates_structures = [mlf + [templates_dict[mts]] + [mts] for mlf, mts in zip(matched_log_format, matched_templates_sorted)]
df_structured = pd.DataFrame(templates_structures, columns=["LineId"] + re.findall(r'<(.*?)>', log_format) + ["EventId", "EventTemplate"])
df_structured.to_csv("audit/russellmitchell_audit_structured.csv", index=False)
# Preview goes last: only the final expression of a cell is rendered.
df_structured.head()