Hypothesis:

APTs adjust TTPs in response to defensive measures and signs of detection.

In [1]:
import pandas as pd
import numpy as np
import os
import re

In [2]:
path = os.path.split(os.getcwd())[0] + '/data/unraveled-apt/host-logs/audit/'
files = [path + x for x in os.listdir(path)]
d = dict()

In [3]:
for f in files:
    key = os.path.split(f)[1]
    d[key] = pd.read_csv(f, delimiter=';')
    
audit_df = pd.concat(d.values())

In [4]:
del d

On the last row, there appears to be some preceeding whitespace in the LogEvent column. Lets handle that:

In [5]:
for col in audit_df.columns:
    try:
        audit_df[col] = audit_df[col].str.strip()
    except:
        continue

In [6]:
pat = re.compile(r"msg=('.*')")

In [None]:
msg_df = audit_df.LogEvent.str.extract(r"msg=('.*')")
no_msg = audit_df.LogEvent.str.replace(r"msg=('.*')", repl='', regex=True)

We are having to do some funky business to retain the `msg` field. If we were to split the output below on spaces or `=`, it would mangle the message and split it up in a less than ideal way for maintaining data integrity.

In [14]:
msg_df.iloc[5][0]

'\'op=PAM:session_close acct="root" exe="/usr/bin/sudo" hostname=? addr=? terminal=/dev/pts/5 res=success\''

Here is what the `no_msg` series looks like now. We can proceeed with converting this into a DataFrame.

In [None]:
print(no_msg.iloc[111])
print(audit_df.LogEvent.iloc[111])

type=USER_START ts=1621862701.432 tsid=600 pid=15765 uid=0 auid=0 ses=3575 
type=USER_START ts=1621862701.432 tsid=600 pid=15765 uid=0 auid=0 ses=3575 msg='op=PAM:session_open acct="root" exe="/usr/sbin/cron" hostname=? addr=? terminal=cron res=success'


In [28]:
logs = no_msg.str.split()

In [47]:
expand_logs = logs.apply(lambda x: {b[0]: b[-1] for b in [a.split('=', maxsplit=1) for a in x]}).to_dict()
expand_logs[0]

{'type': 'DAEMON_START',
 'ts': '1621723084.486',
 'tsid': '7423',
 'op': 'start',
 'ver': '3.0',
 'format': 'enriched',
 'kernel': '5.4.0-kali4-amd64',
 'auid': '4294967295',
 'pid': '26701',
 'uid': '0',
 'ses': '4294967295',
 'subj': 'unconfined',
 'res': 'success',
 'AUID': '"unset"',
 'UID': '"root"'}

In [48]:
log_df = pd.DataFrame(expand_logs).T
log_df.head(6)

del expand_logs

In [None]:
log_df.columns  # no msg column

Index(['type', 'ts', 'tsid', 'op', 'ver', 'format', 'kernel', 'auid', 'pid',
       'uid', 'ses', 'subj', 'res', 'AUID', 'UID', 'audit_backlog_limit',
       'old', 'audit_failure', 'audit_backlog_wait_time', 'old-auid', 'tty',
       'old-ses', 'OLD-AUID', 'ID', 'gid', 'comm', 'exe', 'sig', 'GID'],
      dtype='object')

In [None]:
from io import StringIO

def parse_event_details(activity_text):
    """
    Extract structured fields from Windows Event Log Activity text.
    Returns a dictionary with parsed fields.
    """
    if pd.isna(activity_text):
        return {}
    
    details = {}
    
    # Extract the main activity description (first line before structured data)
    activity_match = re.match(r'^([^,\n]+)', activity_text)
    if activity_match:
        details['activity_description'] = activity_match.group(1).strip()
    
    # Extract Subject information
    subject_pattern = r'Subject:.*?Security ID:\s*([^\n]+).*?Account Name:\s*([^\n]+).*?Account Domain:\s*([^\n]+).*?Logon ID:\s*([^\n]+)'
    subject_match = re.search(subject_pattern, activity_text, re.DOTALL)
    if subject_match:
        details['subject_sid'] = subject_match.group(1).strip()
        details['subject_account'] = subject_match.group(2).strip()
        details['subject_domain'] = subject_match.group(3).strip()
        details['subject_logon_id'] = subject_match.group(4).strip()
    
    # Extract User information
    user_pattern = r'User:.*?Security ID:\s*([^\n]+).*?Account Name:\s*([^\n]+).*?Account Domain:\s*([^\n]+)'
    user_match = re.search(user_pattern, activity_text, re.DOTALL)
    if user_match:
        details['user_sid'] = user_match.group(1).strip()
        details['user_account'] = user_match.group(2).strip()
        details['user_domain'] = user_match.group(3).strip()
    
    # Extract Process information
    process_pattern = r'Process Information:.*?Process ID:\s*([^\n]+).*?Process Name:\s*([^\n]+)'
    process_match = re.search(process_pattern, activity_text, re.DOTALL)
    if process_match:
        details['process_id'] = process_match.group(1).strip()
        details['process_name'] = process_match.group(2).strip().strip('"')
    
    return details


def clean_unraveled_csv(csv_path, output_path=None):
    """
    Read and parse Unraveled dataset CSV with embedded structured text.
    
    Parameters:
    -----------
    csv_path : str
        Path to the input CSV file
    output_path : str, optional
        Path to save the cleaned CSV. If None, returns DataFrame only.
    
    Returns:
    --------
    pd.DataFrame
        Cleaned and parsed dataframe
    """
    
    # Read CSV with proper handling of multi-line fields
    df = pd.read_csv(csv_path, 
                     encoding='utf-8',
                     on_bad_lines='skip')  # Skip malformed lines
    
    print(f"Original shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}\n")
    
    # Parse the Activity column into structured fields
    print("Parsing Activity column...")
    parsed_details = df['Activity'].apply(parse_event_details)
    details_df = pd.DataFrame(parsed_details.tolist())
    
    # Combine with original dataframe (excluding raw Activity column)
    df_clean = pd.concat([
        df.drop('Activity', axis=1),
        details_df
    ], axis=1)
    
    # Convert Date and Time to datetime
    if 'Date and Time' in df_clean.columns:
        df_clean['DateTime'] = pd.to_datetime(df_clean['Date and Time'], 
                                               format='%m/%d/%Y %I:%M:%S %p',
                                               errors='coerce')
        df_clean = df_clean.sort_values('DateTime')
    
    # Convert Event ID to numeric
    if 'Event ID' in df_clean.columns:
        df_clean['Event ID'] = pd.to_numeric(df_clean['Event ID'], errors='coerce')
    
    print(f"Cleaned shape: {df_clean.shape}")
    print(f"New columns added: {[col for col in df_clean.columns if col not in df.columns]}\n")
    
    # Save if output path provided
    if output_path:
        df_clean.to_csv(output_path, index=False)
        print(f"Saved cleaned data to: {output_path}")
    
    return df_clean


def extract_feature_summary(df):
    """
    Extract summary statistics useful for behavioral analysis.
    """
    summary = {
        'total_events': len(df),
        'unique_accounts': df['subject_account'].nunique() if 'subject_account' in df.columns else 0,
        'unique_processes': df['process_name'].nunique() if 'process_name' in df.columns else 0,
        'unique_event_ids': df['Event ID'].nunique() if 'Event ID' in df.columns else 0,
        'date_range': (df['DateTime'].min(), df['DateTime'].max()) if 'DateTime' in df.columns else None,
        'stages': df['Stage'].value_counts().to_dict() if 'Stage' in df.columns else {},
        'defender_responses': df['DefenderResponse'].value_counts().to_dict() if 'DefenderResponse' in df.columns else {}
    }
    
    return summary


# Example usage
if __name__ == "__main__":
    # Example with sample data
    file = open('/Users/ethnsyrs/Documents/Data-Analytics-Capstone/data/unraveled-apt/host-logs/windows/10_1_2_21-windows-securityevents-user_labeled.csv', 'r', encoding='utf-8')
    # Parse sample data
    df = pd.read_csv(StringIO(str(file.readlines())))
    
    file.close()
    

KeyboardInterrupt: 

In [None]:
parsed_details = df['Activity'].apply(parse_event_details)
details_df = pd.DataFrame(parsed_details.tolist())
df_clean = pd.concat([df.drop('Activity', axis=1), details_df], axis=1)
print("Sample parsed data:")
print(df_clean[['Event ID', 'subject_account', 'process_name', 'Stage']].head())
print("\n--- To use with your actual file: ---")
print("df_clean = clean_unraveled_csv('path/to/your/file.csv', 'cleaned_output.csv')")
print("summary = extract_feature_summary(df_clean)")

Unnamed: 0,Keywords,Date and Time,Source,Event ID,Task Category,Stage,DefenderResponse,Signature,activity_description,subject_sid,subject_account,subject_domain,subject_logon_id,user_sid,user_account,user_domain,process_id,process_name
0,Audit Success,7/17/2021 10:03:54 PM,Microsoft-Windows-Security-Auditing,4798,User Account Management,,,,A user's local group membership was enumerated.,DESKTOP-56DUI1B\ansible,ansible,DESKTOP-56DUI1B,0x6BF90,DESKTOP-56DUI1B\ansible,ansible,DESKTOP-56DUI1B,0x1528,C:\Windows\System32\mmc.exe
1,Audit Success,7/17/2021 10:00:23 PM,Microsoft-Windows-Security-Auditing,4798,User Account Management,,,,A user's local group membership was enumerated.,DESKTOP-56DUI1B\ansible,ansible,DESKTOP-56DUI1B,0x6BF90,DESKTOP-56DUI1B\ansible,ansible,DESKTOP-56DUI1B,0x1528,C:\Windows\System32\mmc.exe
2,Audit Success,7/17/2021 10:00:23 PM,Microsoft-Windows-Security-Auditing,4798,User Account Management,,,,A user's local group membership was enumerated.,DESKTOP-56DUI1B\ansible,ansible,DESKTOP-56DUI1B,0x6BFEA,DESKTOP-56DUI1B\ansible,ansible,DESKTOP-56DUI1B,0x140c,C:\Windows\explorer.exe
3,Audit Success,7/17/2021 10:00:15 PM,Microsoft-Windows-Security-Auditing,4798,User Account Management,,,,A user's local group membership was enumerated.,DESKTOP-56DUI1B\ansible,ansible,DESKTOP-56DUI1B,0x6BF90,DESKTOP-56DUI1B\ansible,ansible,DESKTOP-56DUI1B,0x1528,C:\Windows\System32\mmc.exe
4,Audit Success,7/17/2021 9:59:07 PM,Microsoft-Windows-Security-Auditing,5379,User Account Management,,,,Credential Manager credentials were read.,DESKTOP-56DUI1B\ansible,ansible,DESKTOP-56DUI1B,0x6BFEA,,,,,


In [22]:
summary = extract_feature_summary(df_clean)