Hypothesis:

APTs adjust TTPs in response to defensive measures and signs of detection.

# Linux Host Logs

## `audit`

In [1]:
import pandas as pd
import numpy as np
import os
import re

In [2]:
# Directory of audit log exports, resolved relative to the parent of the
# current working directory.
path = os.path.split(os.getcwd())[0] + '/data/unraveled-apt/host-logs/audit/'
# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the concatenated frame (and positional lookups such as .iloc[111]) stable
# across runs and machines.
files = [os.path.join(path, name) for name in sorted(os.listdir(path))]
# Filled by the loading cell below: file name -> DataFrame.
d = {}

In [3]:
# Parse each semicolon-delimited audit file and register it in `d` under its
# bare file name.
d.update({os.path.basename(f): pd.read_csv(f, delimiter=';') for f in files})

# Stack all per-file frames into one frame with a fresh 0..n-1 index.
audit_df = pd.concat(d.values(), ignore_index=True)

In [4]:
# The per-file frames have been combined into audit_df; drop the dict that held them.
del d

In [5]:
# (rows, columns) of the combined audit frame.
audit_df.shape

(264320, 5)

On the last row, there appears to be some leading whitespace in the LogEvent column. Let's handle that:

In [6]:
# Trim leading/trailing whitespace from every text column of audit_df.
for col in audit_df.columns:
    try:
        audit_df[col] = audit_df[col].str.strip()
    except AttributeError:
        # The .str accessor raises AttributeError on non-text columns; those
        # have nothing to strip. Any other error is a real problem and is
        # allowed to surface instead of being silently swallowed.
        continue

Since we combined a bunch of files, this DataFrame needs a single continuous index; passing `ignore_index=True` to `pd.concat` above takes care of that.

In [7]:
# Captures the single-quoted payload of the msg='...' field (quotes included).
# NOTE(review): the extraction cell below repeats this pattern as a string
# literal rather than using `pat` — confirm whether this compiled copy is needed.
pat = re.compile(r"msg=('.*')")

In [8]:
# Pull the quoted msg payload into its own one-column frame (column label 0).
msg_df = audit_df['LogEvent'].str.extract(pat=r"msg=('.*')")
# The same events with the msg=... field removed, so the remainder can be
# split on whitespace safely.
no_msg = audit_df['LogEvent'].str.replace(pat=r"msg=('.*')", repl='', regex=True)

We are having to do some funky business to retain the `msg` field. If we were to split the output below on spaces or `=`, it would mangle the message and split it up in a less than ideal way for maintaining data integrity.

In [9]:
# Sixth row, only column: an example of a retained msg payload.
msg_df.iloc[5, 0]

'\'op=PAM:session_close acct="root" exe="/usr/bin/sudo" hostname=? addr=? terminal=/dev/pts/5 res=success\''

Here is what the `no_msg` series looks like now. We can proceed with converting this into a DataFrame.

In [10]:
# Row 111 with the msg field removed, followed by the untouched original event.
print(no_msg.iloc[111], audit_df.LogEvent.iloc[111], sep='\n')

type=USER_START ts=1621862701.432 tsid=600 pid=15765 uid=0 auid=0 ses=3575 
type=USER_START ts=1621862701.432 tsid=600 pid=15765 uid=0 auid=0 ses=3575 msg='op=PAM:session_open acct="root" exe="/usr/sbin/cron" hostname=? addr=? terminal=cron res=success'


In [11]:
# Whitespace-split each msg-free event into its list of key=value tokens.
logs = no_msg.str.split()

In [12]:
# Token list of the final event.
logs.iloc[-1]

['type=USER_START',
 'ts=1625992741.358',
 'tsid=78499',
 'pid=789481',
 'uid=0',
 'auid=1000',
 'ses=5754',
 'subj==unconfined',
 'UID="root"',
 'AUID="ubuntu"']

In [26]:
def _parse_key_values(tokens):
    """Turn a list of ``key=value`` tokens into a ``{key: value}`` dict.

    Each token is split on its FIRST ``=`` only, so a value that itself
    contains ``=`` is kept whole (``'subj==unconfined'`` yields
    ``{'subj': '=unconfined'}`` instead of an empty string). Tokens with no
    ``=`` at all carry no key/value pair and are skipped rather than raising
    IndexError.
    """
    parsed = {}
    for token in tokens:
        key, sep, value = token.partition('=')
        if not sep:
            continue
        parsed[key] = value
    return parsed


expand_logs = logs.apply(_parse_key_values).to_dict()
# logs are expanded to a dictionary of dictionaries; preview the first two
# entries without materialising every item of the full dict.
{idx: expand_logs[idx] for idx in logs.index[:2].tolist()}

{0: {'type': 'DAEMON_START',
  'ts': '1621837767.969',
  'tsid': '9329',
  'op': 'start',
  'ver': '2.8.2',
  'format': 'raw',
  'kernel': '5.3.0-40-generic',
  'auid': '4294967295',
  'pid': '13687',
  'uid': '0',
  'ses': '4294967295',
  'subj': 'unconfined',
  'res': 'success'},
 1: {'type': 'CONFIG_CHANGE',
  'ts': '1621837767.983',
  'tsid': '489',
  'op': 'set',
  'audit_backlog_limit': '8192',
  'old': '64',
  'auid': '4294967295',
  'ses': '4294967295',
  'res': '1'}}

In [None]:
# The outer keys of expand_logs become rows after the transpose; the inner
# keys (audit field names) become columns.
log_df = pd.DataFrame(expand_logs).transpose()
print(log_df.head(n=6))

# The nested dict is no longer needed once the frame exists.
del expand_logs

In [28]:
# msg was removed from the events before they were split, so it is not among
# the parsed columns listed here.
log_df.columns  # no msg column

Index(['type', 'ts', 'tsid', 'op', 'ver', 'format', 'kernel', 'auid', 'pid',
       'uid', 'ses', 'subj', 'res', 'audit_backlog_limit', 'old',
       'audit_failure', 'audit_backlog_wait_time', 'old-auid', 'tty',
       'old-ses', 'apparmor', 'operation', 'profile', 'name', 'comm',
       'requested_mask', 'denied_mask', 'fsuid', 'ouid', 'gid', 'exe', 'sig',
       'dev', 'prom', 'old_prom', 'AUID', 'UID', 'OLD-AUID', 'ID', 'GID',
       'info'],
      dtype='object')

In [None]:
# `ts` is text holding epoch seconds with a fractional part (e.g.
# '1621837767.969'); removing the dot yields epoch milliseconds.
# NOTE(review): this assumes every ts has exactly three decimals — confirm.
# regex=False makes the '.' a literal on every pandas version instead of
# depending on the changing default, and the explicit numeric conversion
# avoids handing strings to to_datetime together with `unit`.
log_df['tts'] = pd.to_datetime(
    pd.to_numeric(log_df['ts'].str.replace('.', '', regex=False)),
    unit='ms',
)
log_df.tts.head()

  log_df['tts'] = pd.to_datetime(log_df['ts'].str.replace('.', ''), unit='ms')


In [17]:
# Assemble the labeled audit frame side by side (column-wise):
#   msg_df              -> the retained msg field (column label 0)
#   log_df              -> every other field of the event, parsed
#   audit_df minus col0 -> the label columns; the first column is the raw
#                          event that was just expanded
labeled_audit_df = pd.concat(
    [msg_df, log_df, audit_df.iloc[:, 1:]],
    axis=1,
).rename(columns={0: 'msg'})

In [18]:
# Reorder the columns to put the msg field in position 11.
# The new order is built from "every column except msg" rather than from
# "everything after the first column", so re-running this cell yields the same
# frame instead of dropping another column and inserting a duplicate msg.
reordered_columns = [c for c in labeled_audit_df.columns if c != 'msg']
reordered_columns.insert(11, 'msg')
labeled_audit_df = labeled_audit_df[reordered_columns]

In [19]:
# Preview the first rows of the labeled, parsed audit frame.
labeled_audit_df.head()

Unnamed: 0,type,ts,tsid,op,ver,format,kernel,auid,pid,uid,...,AUID,UID,OLD-AUID,ID,GID,info,Activity,Stage,DefenderResponse,Signature
0,DAEMON_START,1621837767.969,9329,start,2.8.2,raw,5.3.0-40-generic,4294967295,13687.0,0.0,...,,,,,,,Normal,Benign,Benign,
1,CONFIG_CHANGE,1621837767.983,489,set,,,,4294967295,,,...,,,,,,,Normal,Benign,Benign,
2,CONFIG_CHANGE,1621837767.983,490,set,,,,4294967295,,,...,,,,,,,Normal,Benign,Benign,
3,CONFIG_CHANGE,1621837767.983,491,set,,,,4294967295,,,...,,,,,,,Normal,Benign,Benign,
4,SERVICE_START,1621837767.987,492,,,,,4294967295,1.0,0.0,...,,,,,,,Normal,Benign,Benign,


In [None]:
# Writes the labeled frame (index included) to 'whereami.csv' in the current
# working directory.
# NOTE(review): the file name suggests a scratch/debug export — confirm
# whether this cell should remain in the notebook.
labeled_audit_df.to_csv('whereami.csv')

## `auth`

In [20]:
# Directory of auth log exports, resolved relative to the parent of the
# current working directory.
path = os.path.split(os.getcwd())[0] + '/data/unraveled-apt/host-logs/auth/'
# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the concatenated frame (and positional lookups into it) stable across runs.
files = [os.path.join(path, name) for name in sorted(os.listdir(path))]
# Filled by the loading cell below: file name -> DataFrame.
d = {}

In [24]:
# Parse each pipe-delimited auth file and register it in `d` under its bare
# file name.
d.update({os.path.basename(f): pd.read_csv(f, delimiter='|') for f in files})

# Stack all per-file frames into one frame with a fresh 0..n-1 index.
auth_df = pd.concat(d.values(), ignore_index=True)

In [50]:
# Show the value distribution of every column after the first, with a
# 20-dash rule after each one.
for col in auth_df.columns[1:]:
    # The separator is built by plain concatenation: nesting the same quote
    # character inside an f-string only parses on Python 3.12 and later.
    print(auth_df[col].value_counts(), end='\n' + '-' * 20 + '\n')

Activity
Normal                       89135
Network Service Discovery       38
Maintain Access                 36
Name: count, dtype: int64
--------------------
Stage
Benign              89135
Lateral Movement       74
Name: count, dtype: int64
--------------------
DefenderResponse
Benign    89209
Name: count, dtype: int64
--------------------
Signature
APT    74
Name: count, dtype: int64
--------------------


In [56]:
# Hand-picked row positions, to eyeball the raw auth event format.
auth_df.LogEvent.iloc[[5, 10, 15, 20, 25, 100, 200, 300, 1000, 2000]].values

array(['Jun 13 00:15:01 kali CRON[328966]: pam_unix(cron:session): session closed for user root',
       'Jun 13 00:35:01 kali CRON[329034]: pam_unix(cron:session): session opened for user root by (uid=0)',
       'Jun 13 00:45:01 kali CRON[329086]: pam_unix(cron:session): session closed for user root',
       'Jun 13 01:09:01 kali CRON[329103]: pam_unix(cron:session): session opened for user root by (uid=0)',
       'Jun 13 01:17:01 kali CRON[329153]: pam_unix(cron:session): session closed for user root',
       'Jun 13 05:17:01 kali CRON[329689]: pam_unix(cron:session): session opened for user root by (uid=0)',
       'Jun 13 10:39:01 kali CRON[330494]: pam_unix(cron:session): session opened for user root by (uid=0)',
       'Jun 13 16:15:01 kali CRON[331321]: pam_unix(cron:session): session opened for user root by (uid=0)',
       "Jun 14 11:30:10 kali sshd[336289]: lastlog_openseek: Couldn't stat /var/log/lastlog: No such file or directory",
       'Jun 15 23:09:01 kali CRON[390994

In [66]:
# Split every auth event into at most 7 space-separated fields:
# month, day, time, hostname, app, provider, and the remaining message.
# Syslog pads single-digit days with an extra space ("Jun  1 ..."); on a
# single-space split that would yield an empty field and shift every later
# one, so the padded day is collapsed to one space first. Lines without a
# padded day are split exactly as before.
logs = (
    auth_df.LogEvent
    .str.replace(r'^(\w{3})  (\d) ', r'\1 \2 ', regex=True)
    .str.split(' ', n=6)
)
logs.head().values

array([list(['Jun', '13', '00:05:01', 'kali', 'CRON[328914]:', 'pam_unix(cron:session):', 'session opened for user root by (uid=0)']),
       list(['Jun', '13', '00:05:01', 'kali', 'CRON[328914]:', 'pam_unix(cron:session):', 'session closed for user root']),
       list(['Jun', '13', '00:09:01', 'kali', 'CRON[328918]:', 'pam_unix(cron:session):', 'session opened for user root by (uid=0)']),
       list(['Jun', '13', '00:09:01', 'kali', 'CRON[328918]:', 'pam_unix(cron:session):', 'session closed for user root']),
       list(['Jun', '13', '00:15:01', 'kali', 'CRON[328966]:', 'pam_unix(cron:session):', 'session opened for user root by (uid=0)'])],
      dtype=object)

In [75]:
# One column per field produced by the 7-way split of each auth event.
df = pd.DataFrame(
    logs.tolist(),
    columns=['month', 'day', 'time', 'hostname', 'app', 'provider', 'msg'],
)
df['provider']

0        pam_unix(cron:session):
1        pam_unix(cron:session):
2        pam_unix(cron:session):
3        pam_unix(cron:session):
4        pam_unix(cron:session):
                  ...           
89204    pam_unix(sudo:session):
89205                           
89206                           
89207                           
89208    pam_unix(sudo:session):
Name: provider, Length: 89209, dtype: object

## Combined

In [47]:
# Stack the audit and auth frames row-wise under a fresh 0..n-1 index.
combined_linux_host_df = pd.concat([audit_df, auth_df], ignore_index=True)

In [57]:
# Column dtypes and non-null counts of the combined frame.
combined_linux_host_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 353529 entries, 0 to 353528
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   LogEvent          353529 non-null  object
 1   Activity          353529 non-null  object
 2   Stage             353529 non-null  object
 3   DefenderResponse  353529 non-null  object
 4   Signature         74 non-null      object
dtypes: object(5)
memory usage: 13.5+ MB


-----

# Windows Host Logs

## Security.evtx