Hypothesis:

APTs adjust TTPs in response to defensive measures and signs of detection.

In [115]:
import pandas as pd
import numpy as np
import os
import re
import chardet

def get_encoding(path, sample_size=4096, default='utf-8'):
    """Guess the text encoding of a file from its leading bytes.

    Parameters
    ----------
    path : str
        File to inspect.
    sample_size : int, optional
        Number of bytes read from the start of the file and handed to
        chardet (4096 by default).
    default : str, optional
        Encoding returned when chardet cannot decide, or when it reports
        plain ASCII.

    Returns
    -------
    str
        Name of the encoding to use when decoding the file.
    """
    with open(path, 'rb') as f:
        raw = f.read(sample_size)

    encoding = chardet.detect(raw)['encoding']

    # chardet yields None when it cannot decide (e.g. an empty sample).
    # An all-ASCII sample only describes the first `sample_size` bytes; the
    # rest of the file may still hold multi-byte characters, so prefer the
    # ASCII-compatible default instead of the strict 'ascii' codec.
    if encoding is None or encoding.lower() == 'ascii':
        return default

    return encoding

def get_files_recurse(path):
    """Collect the path of every file found anywhere beneath ``path``.

    The tree is traversed with ``os.walk``; each returned entry is the
    containing directory joined with the file name. Directories themselves
    are not included in the result.
    """
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(path)
        for name in names
    ]

def load_all_csv(path, sep=',', recurse=False, verbose=False):
    """Read every CSV under ``path`` and stack them into one DataFrame.

    Parameters
    ----------
    path : str
        Directory containing the CSV files. A trailing separator is optional.
    sep : str, optional
        Field delimiter passed to ``pd.read_csv``.
    recurse : bool, optional
        When True, files in all sub-directories are loaded as well;
        otherwise only the files directly inside ``path`` are read.
    verbose : bool, optional
        When True, print each file path before it is read.

    Returns
    -------
    pandas.DataFrame
        All rows from all files, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when no files were found.
    """
    if recurse:
        files = get_files_recurse(path)
    else:
        # os.path.join works with or without a trailing separator on `path`;
        # sub-directories are skipped because they cannot be read as CSV.
        candidates = (os.path.join(path, name) for name in os.listdir(path))
        files = [full for full in candidates if os.path.isfile(full)]

    # Keep the frames in a list rather than a dict keyed by file name: in
    # recursive mode several directories can contain identically named files,
    # and keying by basename would silently keep only the last one.
    frames = []
    for f in files:
        if verbose:
            print(f)

        enc = get_encoding(f)
        frames.append(pd.read_csv(f, delimiter=sep, encoding=enc))

    return pd.concat(frames, ignore_index=True)

# Network Logs

## Netflow

In [None]:
# Directory containing the labeled network-flow CSVs, given relative to the
# notebook's working directory. The load cell that follows repeats this
# literal instead of reading `path`.
path = '../data/unraveled-apt/network-flows/'


    

In [120]:
# Load every CSV under the network-flows tree (recursing into the per-day
# sub-directories) and print each file path as it is read.
# NOTE: per the recorded output, this run stopped with a pandas ParserError on
# the second file (line 6990 has 90 fields where 89 were expected), so `df`
# was not assigned by this cell.
df = load_all_csv('../data/unraveled-apt/network-flows/', recurse=True, verbose=True)

../data/unraveled-apt/network-flows/Week1_Day1-2_05262021-05272021/net1012x_Flow_labeled.csv
../data/unraveled-apt/network-flows/Week1_Day1-2_05262021-05272021/net1013x_Flow_labeled.csv


ParserError: Error tokenizing data. C error: Expected 89 fields in line 6990, saw 90


# Linux Host Logs

## `audit`

In [None]:
# The audit logs live under the parent of the current working directory;
# they are ';'-delimited and only the top level of the folder is read.
path = os.path.dirname(os.getcwd()) + '/data/unraveled-apt/host-logs/audit/'
audit_df = load_all_csv(path, sep=';')

In [72]:
# Size check of the combined audit frame as (rows, columns).
audit_df.shape

(264320, 5)

On the last row, there appears to be some preceding whitespace in the LogEvent column. Let's handle that:

In [73]:
# Trim leading/trailing whitespace from every text column.
for col in audit_df.columns:
    try:
        audit_df[col] = audit_df[col].str.strip()
    except AttributeError:
        # `.str` raises AttributeError on columns that do not hold strings;
        # those are left as they are. Any other error is allowed to surface
        # instead of being silently swallowed.
        continue

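Because we combined several files into one DataFrame, the rows need a single continuous index; `load_all_csv` already takes care of this by concatenating with `ignore_index=True`.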

In [74]:
# Matches `msg=` followed by a single-quoted payload and captures the payload
# including its quotes. `.*` is greedy, so the capture runs to the last single
# quote on the line. The next cell repeats this pattern as a string literal
# rather than using `pat`.
pat = re.compile(r"msg=('.*')")

In [75]:
# Pull the quoted `msg='...'` payload into its own one-column frame
# (the column is labeled 0), and keep a copy of each log line with that
# payload removed so the remainder can be tokenized safely.
msg_df = audit_df['LogEvent'].str.extract(r"msg=('.*')")
no_msg = audit_df['LogEvent'].str.replace(r"msg=('.*')", '', regex=True)

We have to do some extra work to retain the `msg` field: it is extracted into its own frame and removed from the log line before parsing. If we instead split the raw line on spaces or `=`, the message (which itself contains spaces and `=`) would be broken apart, which would compromise data integrity.

In [76]:
# Inspect one extracted message: row 5 of the single captured column.
msg_df.iloc[5, 0]

'\'op=PAM:session_close acct="root" exe="/usr/bin/sudo" hostname=? addr=? terminal=/dev/pts/5 res=success\''

Here is what the `no_msg` series looks like now. We can proceed with converting this into a DataFrame.

In [77]:
# Show the same record without and with its `msg` payload, one per line.
print(no_msg.iloc[111], audit_df.LogEvent.iloc[111], sep='\n')

type=USER_START ts=1621862701.432 tsid=600 pid=15765 uid=0 auid=0 ses=3575 
type=USER_START ts=1621862701.432 tsid=600 pid=15765 uid=0 auid=0 ses=3575 msg='op=PAM:session_open acct="root" exe="/usr/sbin/cron" hostname=? addr=? terminal=cron res=success'


In [78]:
# Split each msg-free log line on whitespace, giving one list of
# "key=value" tokens per row.
logs = no_msg.str.split()

In [79]:
# Token list of the final record.
logs.iloc[-1]

['type=USER_START',
 'ts=1625992741.358',
 'tsid=78499',
 'pid=789481',
 'uid=0',
 'auid=1000',
 'ses=5754',
 'subj==unconfined',
 'UID="root"',
 'AUID="ubuntu"']

In [80]:
# Turn each row's "key=value" tokens into a {key: value} dict, then collect
# the rows into a dict keyed by row index.
# Each token is split on the FIRST '=' only: a value may itself contain '='
# (e.g. the token 'subj==unconfined'), and an unbounded split would keep just
# the empty piece between the two '=' signs and drop the rest of the value.
expand_logs = logs.apply(
    lambda tokens: dict(token.split('=', 1) for token in tokens)
).to_dict()
dict(list(expand_logs.items())[:2]) # logs are expanded to a dictionary of dictionaries

{0: {'type': 'DAEMON_START',
  'ts': '1621837767.969',
  'tsid': '9329',
  'op': 'start',
  'ver': '2.8.2',
  'format': 'raw',
  'kernel': '5.3.0-40-generic',
  'auid': '4294967295',
  'pid': '13687',
  'uid': '0',
  'ses': '4294967295',
  'subj': 'unconfined',
  'res': 'success'},
 1: {'type': 'CONFIG_CHANGE',
  'ts': '1621837767.983',
  'tsid': '489',
  'op': 'set',
  'audit_backlog_limit': '8192',
  'old': '64',
  'auid': '4294967295',
  'ses': '4294967295',
  'res': '1'}}

In [81]:
# Building a frame from the dict-of-dicts puts records in columns, so
# transpose to get one row per record and one column per key.
log_df = pd.DataFrame(expand_logs).transpose()
print(log_df.head(n=6))

# The intermediate dict is no longer needed once the frame exists.
del expand_logs

            type              ts  tsid     op    ver format            kernel  \
0   DAEMON_START  1621837767.969  9329  start  2.8.2    raw  5.3.0-40-generic   
1  CONFIG_CHANGE  1621837767.983   489    set    NaN    NaN               NaN   
2  CONFIG_CHANGE  1621837767.983   490    set    NaN    NaN               NaN   
3  CONFIG_CHANGE  1621837767.983   491    set    NaN    NaN               NaN   
4  SERVICE_START  1621837767.987   492    NaN    NaN    NaN               NaN   
5       USER_END  1621837780.539   493    NaN    NaN    NaN               NaN   

         auid    pid  uid  ...  sig  dev prom old_prom AUID  UID OLD-AUID  \
0  4294967295  13687    0  ...  NaN  NaN  NaN      NaN  NaN  NaN      NaN   
1  4294967295    NaN  NaN  ...  NaN  NaN  NaN      NaN  NaN  NaN      NaN   
2  4294967295    NaN  NaN  ...  NaN  NaN  NaN      NaN  NaN  NaN      NaN   
3  4294967295    NaN  NaN  ...  NaN  NaN  NaN      NaN  NaN  NaN      NaN   
4  4294967295      1    0  ...  NaN  NaN  NaN  

In [82]:
# List the parsed keys. `msg` is absent because its payload was removed from
# the log line before tokenizing.
log_df.columns  # no msg column

Index(['type', 'ts', 'tsid', 'op', 'ver', 'format', 'kernel', 'auid', 'pid',
       'uid', 'ses', 'subj', 'res', 'audit_backlog_limit', 'old',
       'audit_failure', 'audit_backlog_wait_time', 'old-auid', 'tty',
       'old-ses', 'apparmor', 'operation', 'profile', 'name', 'comm',
       'requested_mask', 'denied_mask', 'fsuid', 'ouid', 'gid', 'exe', 'sig',
       'dev', 'prom', 'old_prom', 'AUID', 'UID', 'OLD-AUID', 'ID', 'GID',
       'info'],
      dtype='object')

In [83]:
# `ts` holds epoch seconds as text with a fractional part (e.g.
# '1621837767.969'). Removing the dot yields epoch milliseconds.
# NOTE(review): this relies on every value having exactly three fractional
# digits - confirm against the data.
# regex=False makes '.' a literal dot, and the text is converted to numbers
# before calling to_datetime so `unit='ms'` is applied to numeric input.
log_df['ts'] = pd.to_datetime(
    pd.to_numeric(log_df['ts'].str.replace('.', '', regex=False)),
    unit='ms',
)
log_df['ts'].head()

  log_df['ts'] = pd.to_datetime(log_df['ts'].str.replace('.', ''), unit='ms')


0   2021-05-24 06:29:27.969
1   2021-05-24 06:29:27.983
2   2021-05-24 06:29:27.983
3   2021-05-24 06:29:27.983
4   2021-05-24 06:29:27.987
Name: ts, dtype: datetime64[ns]

In [84]:
# Create DataFrame of labeled audit log data
# Assemble the labeled audit frame side by side (column-wise):
#   1. msg_df  - the retained msg payload (its only column is labeled 0)
#   2. log_df  - the remaining key/value fields, parsed
#   3. every audit_df column except the first, which is the raw log line
#      that was just expanded
# The payload column is then given the name 'msg'.
labeled_audit_df = pd.concat(
    [msg_df, log_df, audit_df.iloc[:, 1:]],
    axis=1,
).rename(columns={0: 'msg'})

In [85]:
# reordering the columns to put the msg field in position 11
# Reorder the columns so that `msg` sits at position 11.
# The order is built from "all columns except msg" rather than "all columns
# except the first", so running this cell a second time gives the same layout
# instead of dropping a real column and duplicating `msg`.
ordered_cols = [c for c in labeled_audit_df.columns if c != 'msg']
ordered_cols.insert(11, 'msg')
labeled_audit_df = labeled_audit_df[ordered_cols]

In [86]:
# Persist the cleaned, labeled audit data without the row index.
# NOTE(review): presumably '../data/cleaned/' must already exist - confirm.
labeled_audit_df.to_csv('../data/cleaned/audit.csv', index=False)

## `auth`

In [2]:
# The auth logs live under the parent of the current working directory;
# they are '|'-delimited and only the top level of the folder is read.
path = os.path.dirname(os.getcwd()) + '/data/unraveled-apt/host-logs/auth/'
auth_df = load_all_csv(path, sep='|')

In [3]:
# Print the value distribution of every column after the first (the raw log
# line), with a 20-dash rule after each one.
# The separator is built by concatenation: reusing the same quote character
# inside an f-string expression is a SyntaxError before Python 3.12.
for col in auth_df.columns[1:]:
    print(auth_df[col].value_counts(), end='\n' + '-' * 20 + '\n')

Activity
Normal                       89135
Network Service Discovery       38
Maintain Access                 36
Name: count, dtype: int64
--------------------
Stage
Benign              89135
Lateral Movement       74
Name: count, dtype: int64
--------------------
DefenderResponse
Benign    89209
Name: count, dtype: int64
--------------------
Signature
APT    74
Name: count, dtype: int64
--------------------


In [5]:
# Eyeball a hand-picked spread of raw auth log lines to see their layout.
auth_df.LogEvent.iloc[[5, 10, 15, 20, 25, 100, 200, 300, 1000, 2000]].values

array(['Jun 13 00:15:01 kali CRON[328966]: pam_unix(cron:session): session closed for user root',
       'Jun 13 00:35:01 kali CRON[329034]: pam_unix(cron:session): session opened for user root by (uid=0)',
       'Jun 13 00:45:01 kali CRON[329086]: pam_unix(cron:session): session closed for user root',
       'Jun 13 01:09:01 kali CRON[329103]: pam_unix(cron:session): session opened for user root by (uid=0)',
       'Jun 13 01:17:01 kali CRON[329153]: pam_unix(cron:session): session closed for user root',
       'Jun 13 05:17:01 kali CRON[329689]: pam_unix(cron:session): session opened for user root by (uid=0)',
       'Jun 13 10:39:01 kali CRON[330494]: pam_unix(cron:session): session opened for user root by (uid=0)',
       'Jun 13 16:15:01 kali CRON[331321]: pam_unix(cron:session): session opened for user root by (uid=0)',
       "Jun 14 11:30:10 kali sshd[336289]: lastlog_openseek: Couldn't stat /var/log/lastlog: No such file or directory",
       'Jun 15 23:09:01 kali CRON[390994

In [29]:
# Split each line into month, day, time, hostname, app and the remaining
# message (at most 5 splits, so the message stays in one piece).
# Splitting on runs of whitespace rather than on a single ' ' keeps the fields
# aligned when a line has consecutive spaces, e.g. a space-padded
# single-digit day such as 'Jul  1'.
logs = auth_df.LogEvent.str.split(n=5)
logs.head().values

array([list(['Jun', '13', '00:05:01', 'kali', 'CRON[328914]:', 'pam_unix(cron:session): session opened for user root by (uid=0)']),
       list(['Jun', '13', '00:05:01', 'kali', 'CRON[328914]:', 'pam_unix(cron:session): session closed for user root']),
       list(['Jun', '13', '00:09:01', 'kali', 'CRON[328918]:', 'pam_unix(cron:session): session opened for user root by (uid=0)']),
       list(['Jun', '13', '00:09:01', 'kali', 'CRON[328918]:', 'pam_unix(cron:session): session closed for user root']),
       list(['Jun', '13', '00:15:01', 'kali', 'CRON[328966]:', 'pam_unix(cron:session): session opened for user root by (uid=0)'])],
      dtype=object)

In [30]:
# One column per positional field of the split auth log line.
df = pd.DataFrame(
    logs.tolist(),
    columns=['month', 'day', 'time', 'hostname', 'app', 'msg'],
)


In [31]:
# The log lines carry month, day and time but no year, so a fixed "2021-"
# prefix is added before the combined text is parsed into a timestamp.
df['ts'] = pd.to_datetime("2021-" + df['month'] + "-" + df['day'] + " " + df['time'])

In [32]:
# Drop redundant date cols and make ts col 0
df.drop(['month', 'day', 'time'], axis=1, inplace=True, errors='ignore')
df = df[df.columns[:-1].insert(0, 'ts')]

In [33]:
# Preview the frame after the timestamp was built and moved to the front.
df.head()

Unnamed: 0,ts,hostname,app,msg
0,2021-06-13 00:05:01,kali,CRON[328914]:,pam_unix(cron:session): session opened for use...
1,2021-06-13 00:05:01,kali,CRON[328914]:,pam_unix(cron:session): session closed for use...
2,2021-06-13 00:09:01,kali,CRON[328918]:,pam_unix(cron:session): session opened for use...
3,2021-06-13 00:09:01,kali,CRON[328918]:,pam_unix(cron:session): session closed for use...
4,2021-06-13 00:15:01,kali,CRON[328966]:,pam_unix(cron:session): session opened for use...


In [34]:
# `app` looks like 'CRON[328914]:'; splitting on '[' separates the program
# name from the bracketed process id. Entries without '[' stay a
# single-element list.
tmp = df['app'].str.split('[')

In [35]:
# Remove the trailing ']' / ':' characters from each piece, then pad entries
# that have no pid part with 0 so every row ends up as [app, pid].
tmp = tmp.map(lambda parts: [piece.strip(']:') for piece in parts])
tmp = tmp.map(lambda parts: parts if len(parts) != 1 else parts + [0])

In [36]:
# Check the [app, pid] pairs; rows without a pid show the 0 placeholder.
tmp

0        [CRON, 328914]
1        [CRON, 328914]
2        [CRON, 328918]
3        [CRON, 328918]
4        [CRON, 328966]
              ...      
89204         [sudo, 0]
89205         [sudo, 0]
89206         [sudo, 0]
89207         [sudo, 0]
89208         [sudo, 0]
Name: app, Length: 89209, dtype: object

In [37]:
# Expand the two-element lists into a frame with `app` and `pid` columns.
# Note that `tmp` is rebound here from a Series of lists to a DataFrame.
tmp = pd.DataFrame(tmp.tolist(), columns=['app','pid'])

In [38]:
# Preview the separated program name and process id.
tmp.head()

Unnamed: 0,app,pid
0,CRON,328914
1,CRON,328914
2,CRON,328918
3,CRON,328918
4,CRON,328966


In [39]:
# Replace the combined `app` field with the bare program name and add the
# process id as a new `pid` column.
df = df.assign(app=tmp['app'], pid=tmp['pid'])


In [62]:
# Fix the final column order (pid directly after app) and preview the result.
df = df[['ts','hostname','app','pid','msg']]
df.head()

Unnamed: 0,ts,hostname,app,pid,msg
0,2021-06-13 00:05:01,kali,CRON,328914,pam_unix(cron:session): session opened for use...
1,2021-06-13 00:05:01,kali,CRON,328914,pam_unix(cron:session): session closed for use...
2,2021-06-13 00:09:01,kali,CRON,328918,pam_unix(cron:session): session opened for use...
3,2021-06-13 00:09:01,kali,CRON,328918,pam_unix(cron:session): session closed for use...
4,2021-06-13 00:15:01,kali,CRON,328966,pam_unix(cron:session): session opened for use...


In [18]:
# The helper frame has been merged into `df`; drop the name.
# Re-running this cell on its own raises NameError once `tmp` is gone.
del tmp

In [60]:
# Trim surrounding whitespace from the message text. The vectorised string
# accessor replaces the per-row lambda (and passes missing values through
# instead of raising); item assignment is used rather than attribute
# assignment so the target column is explicit.
df['msg'] = df['msg'].str.strip()

In [61]:
# Summarise dtypes and null counts.
# NOTE(review): the recorded output already shows `category` and `int64`
# dtypes, which are only set by the cell that appears after this one - the
# cells appear to have been executed out of order.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89209 entries, 0 to 89208
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   ts        89209 non-null  datetime64[ns]
 1   hostname  89209 non-null  category      
 2   app       89209 non-null  category      
 3   pid       89209 non-null  int64         
 4   msg       89209 non-null  object        
dtypes: category(2), datetime64[ns](1), int64(1), object(1)
memory usage: 2.2+ MB


In [41]:
# Set the final dtypes in one step: the two low-cardinality text columns
# become categoricals and the process id becomes an integer.
df = df.astype({'hostname': 'category', 'app': 'category', 'pid': 'int'})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89209 entries, 0 to 89208
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   ts        89209 non-null  datetime64[ns]
 1   hostname  89209 non-null  category      
 2   app       89209 non-null  category      
 3   pid       89209 non-null  int64         
 4   msg       89209 non-null  object        
dtypes: category(2), datetime64[ns](1), int64(1), object(1)
memory usage: 2.2+ MB


In [65]:
# Persist the cleaned auth data without the row index.
# NOTE(review): `df` holds only ts/hostname/app/pid/msg - the label columns
# from `auth_df` (Activity, Stage, DefenderResponse, Signature) are not
# written here, unlike the audit export. Confirm that this is intended.
df.to_csv('../data/cleaned/auth.csv', index=False)

## Combined

In [47]:
# Stack the raw (unparsed) audit and auth frames row-wise under a fresh index.
combined_linux_host_df = pd.concat([audit_df, auth_df], ignore_index=True)

In [57]:
# Summarise the combined frame: row count, columns, dtypes and null counts.
combined_linux_host_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 353529 entries, 0 to 353528
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   LogEvent          353529 non-null  object
 1   Activity          353529 non-null  object
 2   Stage             353529 non-null  object
 3   DefenderResponse  353529 non-null  object
 4   Signature         74 non-null      object
dtypes: object(5)
memory usage: 13.5+ MB


-----

# Windows Host Logs

## Security.evtx