# Exploratory Analysis

## Setup

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

%matplotlib inline

# Project root = parent of the current working directory.
# NOTE(review): assumes the notebook is launched from a direct subfolder of the project — confirm.
proj_dir = os.path.dirname(os.getcwd())

In [None]:
# Show the resolved project root so a wrong working directory is caught before any data is loaded.
proj_dir

In [None]:
# Load the four cleaned log datasets from <project root>/data/cleaned.
# NOTE: unpickling can execute arbitrary code — only load pickles produced by this project.
cleaned_dir = f'{proj_dir}/data/cleaned'

audit_df = pd.read_pickle(f'{cleaned_dir}/audit.pkl')
auth_df = pd.read_pickle(f'{cleaned_dir}/auth.pkl')
net_df = pd.read_pickle(f'{cleaned_dir}/netflow.pkl')
win_df = pd.read_pickle(f'{cleaned_dir}/win-security.pkl')

## Host Logs

### `audit` Logs

With these audit logs, I want to review the following:
- Distribution of unique values for the categorical features.

I also have the following concerns:
- Lots of columns are mostly null values.
- Lots of rows have a null value.
- Without some sort of feature reduction, a classifier will take ages to get through it.

After we play around, I'll talk about what's next for this DataFrame.

In [None]:
# Preview the first rows to see which columns exist and what their values look like.
audit_df.head()

In [None]:
# Column dtypes and non-null counts; memory_usage='deep' measures object columns
# by their actual contents instead of a shallow estimate.
audit_df.info(memory_usage='deep')

In [None]:
# Distinct-value count per audit column, lowest cardinality first
# (ascending order is the sort_values default).
nunique_sorted = audit_df.nunique().sort_values()
nunique_sorted

In [None]:
# Keep every column label except the last five in nunique_sorted.
cols = nunique_sorted.index[:-5]
len(cols)

In [None]:
# Bar chart of value counts for every column in `cols`, one panel per column.
#
# The grid is sized from len(cols) instead of being hard-coded to 2x5: `cols`
# is derived from the data, so a fixed grid raises IndexError when there are
# more than ten columns and leaves empty frames when there are fewer.
n_grid_cols = 5
n_grid_rows = max(1, int(np.ceil(len(cols) / n_grid_cols)))

# squeeze=False keeps `axes` two-dimensional even for a single row, so
# flatten() always works. Height of 4 per row reproduces the original
# (20, 8) figure when there are two rows.
fig, axes = plt.subplots(
    n_grid_rows, n_grid_cols, figsize=(20, 4 * n_grid_rows), squeeze=False
)
axes = axes.flatten()

for ax, col in zip(axes, cols):
    audit_df[col].value_counts().plot(kind='bar', ax=ax)
    ax.set_title(col, fontweight='bold')

# Hide panels in the last row that have no column to show.
for ax in axes[len(cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

Alright, so the plan is:
- Select the best performing classification model.
- Fit a classification model built on a majority of the features, then apply feature reduction techniques.
- Pit the old model against the new to validate performance.

### `auth` Logs

I'm interested in the following:
- What are some common features among benign logs?
  - Successes and failures
- What about the logs of malicious activity?


In [None]:
# Preview the first rows of the auth logs.
auth_df.head()

In [None]:
# Dtypes, non-null counts, and the deep (content-based) memory footprint of the auth logs.
auth_df.info(memory_usage='deep')

### Windows `Security.evtx` Logs

Here are some things I am curious about:
- Frequency of given event IDs over the duration of the capture

In [None]:
# Preview the first rows of the Windows Security event logs.
win_df.head()

In [None]:
# Dtypes, non-null counts, and the deep (content-based) memory footprint before any dtype conversion.
win_df.info(memory_usage='deep')

In [None]:
# Cast every column with fewer than 100 distinct values to the pandas
# 'category' dtype. This modifies win_df in place.
is_low_cardinality = win_df.nunique() < 100
categories = win_df.columns[is_low_cardinality]
win_df[categories] = win_df[categories].astype('category')

In [None]:
# Distinct-value count per Windows-log column, lowest cardinality first
# (ascending order is the sort_values default).
nunique_sorted = win_df.nunique().sort_values()
nunique_sorted

In [None]:
# Hand-picked Windows-log columns to chart, one per subplot panel.
cols = [
    'DefenderResponse',
    'Type',
    'Activity',
    'Source',
    'Signature',
    'Stage',
    'TaskCategory',
    'EventID',
]
len(cols)

In [None]:
# Bar chart of value counts for each selected Windows-log column on a 2x4 grid.
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(20, 10))
axes = axes.flatten()

for idx, name in enumerate(cols):
    panel = axes[idx]
    counts = win_df[name].value_counts()
    counts.plot(kind='bar', ax=panel)
    panel.set_title(name, fontweight='bold')

plt.tight_layout()
plt.show()

## Network Logs

### Netflow

#### All Traffic

In [None]:
# Preview the first rows of the netflow records.
net_df.head()

In [None]:
# Dtypes, non-null counts, and the deep (content-based) memory footprint of the netflow records.
net_df.info(memory_usage='deep')

In [None]:
# (title, series) pairs: the five most frequent values of each listed netflow
# column across all traffic, in plotting order.
all_traffic_columns = (
    'src_ip',
    'src_mac',
    'src_port',
    'dst_port',
    'dst_ip',
    'dst_mac',
    'application_name',
    'application_category_name',
    'server_fingerprint',
    'user_agent',
    'Activity',
    'Stage',
    'DefenderResponse',
    'Signature',
)

plotme = [
    (name, net_df[name].value_counts().sort_values(ascending=False).head(5))
    for name in all_traffic_columns
]

In [None]:
# Number of panels to draw; the subplot grid must provide at least this many axes.
len(plotme)

In [None]:
# Horizontal bar chart per (title, series) pair in plotme, on a 7x2 grid.
fig, axes = plt.subplots(nrows=7, ncols=2, figsize=(20, 14))
axes = axes.flatten()

for idx, (label, top_values) in enumerate(plotme):
    panel = axes[idx]
    top_values.plot(kind='barh', ax=panel, color='grey')
    panel.set_title(label, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Distinct-value count per netflow column, lowest cardinality first
# (ascending order is the sort_values default).
nunique_sorted = net_df.nunique().sort_values()


In [None]:
# Netflow columns with fewer than 25 distinct values. The labels come straight
# from the filtered Series, which avoids building a full intermediate
# DataFrame only to read its `.columns`.
cols = nunique_sorted[nunique_sorted < 25].index
print(len(cols))

# Random sample of up to 10 rows restricted to those columns.
# - random_state makes the sample reproducible across Restart & Run All.
# - min(...) stops sample() raising ValueError on a frame with fewer than 10 rows.
df = net_df[cols].sample(n=min(10, len(net_df)), random_state=42)

#### Malicious Traffic

In [None]:
# Netflow rows whose Activity is anything other than "Normal".
bad_df = net_df.loc[net_df['Activity'] != "Normal"]

In [None]:
# (title, series) pairs: the five most frequent values of each listed column
# within bad_df, in plotting order.
malicious_columns = (
    'src_ip',
    'src_mac',
    'src_port',
    'dst_port',
    'dst_ip',
    'dst_mac',
    'application_name',
    'application_category_name',
    'server_fingerprint',
    'user_agent',
    'Activity',
    'Stage',
    'DefenderResponse',
    'Signature',
)

plotme = [
    (name, bad_df[name].value_counts().sort_values(ascending=False).head(5))
    for name in malicious_columns
]

In [None]:
# Horizontal bar chart per (title, series) pair in plotme, on a 7x2 grid, drawn in dark red.
fig, axes = plt.subplots(nrows=7, ncols=2, figsize=(20, 14))
axes = axes.flatten()

for idx, (label, top_values) in enumerate(plotme):
    panel = axes[idx]
    top_values.plot(kind='barh', ax=panel, color='darkred')
    panel.set_title(label, fontweight='bold')

plt.tight_layout()
plt.show()

#### Detected Traffic

In [None]:
# Netflow rows whose DefenderResponse is exactly "Detected".
found_df = net_df.loc[net_df['DefenderResponse'] == "Detected"]

In [None]:
# (title, series) pairs: the five most frequent values of each listed column
# within found_df, in plotting order. DefenderResponse is not charted here.
detected_columns = (
    'src_ip',
    'src_mac',
    'src_port',
    'dst_port',
    'dst_ip',
    'dst_mac',
    'application_name',
    'application_category_name',
    'server_fingerprint',
    'user_agent',
    'Activity',
    'Stage',
    'Signature',
)

plotme = [
    (name, found_df[name].value_counts().sort_values(ascending=False).head(5))
    for name in detected_columns
]

In [None]:
# Horizontal bar chart per (title, series) pair in plotme, on a 7x2 grid, drawn in dark green.
fig, axes = plt.subplots(7, 2, figsize=(20,14))
axes = axes.flatten()

for i, (title, data) in enumerate(plotme):
    data.plot(kind='barh', ax=axes[i], color='darkgreen')
    axes[i].set_title(title, fontweight='bold')

# The grid has 14 panels; hide any that plotme did not fill so the figure
# does not end with an empty, unlabeled frame.
for unused_ax in axes[len(plotme):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Drop the filtered netflow subsets now that their charts are drawn.
del found_df
del bad_df

## Reviewing Detected Activity

In [None]:
# For each log source, the rows whose DefenderResponse is "Detected".
detected_filter = 'DefenderResponse == "Detected"'
log_sources = (
    ("netflow", net_df),
    ("audit", audit_df),
    ("auth", auth_df),
    ("evtx", win_df),
)

found = {label: frame.query(detected_filter) for label, frame in log_sources}

It looks like only malicious network traffic was detected (see the per-source counts printed below). Awkward.

In [None]:
# Report how many detected rows each log source contributed.
print("Detected Actions:")

for source, rows in found.items():
    row_count = len(rows)
    print(f'  {source}: {row_count}')

## Reviewing Undetected Activity

In [None]:
# Print the DefenderResponse breakdown for each log source: netflow, auth, audit, evtx.
for frame in (net_df, auth_df, audit_df, win_df):
    print(frame.DefenderResponse.value_counts())

In [None]:
# Log sources compared below. The audit logs are deliberately left out: the
# same filter returns no rows for them.
malicious_sources = {
    "netflow": net_df,
    "auth": auth_df,
    "evtx": win_df,
}

# Rows that are not "Normal" activity and were not marked "Detected".
stealthy = {
    label: frame.query('DefenderResponse != "Detected" and Activity != "Normal"')
    for label, frame in malicious_sources.items()
}

# Total number of non-"Normal" rows per source (the denominator for the rate).
bad = {
    label: frame.query("Activity != 'Normal'").shape[0]
    for label, frame in malicious_sources.items()
}

In [None]:
# Report, per log source, how many malicious rows went undetected and what
# share of all malicious rows that represents.
print("Undetected Malicious Actions:\n=============================")

for k, v in stealthy.items():
    total_bad = bad[k]
    if total_bad == 0:
        # No malicious rows at all for this source: a rate is undefined, and
        # dividing would raise ZeroDivisionError.
        print(f'  {k}: {v.shape[0]} of {total_bad}\n\tno malicious activity recorded\n')
        continue
    stealth_rate = (v.shape[0]/total_bad) * 100
    print(f'  {k}: {v.shape[0]} of {total_bad}\n\tundetected {stealth_rate:.1f}% of time\n')