# Imports and Definitions

In [2]:
# imports
import pandas as pd
import numpy as np
import scipy
import os.path
import json
import collections

In [3]:
# read file lines into a list
def get_lines_from_file(filename):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        filename: Path of the file to open for reading.

    Returns:
        A list holding one stripped string per line of the file, in file order.
    """
    stripped_lines = []
    with open(filename) as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

# display all rows in a dataframe
def displaydf(df):
    """Show an object with pandas' row and column display limits lifted.

    The limits are only removed for the duration of the call; the previous
    option values are restored afterwards by the context manager.

    Args:
        df: Object passed to the ``display`` function available in the
            notebook namespace.
    """
    unlimited = ("display.max_rows", None, "display.max_columns", None)
    with pd.option_context(*unlimited):
        display(df)

# Load Data

In [5]:
# Directory that holds the input CSV files read by the cells below
# (techniques.csv, splunk_rules.csv, elastic_rules.csv, sigma_rules.csv).
# Edit this path to point at your local copy of the data.
data_filepath = '../data'

In [6]:
# Technique reference table. The helper functions below read its 'technique'
# and 'tactics' columns through this notebook-level name.
techniques = pd.read_csv(os.path.join(data_filepath, "techniques.csv"))

In [7]:
def parse_threat_column(threat):
    """Extract the distinct technique IDs from one Elastic ``rule.threat`` cell.

    The cell text is parsed as a list of dicts; each dict may carry a
    ``'technique'`` list whose entries have an ``'id'`` key.

    Args:
        threat: Raw cell value. Anything that is not a string (for example the
            float NaN pandas uses for empty CSV fields) is treated as missing.

    Returns:
        A sorted list of unique technique IDs, or ``np.nan`` when the cell is
        missing or names no technique.

    Raises:
        json.JSONDecodeError: If the text is not valid JSON after single quotes
            have been replaced by double quotes.
    """
    if not isinstance(threat, str):
        return np.nan

    # Single quotes are swapped for double quotes so that Python-repr-style
    # text can be parsed as JSON.
    threat_entries = json.loads(threat.replace("'", '"'))
    technique_ids = {
        technique['id']
        for entry in threat_entries
        for technique in entry.get('technique', [])
    }

    # Sorting makes the result reproducible; set iteration order is not.
    # np.nan is used because the capitalised np.NaN alias was removed in NumPy 2.0.
    return sorted(technique_ids) if technique_ids else np.nan

def load_ids(x):
    """Decode a stringified list into a Python object; pass other values through.

    Args:
        x: Cell value. Strings have their single quotes replaced by double
            quotes and are then parsed as JSON; any other value is returned
            unchanged.

    Returns:
        The parsed JSON value for string input, otherwise ``x`` itself.
    """
    if type(x) != str:
        return x
    json_text = x.replace("'", '"')
    return json.loads(json_text)

def drop_subtechniques(x):
    """Collapse technique IDs to their first five characters.

    Args:
        x: Iterable of ID strings, or a float (returned untouched, which lets
            float NaN placeholders pass through).

    Returns:
        A set containing the first five characters of every ID (for example
        'T1059.001' becomes 'T1059'), or ``x`` itself when it is a float.
    """
    if type(x) == float:
        return x
    return {technique_id[0:5] for technique_id in x}

def map_tactic(technique):
    """Look up the tactic string recorded for a technique ID.

    Reads the notebook-level ``techniques`` dataframe (columns ``'technique'``
    and ``'tactics'``).

    Args:
        technique: Technique ID, compared for equality with the 'technique'
            column.

    Returns:
        The first distinct 'tactics' value among the matching rows, or the
        string ``"unknown"`` (after printing a notice) when no row matches.
    """
    matching_tactics = techniques.loc[techniques['technique'] == technique, 'tactics'].drop_duplicates()
    if matching_tactics.empty:
        # Only the "no such technique" case is treated as best-effort. The
        # earlier bare `except:` also hid unrelated failures such as a missing
        # column or a keyboard interrupt.
        print("error on technique", technique)
        return "unknown"
    return matching_tactics.values[0]

def get_sigma_techniques(tags):
    """Collect the known parent technique IDs referenced by a Sigma rule's tags.

    Only tags starting with ``'attack.t'`` are considered. Characters 7-11 of
    such a tag are upper-cased (``'attack.t1059.001'`` gives ``'T1059'``) and
    kept when that ID appears in the 'technique' column of the notebook-level
    ``techniques`` dataframe.

    Args:
        tags: Iterable of tag strings, or a float (e.g. NaN) when the rule has
            no tags.

    Returns:
        A sorted list of unique technique IDs; empty when ``tags`` is a float
        or nothing matches.
    """
    if isinstance(tags, float):
        return []

    # Build the lookup set once per call instead of once per tag.
    known_techniques = set(techniques['technique'])

    rule_techniques = set()
    for tag in tags:
        if tag.startswith('attack.t'):
            technique = tag[7:12].upper()
            if technique in known_techniques:
                rule_techniques.add(technique)

    # Sorted so that the list order is reproducible between runs.
    return sorted(rule_techniques)


In [8]:
# --- Splunk ---------------------------------------------------------------
# Keep only rules that reference at least one technique. Row labels from the
# CSV are preserved (the index is not reset after filtering).
splunk = pd.read_csv(os.path.join(data_filepath, "splunk_rules.csv"))
splunk['rule_index'] = splunk['rule_index'].apply(lambda x: 'splunk' + str(x))
# IDs are truncated to five characters, so this column holds parent techniques only.
splunk['mitre_attack_id'] = splunk['tags.mitre_attack_id'].apply(load_ids).apply(drop_subtechniques).apply(lambda x: list(x) if type(x) == set else [])
splunk = splunk[splunk['mitre_attack_id'].apply(lambda x: len(x) > 0)]

# --- Elastic --------------------------------------------------------------
# Keep production-maturity rules that reference at least one technique.
elastic = pd.read_csv(os.path.join(data_filepath, "elastic_rules.csv"))
# .copy() yields an independent frame, so the column assignments below do not
# write into a slice of the unfiltered frame (SettingWithCopyWarning).
elastic = elastic[elastic['metadata.maturity'] == 'production'].copy()
elastic['rule_index'] = elastic['rule_index'].apply(lambda x: 'elastic' + str(x))
elastic['mitre_attack_id'] = elastic['rule.threat'].apply(parse_threat_column)
elastic = elastic[pd.notna(elastic['mitre_attack_id'])]

# --- Sigma ----------------------------------------------------------------
# Keep experimental/test/stable rules that reference at least one known technique.
sigma = pd.read_csv(os.path.join(data_filepath, "sigma_rules.csv"))
sigma = sigma.rename({'Unnamed: 0': 'rule_index'}, axis=1)
sigma['rule_index'] = sigma['rule_index'].apply(lambda x: 'sigma' + str(x))
sigma = sigma[sigma['status'].isin(['experimental', 'test', 'stable'])].copy()
sigma['mitre_attack_id'] = sigma['tags'].apply(lambda tags: get_sigma_techniques(load_ids(tags)))
sigma = sigma[sigma['mitre_attack_id'].apply(lambda x: len(x) > 0)]

In [9]:
# Tactic names written the same way as the '|'-separated entries of the
# techniques table's 'tactics' column.
# NOTE(review): presumably used to order tactics in the figures; it is not
# referenced by the cells shown here - confirm against the plotting code.
tactic_order = ['reconnaissance', 'resource-development', 'initial-access',
                'execution', 'persistence', 'privilege-escalation',
                'defense-evasion', 'credential-access', 'discovery',
                'lateral-movement', 'collection', 'command-and-control',
                'exfiltration', 'impact']

# Implemented Techniques Per Ruleset Across Tactics (Figures 1 & 7)

In [10]:
# Count, for every technique, how many Splunk rules reference it.
ids = splunk['mitre_attack_id']
ids_no_subtechniques = ids.apply(drop_subtechniques)

# Tally over the IDs exactly as stored in the 'mitre_attack_id' column.
splunk_mitre_id_counts = dict(collections.Counter(
    technique
    for row in ids
    if type(row) != float
    for technique in row
))

# Tally after truncating every ID to its first five characters.
splunk_mitre_counts = dict(collections.Counter(
    technique
    for row in ids_no_subtechniques
    if type(row) != float
    for technique in row
))

In [11]:
# Per-technique Splunk rule counts as Series, ordered from fewest to most rules.
splunk_mitre_counts_df = pd.Series(data=splunk_mitre_counts, name='count').sort_values()
splunk_mitre_id_counts_df = pd.Series(data=splunk_mitre_id_counts, name='count').sort_values()

# One row per technique: ID, rule count, and the tactic string from map_tactic.
splunk_tactic_df = (
    splunk_mitre_counts_df
    .reset_index()
    .rename(columns={'index': 'technique'})
)
splunk_tactic_df['tactic'] = splunk_tactic_df['technique'].apply(map_tactic)
splunk_tactic_df

Unnamed: 0,technique,count,tactic
0,T1040,1,credential-access|discovery
1,T1187,1,credential-access
2,T1491,1,impact
3,T1647,1,defense-evasion
4,T1095,1,command-and-control
...,...,...,...
95,T1078,39,defense-evasion|persistence|privilege-escalati...
96,T1548,50,privilege-escalation|defense-evasion
97,T1059,63,execution
98,T1562,63,defense-evasion


In [12]:
# Count, for every technique, how many Elastic rules reference it.
# Float entries (NaN placeholders) are skipped.
elastic_mitre_counts = dict(collections.Counter(
    technique
    for row in elastic['mitre_attack_id']
    if type(row) != float
    for technique in row
))

In [13]:
# Per-technique Elastic rule counts as a Series, ordered from fewest to most rules.
elastic_mitre_counts_df = pd.Series(data=elastic_mitre_counts, name='count').sort_values()

# One row per technique: ID, rule count, and the tactic string from map_tactic.
elastic_tactic_df = (
    elastic_mitre_counts_df
    .reset_index()
    .rename(columns={'index': 'technique'})
)
elastic_tactic_df['tactic'] = elastic_tactic_df['technique'].apply(map_tactic)
elastic_tactic_df

Unnamed: 0,technique,count,tactic
0,T1102,1,command-and-control
1,T1550,1,defense-evasion|lateral-movement
2,T1006,1,defense-evasion
3,T1497,1,defense-evasion|discovery
4,T1049,1,discovery
...,...,...,...
87,T1562,25,defense-evasion
88,T1543,26,persistence|privilege-escalation
89,T1003,31,credential-access
90,T1021,32,lateral-movement


In [14]:
# Count, for every technique, how many Sigma rules reference it.
sigma_mitre_counts = dict(collections.Counter(
    technique
    for row in sigma.mitre_attack_id
    for technique in row
))

In [15]:
# Per-technique Sigma rule counts as a Series, ordered from fewest to most rules.
sigma_mitre_counts_df = pd.Series(data=sigma_mitre_counts, name='count').sort_values()

# One row per technique: ID, rule count, and the tactic string from map_tactic.
sigma_tactic_df = (
    sigma_mitre_counts_df
    .reset_index()
    .rename(columns={'index': 'technique'})
)
sigma_tactic_df['tactic'] = sigma_tactic_df['technique'].apply(map_tactic)
sigma_tactic_df

Unnamed: 0,technique,count,tactic
0,T1221,1,defense-evasion
1,T1195,1,initial-access
2,T1497,1,defense-evasion|discovery
3,T1599,1,defense-evasion
4,T1010,1,discovery
...,...,...,...
146,T1027,99,defense-evasion
147,T1562,107,defense-evasion
148,T1003,122,credential-access
149,T1218,170,defense-evasion


In [16]:
def _count_techniques_per_tactic(tactic_df):
    """Count techniques per tactic; a technique listed under several tactics counts once for each."""
    return pd.Series(collections.Counter("|".join(tactic_df.tactic).split("|")))


def _techniques_in_tactic(tactic_df, tactic):
    """Return the set of techniques whose '|'-separated tactic string lists `tactic` as a whole entry."""
    # Whole-entry comparison replaces str.contains, which does a regex
    # substring search and could match one tactic name inside another.
    belongs = tactic_df.tactic.str.split("|").apply(lambda names: tactic in names)
    return set(tactic_df[belongs].technique.values)


splunk_technique_cnt_per_tactic_df = _count_techniques_per_tactic(splunk_tactic_df)
elastic_technique_cnt_per_tactic_df = _count_techniques_per_tactic(elastic_tactic_df)
sigma_technique_cnt_per_tactic_df = _count_techniques_per_tactic(sigma_tactic_df)

# keys= labels the columns directly. The former names= argument did not set
# the column labels (a rename from 0/1/2 was still needed afterwards) and
# listed a 'Snort' ruleset that is not part of this comparison.
compare_technique_cnt_per_tactic_df = pd.concat(
    [splunk_technique_cnt_per_tactic_df, elastic_technique_cnt_per_tactic_df, sigma_technique_cnt_per_tactic_df],
    axis=1, keys=['Splunk', 'Elastic', 'Sigma']).fillna(0).astype(int)
displaydf(compare_technique_cnt_per_tactic_df)

# For every tactic, record the technique sets per ruleset plus their union and intersection.
compare_techniques_per_tactic = {}
for tactic in compare_technique_cnt_per_tactic_df.index:
    splunk_tactic_techniques = _techniques_in_tactic(splunk_tactic_df, tactic)
    elastic_tactic_techniques = _techniques_in_tactic(elastic_tactic_df, tactic)
    sigma_tactic_techniques = _techniques_in_tactic(sigma_tactic_df, tactic)
    print(tactic)
    print('Splunk:', splunk_tactic_techniques)
    print('Elastic:', elastic_tactic_techniques)
    print('Sigma:', sigma_tactic_techniques)
    print()
    compare_techniques_per_tactic[tactic] = {
        'splunk': splunk_tactic_techniques,
        'elastic': elastic_tactic_techniques,
        'sigma': sigma_tactic_techniques,
        'union': splunk_tactic_techniques | elastic_tactic_techniques | sigma_tactic_techniques,
        'intersection': splunk_tactic_techniques & elastic_tactic_techniques & sigma_tactic_techniques,
    }

compare_techniques_per_tactic_df = pd.DataFrame(compare_techniques_per_tactic).T

# Set sizes per tactic. Column-wise Series.map works on every pandas version,
# whereas DataFrame.applymap is deprecated as of pandas 2.1.
compare_techniques_per_tactic_df.apply(lambda col: col.map(len))

Unnamed: 0,Splunk,Elastic,Sigma
credential-access,10,9,13
discovery,15,15,25
impact,10,3,12
defense-evasion,25,24,32
command-and-control,6,8,14
reconnaissance,4,0,4
exfiltration,4,2,6
execution,6,9,9
collection,6,6,13
privilege-escalation,11,12,12


credential-access
Splunk: {'T1558', 'T1187', 'T1003', 'T1212', 'T1040', 'T1552', 'T1621', 'T1555', 'T1110', 'T1556'}
Elastic: {'T1558', 'T1539', 'T1056', 'T1003', 'T1212', 'T1552', 'T1555', 'T1110', 'T1556'}
Sigma: {'T1557', 'T1558', 'T1187', 'T1539', 'T1056', 'T1003', 'T1212', 'T1040', 'T1555', 'T1552', 'T1528', 'T1110', 'T1556'}

discovery
Splunk: {'T1087', 'T1201', 'T1082', 'T1083', 'T1526', 'T1033', 'T1497', 'T1040', 'T1016', 'T1018', 'T1482', 'T1580', 'T1124', 'T1069', 'T1049'}
Elastic: {'T1087', 'T1135', 'T1120', 'T1614', 'T1518', 'T1082', 'T1018', 'T1069', 'T1033', 'T1497', 'T1016', 'T1482', 'T1057', 'T1049', 'T1046'}
Sigma: {'T1087', 'T1082', 'T1124', 'T1135', 'T1007', 'T1069', 'T1040', 'T1057', 'T1217', 'T1201', 'T1614', 'T1518', 'T1018', 'T1033', 'T1615', 'T1012', 'T1046', 'T1120', 'T1526', 'T1083', 'T1497', 'T1016', 'T1482', 'T1010', 'T1049'}

impact
Splunk: {'T1499', 'T1529', 'T1498', 'T1491', 'T1486', 'T1490', 'T1489', 'T1531', 'T1561', 'T1485'}
Elastic: {'T1490', 'T1565',

Unnamed: 0,splunk,elastic,sigma,union,intersection
credential-access,10,9,13,14,7
discovery,15,15,25,26,9
impact,10,3,12,13,2
defense-evasion,25,24,32,34,21
command-and-control,6,8,14,14,4
reconnaissance,4,0,4,4,0
exfiltration,4,2,6,6,2
execution,6,9,9,10,6
collection,6,6,13,14,4
privilege-escalation,11,12,12,12,11


The Splunk and Elastic data in the table above is used to generate Figure 1 in the paper. The Sigma data is incorporated in Figure 7.

# Rules Per Technique (Figures 2, 3, 8, & 9)

In [17]:
# Rule counts per technique and ruleset, with one row for every technique in
# the reference table; techniques without rules end up with 0.
technique_coverage = pd.concat(
    [splunk_mitre_counts_df, elastic_mitre_counts_df, sigma_mitre_counts_df],
    axis=1, keys=['Splunk', 'Elastic', 'Sigma'],
).reindex(techniques['technique'].drop_duplicates().values)

# The reindex above already restricts the rows to the reference techniques.
# Re-selecting them through a set (as was done before) only shuffled the rows
# into hash order, which made the order of tied rows differ between runs.
# A stable sort keeps ties in reference-table order.
technique_coverage = (
    technique_coverage
    .fillna(0)
    .astype(int)
    .assign(sum=technique_coverage.sum(axis=1))
    .sort_values(by='sum', kind='stable')[['Splunk', 'Elastic', 'Sigma']]
)
technique_coverage

Unnamed: 0,Splunk,Elastic,Sigma
T1025,0,0,0
T1213,0,0,0
T1597,0,0,0
T1609,0,0,0
T1600,0,0,0
...,...,...,...
T1548,50,18,68
T1003,35,31,122
T1562,63,25,107
T1218,75,16,170


## ATT&CK Technique Density

Techniques with exactly one associated rule:

In [18]:
# Number of techniques covered by exactly one rule, printed per ruleset.
for ruleset in ('Splunk', 'Elastic', 'Sigma'):
    exactly_one = technique_coverage[ruleset] == 1
    print(len(technique_coverage[exactly_one]))

19
25
26


In [None]:
Techniques with 1 to 5 associated rules:

In [19]:
# Number of techniques covered by 1 to 5 rules (inclusive), printed per ruleset.
# The counts are integers, so between(1, 5) selects the same rows as `> 0` and `<= 5`.
for ruleset in ('Splunk', 'Elastic', 'Sigma'):
    one_to_five = technique_coverage[ruleset].between(1, 5)
    print(len(technique_coverage[one_to_five]))

57
62
67


In [None]:
Techniques with at least 50 associated rules:

In [18]:
# Number of techniques covered by 50 or more rules, printed per ruleset.
for ruleset in ('Splunk', 'Elastic', 'Sigma'):
    fifty_or_more = technique_coverage[ruleset].ge(50)
    print(len(technique_coverage[fifty_or_more]))

4
0
17


In [19]:
def get_total_rules(ruleset_name):
    """Return the number of rows in the notebook-level frame for a ruleset.

    Args:
        ruleset_name: One of 'Elastic', 'Splunk' or 'Sigma' (case-sensitive).

    Returns:
        ``len`` of the matching ``elastic`` / ``splunk`` / ``sigma`` object,
        or 0 for any other name.
    """
    ruleset = None
    if ruleset_name == 'Elastic':
        ruleset = elastic
    elif ruleset_name == 'Splunk':
        ruleset = splunk
    elif ruleset_name == 'Sigma':
        ruleset = sigma
    return 0 if ruleset is None else len(ruleset)

In [20]:
# Share of each ruleset's rules that reference a technique (count divided by
# ruleset size). The ruleset size is looked up once per column and the whole
# column is divided at once, instead of calling get_total_rules for every cell.
technique_coverage_percent = technique_coverage.apply(lambda col: col / get_total_rules(col.name))

# Order techniques by their summed share across rulesets (ascending); the
# helper 'sum' column is dropped again by the final column selection.
technique_coverage_percent = (
    technique_coverage_percent
    .assign(sum=technique_coverage_percent.sum(axis=1))
    .sort_values(by='sum')[['Splunk', 'Elastic', 'Sigma']]
)
technique_coverage_percent

Unnamed: 0,Splunk,Elastic,Sigma
T1111,0.000000,0.000000,0.000000
T1011,0.000000,0.000000,0.000000
T1606,0.000000,0.000000,0.000000
T1609,0.000000,0.000000,0.000000
T1585,0.000000,0.000000,0.000000
...,...,...,...
T1548,0.054885,0.038055,0.030979
T1003,0.038419,0.065539,0.055581
T1562,0.069155,0.052854,0.048747
T1218,0.082327,0.033827,0.077449


In [None]:
Figures 2 and 8 contain the above table graphed as a CDF.
Figures 3 and 9 plot the top 10 techniques across the relevant rulesets.

## Consistency in Technique Ranking (Spearman coefficient)

In [None]:

# Spearman rank correlation of per-technique rule shares for each pair of rulesets.
for left, right in (('Splunk', 'Elastic'), ('Splunk', 'Sigma'), ('Elastic', 'Sigma')):
    left_shares = technique_coverage_percent[left].values
    right_shares = technique_coverage_percent[right].values
    print(scipy.stats.spearmanr(left_shares, right_shares))


# Techniques Per Rule (Figure 10)

In [21]:
# Stack the (rule_index, mitre_attack_id) pairs of all three rulesets.
rule_techniques = pd.concat(
    [frame[['rule_index', 'mitre_attack_id']] for frame in (splunk, elastic, sigma)]
)

# Keep rules whose ID cell is a non-empty list; non-list cells count as zero techniques.
has_techniques = rule_techniques.mitre_attack_id.apply(lambda ids: len(ids) if type(ids) == list else 0) > 0
rule_techniques = rule_techniques[has_techniques]

# Number of techniques per rule, indexed by the prefixed rule identifier.
rules_to_num_tags = (
    rule_techniques
    .set_index('rule_index')['mitre_attack_id']
    .apply(lambda ids: len(ids) if type(ids) == list else 0)
)
rules_to_num_tags

rule_index
splunk0      1
splunk1      1
splunk2      1
splunk3      1
splunk4      1
            ..
sigma2484    1
sigma2497    3
sigma2498    3
sigma2504    1
sigma2513    1
Name: mitre_attack_id, Length: 3579, dtype: int64

In [22]:
# How many rules (all three rulesets combined) carry 1, 2, 3, ... techniques.
rules_to_num_tags.value_counts()

1    3018
2     441
3      84
4      24
5       8
6       2
9       1
8       1
Name: mitre_attack_id, dtype: int64

In [23]:
# Percentage of each ruleset's rules that carry N techniques. Rulesets are
# told apart by the prefix that was put in front of every rule_index.
pd.concat(
    [
        rules_to_num_tags[rules_to_num_tags.index.str.startswith(prefix)]
        .pipe(lambda counts: counts.value_counts() * 100 / len(counts))
        .rename(label)
        for prefix, label in (('splunk', 'Splunk'), ('elastic', 'Elastic'), ('sigma', 'Sigma'))
    ],
    axis=1,
).sort_index().fillna(0).rename_axis('num_techniques_per_rule')

Unnamed: 0_level_0,Splunk,Elastic,Sigma
num_techniques_per_rule,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,91.877058,80.12685,82.095672
2,6.915477,17.547569,13.439636
3,1.097695,1.479915,3.052392
4,0.0,0.634249,0.95672
5,0.0,0.211416,0.318907
6,0.109769,0.0,0.045558
8,0.0,0.0,0.045558
9,0.0,0.0,0.045558


# Confidence, Risk, and Severity (Figure 4 & Supplementary Materials)


In [27]:
# Flatten the Splunk rules to one row per (rule, technique ID). Every output
# row is a copy of the rule's row plus 'mitre_id' (the ID as listed in
# 'tags.mitre_attack_id') and 'mitre_technique' (its first five characters).
splunk_flattened_rows = []

ids = splunk['tags.mitre_attack_id'].apply(lambda x: load_ids(x))
ids_no_subtechniques = ids.apply(drop_subtechniques)

# Iterate by index label, not by position. `splunk` was filtered without
# resetting its index, so position i and label i can refer to different
# rules. The earlier enumerate()/.loc[i] pairing attached IDs to the wrong
# rule or raised KeyError, and a bare `except: pass` hid both.
for label, row in ids.items():
    if type(row) != float:
        rule_row = splunk.loc[label]
        for technique in row:
            flat_row = rule_row.copy()
            flat_row.loc['mitre_id'] = technique
            flat_row.loc['mitre_technique'] = technique[0:5]
            splunk_flattened_rows.append(flat_row)

splunk_flattened = pd.DataFrame(splunk_flattened_rows)

In [28]:
# Flatten the Elastic rules to one row per (rule, technique ID). Every output
# row is a copy of the rule's row plus 'mitre_id' and 'mitre_technique'
# (the first five characters of the ID).
elastic_flattened_rows = []

ids = elastic['mitre_attack_id'].apply(load_ids)
ids_no_subtechniques = ids.apply(drop_subtechniques)
for label, technique_ids in ids.items():
    if type(technique_ids) == float:
        # Float cells (NaN placeholders) carry no techniques.
        continue
    for technique in technique_ids:
        flat_row = elastic.loc[label].copy()
        flat_row.loc['mitre_id'] = technique
        flat_row.loc['mitre_technique'] = technique[0:5]
        elastic_flattened_rows.append(flat_row)

elastic_flattened = pd.DataFrame(elastic_flattened_rows)

## Risk - Splunk, Elastic

In [29]:
# The ten techniques with the largest summed rule share, largest first
# (technique_coverage_percent is sorted ascending, so they sit at its end).
top10_techniques = list(reversed(technique_coverage_percent.index.values[-10:]))

# Risk scores of the Splunk and Elastic rules, one row per (rule, technique).
data = pd.concat([
    splunk_flattened[['mitre_technique', 'tags.risk_score']].assign(ruleset='Splunk').rename({'tags.risk_score': 'risk_score'}, axis=1),
    elastic_flattened[['mitre_technique', 'rule.risk_score']].assign(ruleset='Elastic').rename({'rule.risk_score': 'risk_score'}, axis=1),
])
data = data[data['mitre_technique'].isin(top10_techniques)].set_index('mitre_technique')

# Arrange the rows in top-10 order. The ranking also counts Sigma rules, so a
# top-10 technique may have no Splunk/Elastic rows; only labels that occur
# are requested because .loc raises KeyError for absent ones.
data = data.loc[[technique for technique in top10_techniques if technique in data.index]].reset_index()
data

Unnamed: 0,mitre_technique,risk_score,ruleset
0,T1059,20.0,Splunk
1,T1059,27.0,Splunk
2,T1059,20.0,Splunk
3,T1059,20.0,Splunk
4,T1059,25.0,Splunk
...,...,...,...
912,T1547,47.0,Elastic
913,T1547,73.0,Elastic
914,T1547,47.0,Elastic
915,T1547,47.0,Elastic


## Severity - Elastic

In [30]:
# The ten techniques with the largest summed rule share, largest first
# (technique_coverage_percent is sorted ascending, so they sit at its end).
top10_techniques = list(reversed(technique_coverage_percent.index.values[-10:]))

# Elastic severities, one row per (rule, technique), with the textual labels
# replaced by numeric scores.
data = elastic_flattened[['mitre_technique', 'rule.severity']].assign(ruleset='Elastic').\
                  replace(to_replace={'critical': 9.9, 'high': 7.3, 'medium': 4.7, 'low': 2.1}).rename({'rule.severity': 'severity'}, axis=1)
data = data[data['mitre_technique'].isin(top10_techniques)].set_index('mitre_technique')

# Arrange the rows in top-10 order. The ranking spans all rulesets, so a
# top-10 technique may have no Elastic rows; only labels that occur are
# requested because .loc raises KeyError for absent ones.
data = data.loc[[technique for technique in top10_techniques if technique in data.index]].reset_index()
data

Unnamed: 0,mitre_technique,severity,ruleset
0,T1059,7.3,Elastic
1,T1059,7.3,Elastic
2,T1059,4.7,Elastic
3,T1059,7.3,Elastic
4,T1059,4.7,Elastic
...,...,...,...
242,T1547,4.7,Elastic
243,T1547,7.3,Elastic
244,T1547,4.7,Elastic
245,T1547,4.7,Elastic


## Confidence - Splunk

In [31]:
# The ten techniques with the largest summed rule share, largest first
# (technique_coverage_percent is sorted ascending, so they sit at its end).
top10_techniques = list(reversed(technique_coverage_percent.index.values[-10:]))

# Splunk confidence values, one row per (rule, technique).
data = splunk_flattened[['mitre_technique', 'tags.confidence']].assign(ruleset='Splunk').rename({'tags.confidence': 'confidence'}, axis=1)
data = data[data['mitre_technique'].isin(top10_techniques)].set_index('mitre_technique')

# Arrange the rows in top-10 order. The ranking spans all rulesets, so a
# top-10 technique may have no Splunk rows; only labels that occur are
# requested because .loc raises KeyError for absent ones.
data = data.loc[[technique for technique in top10_techniques if technique in data.index]].reset_index()
data

Unnamed: 0,mitre_technique,confidence,ruleset
0,T1059,40,Splunk
1,T1059,30,Splunk
2,T1059,40,Splunk
3,T1059,40,Splunk
4,T1059,50,Splunk
...,...,...,...
665,T1547,70,Splunk
666,T1547,70,Splunk
667,T1547,50,Splunk
668,T1547,100,Splunk


## Criticality Level - Sigma

In [32]:
# Flatten the Sigma rules to one [technique, level] pair per (rule, technique).
sigma_level_df = sigma[['mitre_attack_id', 'level']]
sigma_level_flat = [
    [technique, level]
    for technique_ids, level in zip(sigma_level_df['mitre_attack_id'], sigma_level_df['level'])
    for technique in technique_ids
]
sigma_level_flat_df = pd.DataFrame(sigma_level_flat, columns=['mitre_attack_id', 'level'])

# Numeric score for each Sigma level label; a label missing from the map raises KeyError.
sigma_level_map = {'critical': 9.9, 'high': 7.3, 'medium': 4.7, 'low': 2.1, 'informational': 0}
sigma_level_flat_df['level'] = sigma_level_flat_df['level'].apply(lambda label: sigma_level_map[label])
sigma_level_flat_df['ruleset'] = 'Sigma'
sigma_level_flat_df

Unnamed: 0,mitre_attack_id,level,ruleset
0,T1219,9.9,Sigma
1,T1203,9.9,Sigma
2,T1204,7.3,Sigma
3,T1558,9.9,Sigma
4,T1003,9.9,Sigma
...,...,...,...
2730,T1068,7.3,Sigma
2731,T1190,7.3,Sigma
2732,T1203,7.3,Sigma
2733,T1574,4.7,Sigma
