In [1]:
import os
import json
from scipy import spatial
from sentence_transformers import SentenceTransformer
import pandas as pd
import nltk



In [2]:
# Sentence-embedding model shared by the whole notebook; get_embedding()
# calls bert_model.encode() on it to turn a string into a vector.
bert_model = SentenceTransformer('all-mpnet-base-v2')

In [3]:
# Technique table. The next cell iterates its rows and reads the 'ID',
# 'Name' and 'Description' columns. The path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv('experiments-new - enterprise-techniques.csv')

In [4]:
# Group the CSV rows by ID.
#   attack_pattern_dict: ID -> list of [Name, Description] pairs
#   technique_mapping:   Name -> ID the row was filed under
# A row with a null 'ID' is treated as a continuation of the closest
# preceding row that had one, so it is appended to that ID's list.
attack_pattern_dict = {}
technique_mapping = {}

prev_id = None

for _, row in df.iterrows():
    _id = row['ID']
    name_and_description = [row['Name'], row['Description']]
    if pd.isnull(_id):
        # Continuation row: file it under the most recent non-null ID.
        attack_pattern_dict[prev_id].append(name_and_description)
    else:
        # Row with its own ID: start a fresh list for it.
        attack_pattern_dict[_id] = [name_and_description]
        prev_id = _id
    # In both branches prev_id is now the ID this row belongs to.
    technique_mapping[row['Name']] = prev_id

In [10]:
# Sanity check: number of distinct IDs collected (one key per non-null 'ID').
len(attack_pattern_dict)

191

In [20]:
# attack_pattern_dict

In [21]:
# Cache of text -> embedding, filled by get_embedding() so each distinct
# string is encoded only once. Re-running this cell empties the cache.
embedding_memo = {}

In [22]:
def get_embedding(txt):
    """Return the embedding of ``txt``, computing it at most once.

    The first request for a given string encodes it with ``bert_model`` and
    stores the vector in ``embedding_memo``; later requests for the same
    string are answered from that cache.
    """
    try:
        return embedding_memo[txt]
    except KeyError:
        pass
    # encode() takes a batch, so wrap the string and unwrap the single result.
    vector = bert_model.encode([txt])[0]
    embedding_memo[txt] = vector
    return vector

In [23]:
def get_embedding_distance(txt1, txt2):
    """Return the cosine distance between the embeddings of two strings.

    Both embeddings come from ``get_embedding`` (and therefore from its
    cache when available). Smaller values mean the texts are closer.
    """
    first_vec, second_vec = get_embedding(txt1), get_embedding(txt2)
    return spatial.distance.cosine(first_vec, second_vec)

In [37]:
import Levenshtein

def get_mitre_id(text):
    """Find the ID in ``attack_pattern_dict`` whose entry is closest to ``text``.

    Every [title, description] entry is scored as the equal-weight average of
    the embedding distance from ``text`` to the title and to the description.

    Returns:
        (best_id, best_distance). If no entry scores below the initial bound
        of 25 (for example when the dictionary is empty) the result is
        (None, 25).
    """
    best_id = None
    best_dist = 25
    for technique_id, entries in attack_pattern_dict.items():
        for entry in entries:
            title, description = entry[0], entry[1]
            to_title = get_embedding_distance(text, title)
            to_description = get_embedding_distance(text, description)
            d = (0.5*to_title + 0.5*to_description)
            # Strict comparison: on a tie the earlier entry is kept.
            if d < best_dist:
                best_id, best_dist = technique_id, d
    return best_id, best_dist

In [38]:
def get_mitre_id_list(text, th=0.6):
    """Return every ID in ``attack_pattern_dict`` that is close to ``text``.

    An ID matches when at least one of its [title, description] entries has
    an averaged embedding distance (0.5 * distance to title + 0.5 * distance
    to description) strictly below ``th``.

    Args:
        text: The sentence or phrase to look up.
        th: Distance threshold; entries scoring below it count as a match.

    Returns:
        A list of matching IDs without duplicates, in the iteration order of
        ``attack_pattern_dict``.
    """
    matched_ids = []
    for k, tech_list in attack_pattern_dict.items():
        for v in tech_list:
            # v[0] -> attack pattern title, v[1] -> description
            d = (0.5*get_embedding_distance(text, v[0]) + 0.5*get_embedding_distance(text, v[1]))
            if d < th:
                matched_ids.append(k)
                # One hit is enough for this ID; scoring its remaining
                # entries could not change the result.
                break
    return matched_ids

In [32]:
def remove_consec_newline(s):
    """Collapse every run of consecutive newlines in ``s`` to one newline.

    All other characters, including repeated non-newline characters, are
    left untouched. An empty string is returned unchanged.
    """
    if not s:
        return s
    # Collect characters in a list and join once, rather than growing a
    # string with += for every character.
    kept = [s[0]]
    for ch in s[1:]:
        if ch == '\n' and kept[-1] == '\n':
            continue
        kept.append(ch)
    return ''.join(kept)

In [39]:
def get_all_attack_patterns(fname, th=0.6):
    """Map the text in ``fname`` to IDs, keeping the best sentence per ID.

    The file is read as UTF-8, runs of newlines are collapsed, tabs become
    spaces, and the text is split into sentences with nltk and then on any
    remaining newlines. Each non-empty piece is passed to ``get_mitre_id``;
    a piece is kept when its distance is below ``th`` and is lower than the
    distance already stored for the same ID.

    Args:
        fname: Path of the text file to analyse.
        th: Maximum distance for a piece to count as a match.

    Returns:
        dict mapping ID -> (distance, piece_of_text). Empty when the file is
        empty or nothing scores below ``th``.
    """
    mapped = {}
    with open(fname, 'r', encoding='utf-8') as f:
        text = f.read()

    # An empty file has nothing to match; return before the clean-up steps,
    # which are written for non-empty text.
    if not text:
        return mapped

    text = remove_consec_newline(text)
    text = text.replace('\t', ' ')
    # NOTE(review): "\'" is the same string as "'", so this replace changes
    # nothing. If the intent was to unescape a literal backslash-apostrophe,
    # the pattern should be "\\'" - confirm before changing.
    text = text.replace("\'", "'")
    sents = []
    for sentence in nltk.sent_tokenize(text):
        # A "sentence" may still span several lines; treat each line separately.
        sents.extend(sentence.split('\n'))
    for line in sents:
        if len(line) > 0:
            _id, dist = get_mitre_id(line)
            # Keep only the closest piece of text seen so far for each ID.
            if dist < th and (_id not in mapped or dist < mapped[_id][0]):
                mapped[_id] = dist, line
    return mapped

In [40]:
def get_all_attack_patterns_list(fname, th=0.6):
    """Map the text in ``fname`` to every ID that matches any of its pieces.

    The file is read as UTF-8, runs of newlines are collapsed, tabs become
    spaces, and the text is split into sentences with nltk and then on any
    remaining newlines. Each non-empty piece is passed to
    ``get_mitre_id_list``; every ID it returns is recorded with that piece.

    Args:
        fname: Path of the text file to analyse.
        th: Distance threshold forwarded to ``get_mitre_id_list``.

    Returns:
        dict mapping ID -> piece_of_text. When several pieces match the same
        ID, the last one encountered is kept. Empty when the file is empty
        or nothing matches.
    """
    mapped = {}
    with open(fname, 'r', encoding='utf-8') as f:
        text = f.read()

    # An empty file has nothing to match; return before the clean-up steps,
    # which are written for non-empty text.
    if not text:
        return mapped

    text = remove_consec_newline(text)
    text = text.replace('\t', ' ')
    # NOTE(review): "\'" is the same string as "'", so this replace changes
    # nothing. If the intent was to unescape a literal backslash-apostrophe,
    # the pattern should be "\\'" - confirm before changing.
    text = text.replace("\'", "'")
    sents = []
    for sentence in nltk.sent_tokenize(text):
        # A "sentence" may still span several lines; treat each line separately.
        sents.extend(sentence.split('\n'))
    for line in sents:
        if len(line) > 0:
            # get_mitre_id_list is the variant that accepts a threshold and
            # returns a list of IDs; get_mitre_id takes only the text.
            for mitre_id in get_mitre_id_list(line, th):
                mapped[mitre_id] = line
    return mapped

In [42]:
# Match each piece of the report against the technique table; the result
# maps ID -> (distance, matched text).
ret = get_all_attack_patterns('darpa-out/nation-state.txt', th=0.6)

# One line per matched ID: the ID, its (distance, text) pair, and the name
# of the first entry stored under that ID.
for k in ret:
    v = ret[k]
    print(k, v, attack_pattern_dict[k][0][0])

T1505 (0.5310649275779724, 'exploit the webserver hosted on FreeBSD') Server Software Component
T1040 (0.5937256217002869, 'monitor connections and network activity while residing on the FreeBSD host') Network Sniffing
T1592 (0.3681303858757019, 'target and exploit the discovered hosts to exfil proprietary data') Gather Victim Host Information
T1588 (0.4903438538312912, 'exploit it') Obtain Capabilities
T1134 (0.38940680027008057, 'ability to create a new elevated process') Access Token Manipulation
T1055 (0.30760031938552856, 'ability to inject a .dll or .so into a process') Process Injection


In [43]:
# Match each piece of the report against the technique table; the result
# maps ID -> (distance, matched text).
ret = get_all_attack_patterns('darpa-out/3.txt', th=0.6)

# One line per matched ID: the ID, its (distance, text) pair, and the name
# of the first entry stored under that ID.
for k in ret:
    v = ret[k]
    print(k, v, attack_pattern_dict[k][0][0])

T1185 (0.47015178203582764, 'exploiting Firefox 54.0.1 using a malicious ad server') Browser Session Hijacking
T1204 (0.5067747831344604, 'attacker used putfile to') User Execution
T1574 (0.5645609796047211, 'executed the drakon implant from the target disk using a privilege escalated execution capability') Hijack Execution Flow
T1014 (0.5576980113983154, 'new root drakon implant process connected out to the operator console to give') Rootkit
T1546 (0.5680103749036789, 'putfile profile') Event Triggered Execution
