<a href="https://colab.research.google.com/github/aiforsec22/IEEEEuroSP23/blob/main/notebooks/attack-pattern-extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Installing dependencies

In [None]:
# Fetch the LADDER repository; the next cell changes into its attack_pattern/ directory.
!git clone https://github.com/aiforsec/LADDER.git

In [None]:
# Work from the attack_pattern directory; the relative paths in later cells are resolved from here.
%cd LADDER/attack_pattern/

In [None]:
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install -r requirements.txt

#### Download pretrained models

In [None]:
# Download a pretrained weight file from Google Drive.
# NOTE(review): presumably one of sent_cls.pt / entity_ext.pt (both are moved into models/ below) - confirm which.
!gdown https://drive.google.com/uc?id=1yYRNoV4SFwcS1HAgrwxNQftqQVnaXLGo

In [None]:
# Download a second pretrained weight file from Google Drive.
# NOTE(review): presumably one of sent_cls.pt / entity_ext.pt (both are moved into models/ below) - confirm which.
!gdown https://drive.google.com/uc?id=15YJgo4iqfQ7zWoHLBOnHOW4BV3hsGENY

In [None]:
# -p keeps this cell idempotent: re-running it does not fail when models/ already exists.
!mkdir -p models

In [None]:
# Move the sentence-classification weights to the path passed to inference.py below.
!mv sent_cls.pt models/sent_cls.pt

In [None]:
# Move the entity-extraction weights to the path passed to inference.py below.
!mv entity_ext.pt models/entity_ext.pt

### Extract attack patterns from a given folder with the sentence classification and entity extraction models

In [None]:
# Run inference.py with both model weights; input is taken from test_input and results are saved under test_output.
!python inference.py --entity-extraction-weight=models/entity_ext.pt --sentence-classification-weight=models/sent_cls.pt --input-doc=test_input --save-path=test_output

### Map extracted attack patterns to MITRE (enterprise) technique IDs

In [None]:
# %pip (instead of !pip) installs into the environment of the running kernel; version stays pinned.
%pip install sentence-transformers==2.2.0

In [None]:
import os
import json
from scipy import spatial
from sentence_transformers import SentenceTransformer
import pandas as pd
import nltk

In [None]:
# Sentence-embedding model; get_embedding() below calls its encode() to embed text.
bert_model = SentenceTransformer('all-mpnet-base-v2')

In [None]:
# Technique table; the next cell reads its 'ID', 'Name' and 'Description' columns.
df = pd.read_csv('enterprise-techniques.csv')

In [None]:
# attack_pattern_dict: ID -> list of [name, description] pairs
#   (first pair comes from the row carrying the ID, the rest from the
#   following rows whose ID cell is empty).
# technique_mapping: name -> ID of the group the row belongs to.
attack_pattern_dict = {}
technique_mapping = {}

prev_id = None

for _, row in df.iterrows():
    _id = row['ID']
    if not pd.isnull(_id):
        # A row with an ID starts a new group (replacing any earlier group with the same ID).
        prev_id = _id
        attack_pattern_dict[prev_id] = []
    # Rows without an ID are filed under the most recent ID seen.
    attack_pattern_dict[prev_id].append([row['Name'], row['Description']])
    technique_mapping[row['Name']] = prev_id

In [None]:
# Cache of text -> embedding, filled by get_embedding() so each distinct string is encoded once.
embedding_memo = {}

In [None]:
def get_embedding(txt):
    """Return the embedding of ``txt``, encoding it only on first use.

    Embeddings are cached in the module-level ``embedding_memo`` dict under
    the exact input string; a cache miss encodes ``txt`` with ``bert_model``
    and stores the result before returning it.
    """
    try:
        return embedding_memo[txt]
    except KeyError:
        pass
    # encode() takes a batch; we send a batch of one and unwrap it.
    vector = bert_model.encode([txt])[0]
    embedding_memo[txt] = vector
    return vector

In [None]:
def get_embedding_distance(txt1, txt2):
    """Return the cosine distance between the embeddings of two texts.

    Both texts are embedded through ``get_embedding`` (and therefore cached);
    the result is ``scipy.spatial.distance.cosine`` of the two vectors.
    """
    return spatial.distance.cosine(get_embedding(txt1), get_embedding(txt2))

In [None]:
def get_mitre_id(text):
    """Find the entry of ``attack_pattern_dict`` closest to ``text``.

    Every [title, description] pair is scored as the mean of the embedding
    distances from ``text`` to the title and to the description.

    Returns:
        ``(key, distance)`` for the lowest-scoring pair; on ties the first
        pair encountered wins. If no pair scores below the initial bound
        of 25, the result is ``(None, 25)``.
    """
    best_id, best_dist = None, 25
    for mitre_id, entries in attack_pattern_dict.items():
        for entry in entries:
            # entry[0] is the title, entry[1] the description
            title_dist = get_embedding_distance(text, entry[0])
            desc_dist = get_embedding_distance(text, entry[1])
            dist = (0.5*title_dist + 0.5*desc_dist)
            if dist < best_dist:
                best_id, best_dist = mitre_id, dist
    return best_id, best_dist

In [None]:
def remove_consec_newline(s):
    """Collapse every run of consecutive newlines in ``s`` into a single one.

    Args:
        s: Input string; may be empty.

    Returns:
        ``s`` with each run of two or more ``'\\n'`` characters replaced by
        one ``'\\n'``. All other characters are left untouched. An empty
        string is returned unchanged (the previous implementation raised
        IndexError on it).
    """
    import re  # local import keeps this cell self-contained

    return re.sub(r'\n{2,}', '\n', s)

In [None]:
def get_all_attack_patterns(fname, th=0.6):
    """Map the lines of a text file to their closest MITRE technique IDs.

    The file is read as UTF-8, runs of newlines are collapsed, tabs are
    replaced by spaces and backslash-escaped apostrophes are unescaped. The
    text is then split into sentences with NLTK and further on any remaining
    newlines; every non-blank piece is scored with ``get_mitre_id``.

    Args:
        fname: Path of the text file to process.
        th: Distance threshold; a line is kept only if its distance is
            strictly below this value.

    Returns:
        dict mapping each matched ID to a ``(distance, line)`` tuple for the
        line with the smallest distance to that ID. Empty if the file is
        empty or nothing scores below ``th``.
    """
    with open(fname, 'r', encoding='utf-8') as f:
        text = f.read()

    # Nothing to classify; also avoids handing an empty string to the helpers.
    if not text:
        return {}

    text = remove_consec_newline(text)
    text = text.replace('\t', ' ')
    # Unescape \' -> '. The earlier replace("\'", "'") was a no-op because
    # "\'" is the same one-character string as "'".
    text = text.replace("\\'", "'")

    mapped = {}
    for sent in nltk.sent_tokenize(text):
        for line in sent.split('\n'):
            # Skip empty and whitespace-only fragments: there is nothing to embed.
            if not line.strip():
                continue
            _id, dist = get_mitre_id(line)
            # Keep only the best (lowest-distance) line for each ID.
            if dist < th and (_id not in mapped or dist < mapped[_id][0]):
                mapped[_id] = (dist, line)
    return mapped

#### Print all extracted attack patterns with distance, mapped MITRE ID and title

In [None]:
# Score the sample inference output, then print one row per matched technique:
# MITRE ID, (distance, matched line), and the title stored first under that ID.
ret = get_all_attack_patterns('test_output/litepower.txt', th=0.6)

for k in ret:
    v = ret[k]
    print(k, v, attack_pattern_dict[k][0][0])

In [None]:
# Expected output of the previous cell for test_output/litepower.txt:

# T1564 (0.46478238701820374, 'use hidden spreadsheets and VBA macros to drop their first stage implant') Hide Artifacts
# T1574 (0.45225709676742554, 'Exploitation , installation and persistence') Hijack Execution Flow
# T1204 (0.39565354585647583, 'Malicious documents and droppers') User Execution
# T1589 (0.5162111669778824, 'tailored the decoy contents to the targeted victims') Gather Victim Identity Information
# T1059 (0.44848522543907166, 'payloads containing system commands are sent back to the victim in the form of PowerShell functions through HTTP GET requests') Command and Scripting Interpreter
# T1547 (0.386538028717041, 'set up registry keys for persistence') Boot or Logon Autostart Execution
# T1112 (0.407908171415329, 'Registry keys used for COM hijacking') Modify Registry
# T1588 (0.4554292559623718, 'download and deploy further malware') Obtain Capabilities
# T1546 (0.5960709452629089, 'LitePower PowerShell implant') Event Triggered Execution
# T1041 (0.5173896849155426, 'C2 communications using') Exfiltration Over C2 Channel
# T1497 (0.48617543280124664, 'conducts system reconnaissance to assess the AV software installed and the user privilege') Virtualization/Sandbox Evasion
# T1053 (0.4468293786048889, 'creation of a legitimate scheduled task to trigger “ Scripting.Dictionary ” COM programs') Scheduled Task/Job
# T1021 (0.5951826721429825, 'referencing SLMGR.VBS to trigger WINRM.VBS through COM hijacking') Remote Services
# T1007 (0.4255487024784088, 'checks for possible backdoors installed as services') System Service Discovery
# T1012 (0.4465780556201935, 'checks for the registry keys added for COM hijacking') Query Registry
# T1113 (0.572499006986618, 'takes system screenshots and saves them to % AppData % before sending them to the C2 via a POST request') Screen Capture
# T1598 (0.31141629815101624, 'potentially delivered through spear phishing') Phishing for Information