In [1]:
import numpy as np
import gzip
import json
import pandas as pd
import os
import sys
import urllib  
import re  
import nltk
import gensim

from sklearn.manifold import TSNE, MDS, Isomap
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt

# Step 0:  Helper Functions

In [2]:
def extract(json):
    """
    Pull the fields used downstream out of one NVD CVE item.

    input: json object (a dict for a single entry of 'CVE_Items'; note that the
           parameter name shadows the `json` module inside this function)
    output: (cve description, cve published date, cve ID)

    The description is read from json['cve']['description'] first; if that path
    is missing, json['description'] is used instead.
    Raises KeyError/IndexError/TypeError when neither layout is present or when
    'publishedDate' / the CVE id cannot be found.
    """
    try:
        description = json['cve']['description']['description_data'][0]['value']
    except (KeyError, IndexError, TypeError):
        # Only lookup failures trigger the fallback layout; anything else
        # (e.g. KeyboardInterrupt) is no longer swallowed.
        description = json['description']['description_data'][0]['value']
    return (description,
            pd.to_datetime(json['publishedDate']),
            json['cve']['CVE_data_meta']['ID'])

    
#replace_dots: get rid of any extension '.'s so they are not interpreted as full-stops
def replace_dots(text):
    """
    Replace every '.' that is glued to the following character with '_'.

    A dot is kept when it is the last character of the text or when the next
    character is a space, a double quote or a single quote. Any other follower
    (letters, digits, another dot, a newline, ...) turns the dot into '_',
    e.g. 'file.txt now.' -> 'file_txt now.'.

    text: string to process
    returns: the processed string; input that cannot be processed as text
             (e.g. None or NaN) is returned unchanged, as before.
    """
    try:
        # The lookahead demands a following character that is not ' ', '"' or "'",
        # so a trailing dot never matches. One linear pass replaces the previous
        # slice-and-rebuild loop.
        return re.sub(r'\.(?=[^ "\'])', '_', text)
    except TypeError:
        # Best-effort behaviour kept from the original: non-text input passes through.
        return text

    
def remove_urls(text):
    """
    Strip http(s) URLs from text, together with any non-space characters glued
    in front of them (e.g. the tail of a markdown link such as 'name](http://...)').

    The leading run of non-space characters is optional (`\\S*`), so a bare URL
    that follows whitespace or starts the text is removed as well; the previous
    `\\S+` required at least one character before 'http' and left such URLs in place.

    text: string to process
    returns: text with the matched spans deleted
    """
    return re.sub(r'\[?\S*\]?\(?https?://\S+\)?', '', text)

def remove_citations(text):
    """
    Remove ATT&CK-style citation markers such as '(Citation: LeeBeaconing)'.

    The citation body may contain spaces ('(Citation: Writing Bad Malware for OSX)');
    it runs up to the first closing parenthesis. The previous `\\S+` body only
    matched single-token citations and left multi-word ones in the text.

    text: string to process
    returns: text with every '(Citation: ...)' / '(Citations: ...)' marker deleted
    """
    return re.sub(r'\(Citations?: [^)]+\)', '', text)
    
    
#clean up the text
def remove_chars(text):
    """
    Normalise a description to lowercase letters, apostrophes and underscores.

    Steps, in order:
      1. drop the ATT&CK deprecation notice (exact, case-sensitive match);
      2. lowercase the text, delete '<...>' tags and strip surrounding whitespace;
      3. replace every character that is not a letter, "'" or '_' with a space.

    text: string to process
    returns: the cleaned string (runs of spaces are not collapsed)
    """
    to_remove = "This technique has been deprecated. Please see ATT&CK's Initial Access and Execution tactics for replacement techniques."
    text = text.replace(to_remove, '')
    text = re.sub(r'<[^>]*>', '', text.lower()).strip()
    # Raw string avoids the invalid '\_' escape of the old pattern; the text is
    # already lowercase here, so no second .lower() is needed.
    return re.sub(r"[^a-zA-Z'_]", ' ', text)

def clean_text(text):
    """
    Run the full cleaning pipeline on one description.

    The steps are applied in this order: remove_citations, remove_urls,
    replace_dots, remove_chars. Returns the cleaned string.
    """
    for step in (remove_citations, remove_urls, replace_dots, remove_chars):
        text = step(text)
    return text


##****************MITRE Functions****************##

def get_all_software(src):
    """
    Return every 'malware' and 'tool' object from a STIX data source.

    src: object exposing query(filters), e.g. a stix2 FileSystemSource
    returns: list with the results of the 'malware' query followed by the
             results of the 'tool' query
    """
    # Imported here because `chain` is not among the notebook's visible
    # top-level imports; without it this function raises NameError.
    from itertools import chain

    filts = [
        [Filter('type', '=', 'malware')],
        [Filter('type', '=', 'tool')]
    ]
    return list(chain.from_iterable(
        src.query(f) for f in filts
    ))

def get_all_techniques(src):
    """Return the result of querying `src` for all objects of type 'attack-pattern'."""
    return src.query([Filter('type', '=', 'attack-pattern')])
    
def get_technique_by_name(src, name):
    """Return the result of querying `src` for 'attack-pattern' objects whose name equals `name`."""
    is_technique = Filter('type', '=', 'attack-pattern')
    has_name = Filter('name', '=', name)
    return src.query([is_technique, has_name])

def get_techniques_by_content(src, content):
    """
    Return the techniques whose description contains `content`.

    The comparison is case-insensitive (both sides are lowercased).
    Returns a list, in the order produced by get_all_techniques(src).
    """
    matches = []
    for technique in get_all_techniques(src):
        if content.lower() in technique.description.lower():
            matches.append(technique)
    return matches

def get_mitigations_by_technique(src, tech_stix_id):
    """
    Return the 'course-of-action' objects linked to a technique by a 'mitigates' relationship.

    src: data source exposing relationships(...) and query(...)
    tech_stix_id: STIX id of the technique, used as the relationship target
    """
    mitigates = src.relationships(tech_stix_id, 'mitigates', target_only=True)
    source_ids = [relation.source_ref for relation in mitigates]
    filters = [
        Filter('type', '=', 'course-of-action'),
        Filter('id', 'in', source_ids),
    ]
    return src.query(filters)

def get_group_by_technique(src, tech_stix_id):
    """
    Return the 'intrusion-set' objects linked to a technique by a 'uses' relationship.

    src: data source exposing relationships(...) and query(...)
    tech_stix_id: STIX id of the technique, used as the relationship target
    """
    uses = src.relationships(tech_stix_id, 'uses', target_only=True)
    source_ids = [relation.source_ref for relation in uses]
    filters = [
        Filter('type', '=', 'intrusion-set'),
        Filter('id', 'in', source_ids),
    ]
    return src.query(filters)

# Step 1: Extract Data

- Extract data from compressed files from NIST

## MITRE

In [3]:
from stix2 import FileSystemSource, Filter

# The same 'attack-pattern' filter is applied to both local ATT&CK directories.
filt = Filter('type', '=', 'attack-pattern')

# Enterprise ATT&CK techniques
fs = FileSystemSource('enterprise-attack')
techniques_ent = fs.query([filt])

# PRE-ATT&CK techniques (fs is left pointing at this source)
fs = FileSystemSource('pre-attack')
techniques_pre = fs.query([filt])

## NIST NVD

In [4]:
#read compressed files downloaded from NIST NVD 

mypath = 'nist_historical'
# Regular files only. '.DS_Store' is filtered out here rather than with
# list.remove(), which raised ValueError whenever the file was absent.
onlyfiles = [
    f for f in os.listdir(mypath)
    if f != '.DS_Store' and os.path.isfile(os.path.join(mypath, f))
]


# blobs maps each archive's file name to its parsed JSON content
blobs = {}

for fname in onlyfiles:
    # the with-block closes the archive; no explicit close() is needed
    with gzip.open(os.path.join(mypath, fname)) as f:
        blobs[fname] = json.load(f)

# Step 2: Dedupe + Clean Data

- Clean up ATT&CK data + add to dataframe
- Dedupe CVE entries to account for any repetitions within the NIST compressed folders + add to dataframe

## MITRE

In [5]:
# id -> (name, description); keying by id also collapses repeated objects
ent_dict = {obj['id']: (obj['name'], obj['description']) for obj in techniques_ent}
pre_dict = {obj['id']: (obj['name'], obj['description']) for obj in techniques_pre}

# list(...) so that DataFrame receives real sequences even where dict.keys()/values() are views
ent_df = pd.DataFrame({'attack_id': list(ent_dict.keys()), 'values': list(ent_dict.values())})
pre_df = pd.DataFrame({'attack_id': list(pre_dict.keys()), 'values': list(pre_dict.values())})

ent_df['type'] = 'ent_attack'
pre_df['type'] = 'pre_attack'

techniques_df = pd.concat([ent_df, pre_df], axis=0)
# Keep name/description as text. Encoding them to UTF-8 bytes made the
# str-pattern regexes in clean_text fail where bytes and str are distinct types,
# and mixed byte strings with the unicode CVE descriptions elsewhere.
techniques_df['attack_name'] = techniques_df['values'].apply(lambda x: x[0].strip())
techniques_df['attack_description'] = techniques_df['values'].apply(lambda x: x[1].strip())
techniques_df.dropna(inplace=True)
techniques_df['cleanText'] = techniques_df['attack_description'].apply(clean_text)

## NIST NVD

In [6]:
#cve_descs stores all of the cve descriptions from each JSON object
# keyed by CVE id, so an id that appears in several archives is kept once (last one wins)
cve_descs = {}

for key in blobs.keys():
    for report_meat in blobs[key]['CVE_Items']:
        desc, date, cve_id = extract(report_meat)
        cve_descs[cve_id] = (desc, date)

#add all the values to a 3rd dataframe
cve_df = pd.DataFrame({"cveID": list(cve_descs.keys()),
                       "cveDescription": [v[0] for v in cve_descs.values()],
                       "publishedTime": [v[1] for v in cve_descs.values()]})

# id -> raw description lookup across CVEs and ATT&CK techniques.
# list(zip(...)) keeps the concatenation valid where zip() returns an iterator.
all_zip = (list(zip(cve_df['cveID'], cve_df['cveDescription'])) +
           list(zip(techniques_df['attack_id'], techniques_df['attack_description'])))
final_dict = dict(all_zip)

# add the cleaned text to the dataframe
# replace_dots runs on the raw description first and clean_text applies it again
# after removing citations/URLs; both passes are kept so the resulting tokens are unchanged.
cve_df['cleanText'] = cve_df['cveDescription'].apply(replace_dots)
cve_df['cleanText'] = cve_df['cleanText'].apply(clean_text)

# Step 3: Create Named Tuples

Gensim requires NamedTuples for processing documents. NamedTuples have the added convenience of associating tags with documents so it's easy to join documents back to their associated metadata.

In [7]:
from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple

# this data object class suffices as a `TaggedDocument` (with `words` and `tags`) 
# plus adds other state helpful for our later evaluation/reporting
Document = namedtuple('TaggedDocument', 'words tags object_type')

# ATT&CK techniques come first, CVEs second; each document is tagged with its own id
alldocs = []
for object_type, ids, texts in (
    ('ATTACK', techniques_df['attack_id'], techniques_df['cleanText']),
    ('CVE', cve_df['cveID'], cve_df['cleanText']),
):
    for row_id, row in zip(ids, texts):
        words = gensim.utils.to_unicode(row).split()
        tags = [row_id]
        alldocs.append(Document(words, tags, object_type))

# Step 4: Set up Doc2Vec Training

We approximate the experiment of Le & Mikolov ["Distributed Representations of Sentences and Documents"](http://cs.stanford.edu/~quocle/paragraph_vector.pdf) with guidance from Mikolov's [example go.sh](https://groups.google.com/d/msg/word2vec-toolkit/Q49FIrNOQRo/J6KG8mUj45sJ):

`./word2vec -train ../alldata-id.txt -output vectors.txt -cbow 0 -size 100 -window 10 -negative 5 -hs 0 -sample 1e-4 -threads 40 -binary 0 -iter 20 -min-count 1 -sentence-vectors 1`

We vary the following parameter choices:
* 100-dimensional vectors, as the 400-d vectors of the paper take a lot of memory and, in our tests of this task, don't seem to offer much benefit
* Similarly, frequent word subsampling seems to decrease sentiment-prediction accuracy, so it's left out
* `cbow=0` means skip-gram which is equivalent to the paper's 'PV-DBOW' mode, matched in gensim with `dm=0`
* Added to that DBOW model are two DM models, one which averages context vectors (`dm_mean`) and one which concatenates them (`dm_concat`, resulting in a much larger, slower, more data-hungry model)
* `min_count=10` (used in the code below) saves quite a bit of model memory by discarding words that appear fewer than 10 times in the corpus; such rare words add little beyond what the unique-to-each-doc vectors already capture

In [8]:
# %%time
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

# hyper-parameters shared by all three models
shared_params = dict(vector_size=100, negative=5, hs=0, min_count=10, sample=0,
                     epochs=20, workers=cores)

simple_models = [
    # PV-DBOW plain
    Doc2Vec(dm=0, **shared_params),
    # PV-DM w/ default averaging; a higher starting alpha may improve CBOW/PV-DM modes
    Doc2Vec(dm=1, window=10, alpha=0.05, comment='alpha=0.05', **shared_params),
    # PV-DM w/ concatenation - big, slow, experimental mode
    # window=5 (both sides) approximates paper's apparent 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, window=5, **shared_params),
]

for model in simple_models:
    model.build_vocab(alldocs)
    print("%s vocabulary scanned & state initialized" % model)

# lookup of each model by its string representation, in list order
models_by_name = OrderedDict()
for model in simple_models:
    models_by_name[str(model)] = model

Doc2Vec(dbow,d100,n5,mc10,t8) vocabulary scanned & state initialized
Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc10,t8) vocabulary scanned & state initialized
Doc2Vec(dm/c,d100,n5,w5,mc10,t8) vocabulary scanned & state initialized


In [9]:
from random import shuffle

# shuffle a shallow copy so that alldocs itself keeps its original order
doc_list = list(alldocs)
shuffle(doc_list)

In [10]:
# Train every model in simple_models on the shuffled corpus (doc_list), using
# each model's own epoch count. %time is an IPython line magic that reports the
# CPU and wall time of the train() call. After the loop, `model` stays bound to
# the last model in the list.
for model in simple_models: 
    print("Training %s" % model)
    %time model.train(doc_list, total_examples=len(doc_list), epochs=model.epochs)

Training Doc2Vec(dbow,d100,n5,mc10,t8)
CPU times: user 2min 56s, sys: 37.4 s, total: 3min 34s
Wall time: 1min 50s
Training Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc10,t8)
CPU times: user 4min 7s, sys: 51.1 s, total: 4min 58s
Wall time: 2min 20s
Training Doc2Vec(dm/c,d100,n5,w5,mc10,t8)
CPU times: user 6min 50s, sys: 51 s, total: 7min 41s
Wall time: 2min 22s


In [11]:
# shallow copy: a separate list holding the same trained model objects
word_models = list(simple_models)

# Step 5: Run with it!

## How similar are 'similar' words?
 Play around with words by changing the variable 'word'

In [13]:
import random
from IPython.display import HTML

reference_model = word_models[0]
# draw random vocabulary words until one occurs more than 200 times
word = random.choice(reference_model.wv.index2word)
while reference_model.wv.vocab[word].count <= 200:
    word = random.choice(reference_model.wv.index2word)
# the random draw is overridden with a word from the relevant domain;
# comment out the next line to keep the random word instead
word = 'execute'

# one table cell per model: its 20 nearest words, one (word, score) pair per line
similars_per_model = []
for model in word_models:
    neighbours = model.wv.most_similar(word, topn=20)
    similars_per_model.append(str(neighbours).replace('), ','),<br>\n'))

header_cells = "</th><th>".join(str(model) for model in word_models)
body_cells = "</td><td>".join(similars_per_model)
similar_table = ("<table><tr><th>" + header_cells +
                 "</th></tr><tr><td>" + body_cells +
                 "</td></tr></table>")
print("most similar words for '%s' (%d occurences)" % (word, simple_models[0].wv.vocab[word].count))
HTML(similar_table)

most similar words for 'execute' (25862 occurences)


"Doc2Vec(dbow,d100,n5,mc10,t8)","Doc2Vec(""alpha=0.05"",dm/m,d100,n5,w10,mc10,t8)","Doc2Vec(dm/c,d100,n5,w5,mc10,t8)"
"[(u'mdm', 0.4347737431526184), (u'activematrix', 0.344107985496521), (u'affected', 0.3438035845756531), (u'uses', 0.31682759523391724), (u'caused', 0.3130490779876709), (u'awa', 0.31220555305480957), (u'libvpx', 0.3115987181663513), (u'gems', 0.31055089831352234), (u'algorithms', 0.3047991991043091), (u'_server', 0.3032001256942749), (u'xserver', 0.30311456322669983), (u'trigger', 0.30237576365470886), (u'qualified', 0.29969215393066406), (u'realplayer', 0.2984297275543213), (u'co', 0.2973718047142029), (u'share', 0.29577529430389404), (u'grails', 0.29440414905548096), (u'tries', 0.29240185022354126), (u'h_', 0.2895732522010803), (u'installers', 0.2866915762424469)]","[(u'run', 0.8072041273117065), (u'inject', 0.6828826665878296), (u'read', 0.6366456747055054), (u'execution', 0.6291384100914001), (u'overwrite', 0.5641766786575317), (u'executing', 0.535063624382019), (u'executed', 0.48166775703430176), (u'create', 0.48140668869018555), (u'perform', 0.43708884716033936), (u'write', 0.42084765434265137), (u'signing', 0.4109715521335602), (u'delete', 0.4091721475124359), (u'download', 0.40684735774993896), (u'load', 0.3937813937664032), (u'directories', 0.3863033652305603), (u'change', 0.3854135572910309), (u'e_g_', 0.37903088331222534), (u'submit', 0.37315118312835693), (u'embed', 0.37288904190063477), (u'send', 0.3692713975906372)]","[(u'run', 0.5840804576873779), (u'invoke', 0.5547510385513306), (u'inject', 0.5293723344802856), (u'accomplish', 0.48468339443206787), (u'constrain', 0.4820373058319092), (u'isolate', 0.4772493839263916), (u'executing', 0.47279685735702515), (u'siclock', 0.46621909737586975), (u'deserialize', 0.462011456489563), (u'collateral', 0.4559507369995117), (u'achieve', 0.4514831602573395), (u'evaluated', 0.4508994221687317), (u'hooked', 0.4488168954849243), (u'forensics', 0.44768086075782776), (u'ntdll_', 0.44571202993392944), (u'nnmrptconfig_exe', 0.4428020417690277), (u'matrimonial', 0.4416455030441284), (u'nlst', 0.4411974549293518), 
(u'forge', 0.44059404730796814), (u'replaced', 0.4352682828903198)]"


## Compare different models by printing closest documents
The randomizer picks any document at random and prints the documents closest to it. The next two blocks use PV-DM and PV-DBOW respectively

### PV-DM

In [14]:
import random
import numpy as np

# random document index below 397, re-run cell for more examples
# NOTE(review): 397 is presumably the number of ATT&CK documents at the head of alldocs — confirm
doc_id = np.random.randint(397)
model = simple_models[1]  # fixed choice: the second model in simple_models
sims = model.docvecs.most_similar(doc_id, topn=model.docvecs.count)  # get *all* similar documents
target_tag = alldocs[doc_id].tags[0]
print(u'TARGET (%s): «%s»\n' % (target_tag, final_dict[target_tag]))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
# first, middle and last entries of the similarity ranking
ranks = (('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1))
for label, index in ranks:
    hit = sims[index]
    print(u'%s %s: «%s»\n' % (label, hit, final_dict[hit[0]]))

TARGET (attack-pattern--0649fc36-72a0-40a0-a2f9-3fc7e3231ad6): «Callbacks are malware communications seeking instructions. An adversary will test their malware to ensure the appropriate instructions are conveyed and the callback software can be reached. (Citation: LeeBeaconing)»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc10,t8):

MOST (u'CVE-2018-7053', 0.6692964434623718): «An issue was discovered in Irssi before 1.0.7 and 1.1.x before 1.1.1. There is a use-after-free when SASL messages are received in an unexpected order.»

MEDIAN (u'CVE-2010-0019', 0.4216613471508026): «Microsoft Silverlight 3 before 3.0.50611.0 on Windows, and before 3.0.41130.0 on Mac OS X, does not properly handle pointers, which allows remote attackers to execute arbitrary code or cause a denial of service (memory corruption and framework outage) via a crafted web site, aka "Microsoft Silverlight Memory Corruption Vulnerability."»

LEAST (u'CVE-2017-12170', -0.17178192734718323): 

### PV-DBOW

In [15]:
  # pick random doc, re-run cell for more examples
## 7189, 41355, 29820
doc_id = 78030
# doc_id = np.random.randint(simple_models[1].docvecs.count)  # pick random doc, re-run cell for more examples
model = simple_models[0]  # fixed choice: the first model in simple_models
sims = model.docvecs.most_similar(doc_id, topn=model.docvecs.count)  # get *all* similar documents
target_tag = alldocs[doc_id].tags[0]
print(u'TARGET (%s): «%s»\n' % (target_tag, final_dict[target_tag]))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
# first, middle and last entries of the similarity ranking
ranks = (('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1))
for label, index in ranks:
    hit = sims[index]
    print(u'%s %s: «%s»\n' % (label, hit, final_dict[hit[0]]))
    

TARGET (CVE-2017-5161): «An issue was discovered in Sielco Sistemi Winlog Lite SCADA Software, versions prior to Version 3.02.01, and Winlog Pro SCADA Software, versions prior to Version 3.02.01. An uncontrolled search path element (DLL Hijacking) vulnerability has been identified. Exploitation of this vulnerability could give an attacker access to the system with the same level of privilege as the application that utilizes the malicious DLL.»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dbow,d100,n5,mc10,t8):

MOST (u'CVE-2018-14812', 0.9075496196746826): «An uncontrolled search path element (DLL Hijacking) vulnerability has been identified in Fuji Electric Energy Savings Estimator versions V.1.0.2.0 and prior. Exploitation of this vulnerability could give an attacker access to the system with the same level of privilege as the application that utilizes the malicious DLL.»

MEDIAN (u'CVE-2008-2834', 0.20582585036754608): «SQL injection vulnerability in projects.php in Scientific Image D

## Clusters
- Create 100 clusters
- Explore each by changing the cluster index in the second cell

In [16]:
from sklearn.cluster import KMeans

NUM_CLUSTERS = 100

# Uses whichever `model` the previously executed cell left bound.
# Position i in model.docvecs is paired with alldocs[i], so keys[i] is the tag of vector i.
n_docs = len(model.docvecs)
keys = [alldocs[doc_id][1][0] for doc_id in range(n_docs)]
docvecs = np.array([model.docvecs[doc_id] for doc_id in range(n_docs)])

kclusterer = KMeans(n_clusters=NUM_CLUSTERS, random_state=333, max_iter=100)
assigned_clusters = kclusterer.fit(docvecs)

#add to dictionary: cluster label -> list of document tags in that cluster
key_cluster = {}
for key, cluster in zip(keys, assigned_clusters.labels_):
    key_cluster.setdefault(cluster, []).append(key)

In [17]:
#change cluster number to see any random cluster
cluster = 5
# Show the first five documents of the cluster as "<id>  >>  <description>",
# each followed by a blank line. print(...) with one pre-formatted unicode
# string gives the same output under both the print statement and the print
# function; to_unicode decodes byte strings as UTF-8 so ids and descriptions
# can be combined safely.
for i in key_cluster[cluster][:5]:
    print(u"%s  >>  %s" % (gensim.utils.to_unicode(i), gensim.utils.to_unicode(final_dict[i])))
    print(u"")

CVE-2014-3219  >>  fish before 2.1.1 allows local users to write to arbitrary files via a symlink attack on (1) /tmp/fishd.log.%s, (2) /tmp/.pac-cache.$USER, (3) /tmp/.yum-cache.$USER, or (4) /tmp/.rpm-cache.$USER.

CVE-2012-0426  >>  Race condition in sap_suse_cluster_connector before 1.0.0-0.8.1 in SUSE Linux Enterprise for SAP Applications 11 SP2 allows local users to have an unspecified impact via vectors related to a tmp/ directory.

CVE-2012-0420  >>  zypp-refresh-wrapper in SUSE Zypper before 1.3.20 and 1.6.x before 1.6.166 allows local users to create files in arbitrary directories, or possibly have unspecified other impact, via a pathname in the ZYPP_LOCKFILE_ROOT environment variable.

CVE-2017-12414  >>  Format Factory 4.1.0 has a DLL Hijacking Vulnerability because an untrusted search path is used for msimg32.dll, WindowsCodecs.dll, and dwmapi.dll.

CVE-2017-11160  >>  Multiple untrusted search path vulnerabilities in installer in Synology Assistant before 6.1-15163 on Wind

## Find Similar ATT&CK Pattern or CVE

In [18]:
def find_similar(row_id, match_with, alldocs=alldocs, final_dict=final_dict, model=model):
    """
    Print the document of the requested kind that is closest to `row_id`.

    row_id: attack-pattern id or CVE id
    match_with: 1=CVE, 2=attack
    alldocs: documents in the order they were given to the model; the tag is read from doc[1][0]
    final_dict: id -> raw description, used for printing
    model: trained Doc2Vec model whose docvecs are queried

    The defaults for alldocs, final_dict and model are bound when this function
    is defined, i.e. to whatever those names held at that moment.

    Returns None; the result is printed.
    Raises KeyError for a match_with other than 1 or 2, ValueError when row_id is
    not a tag in alldocs, and IndexError when no document of the requested kind
    appears among the similar documents.
    """
    ind = {1:'CVE', 2:'attack-pattern'}
    wanted = ind[match_with]
    doc_id = [i for i, doc in enumerate(alldocs) if doc[1][0] == row_id]
    if not doc_id:
        # fail here with a clear message instead of passing an empty list on to the model
        raise ValueError("unknown document id: %r" % (row_id,))
    sims = model.docvecs.most_similar(doc_id, topn=model.docvecs.count)
    # ids are told apart by their first three characters ('CVE' vs 'att');
    # next() stops at the first (i.e. most similar) hit of the requested kind
    prefix = wanted[:3]
    sim_row_id = next((hit for hit in sims if hit[0][:3] == prefix), None)
    if sim_row_id is None:
        raise IndexError("no %s found among the documents similar to %s" % (wanted, row_id))
    # print("") emits a blank line under both the print statement and the print
    # function; a bare `print` does nothing where print is a function
    print("**You asked for a(n) %s closest to the %s which is described as:**" %(wanted, row_id))
    print("")
    print(final_dict[row_id])
    print("")
    print("**The closest %s to %s is %s and is described as:**"%(wanted, row_id, sim_row_id[0]))
    print("")
    print(final_dict[sim_row_id[0]])
    print("")
    print("**Their similarity score is: %0.4f**"%sim_row_id[1])

In [19]:
# Example: print the CVE (match_with=1) closest to this ATT&CK technique.
find_similar('attack-pattern--aa8bfbc9-78dc-41a4-a03b-7453e0fdccda', 1)

**You asked for a(n) CVE closest to the attack-pattern--aa8bfbc9-78dc-41a4-a03b-7453e0fdccda which is described as:**

macOS and OS X use a common method to look for required dynamic libraries (dylib) to load into a program based on search paths. Adversaries can take advantage of ambiguous paths to plant dylibs to gain privilege escalation or persistence.

A common method is to see what dylibs an application uses, then plant a malicious version with the same name higher up in the search path. This typically results in the dylib being in the same folder as the application itself. (Citation: Writing Bad Malware for OSX) (Citation: Malware Persistence on OS X)

If the program is configured to run at a higher privilege level than the current user, then when the dylib is loaded into the application, the dylib will also run at that elevated level. This can be used by adversaries as a privilege escalation technique.

**The closest CVE to attack-pattern--aa8bfbc9-78dc-41a4-a03b-7453e0fdccda is