# Dependent Libraries

In [1]:
import csv
import re
import pandas as pd
import spacy
import textacy
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from rank_bm25 import BM25Okapi

# Preprocess Input Data

### - Load Data and Remove Duplicates

In [2]:
df = pd.read_csv('./cve_data_description_only.csv')
df.dropna(inplace=True)

cve_arr = []

for cve in df['Description']:
    if '**' not in cve:
        cve_arr.append(cve)

#remove duplicates
cve_arr = list(set(cve_arr))

### - Using Regular Expressions to Replace IP Addresses, URLs, exe files, and CVE Report IDs

In [3]:
ip_regex = r'[0-9]+(?:\.[0-9]+){3}'
url_regex = r'(?P<url>https?://[^\s]+)'
exe_regex = r'\b\S*\.exe\b'
report_regex = r'CVE-\d{4}-\d{4,7}'

temp = cve_arr[:]
cve_arr_clean = []
ip_slot_map = []


for i in range(len(temp)):
    cve = temp[i]
    ip_arr = re.findall(ip_regex, cve)
    
    if len(ip_arr) > 0:
        ip_slot_map.append({'index': i, 'ip_data': ip_arr})
    
    for ip in ip_arr:
        cve = cve.replace(ip,'IP_ADDRESS_STRING')
    cve_arr_clean.append(cve)

#subsitute URLs
temp = cve_arr_clean[:]
cve_arr_clean = []
url_slot_map = []

for i in range(len(temp)):
    cve = temp[i]
    url_arr = re.findall(url_regex, cve)
    if len(url_arr) > 0:
        url_slot_map.append({'index': i, 'url_data': url_arr})
    
    for url in url_arr:
        cve = cve.replace(url,'URL_STRING')
    cve_arr_clean.append(cve)
    
#subsitute executable files
temp = cve_arr_clean[:]
cve_arr_clean = []
exe_slot_map = []

for i in range(len(temp)):
    cve = temp[i]
    exe_arr = re.findall(exe_regex, cve)
    if len(exe_arr) > 0:
        exe_slot_map.append({'index': i, 'exe_data': exe_arr})
    
    for exe in exe_arr:
        cve = cve.replace(exe,'EXE_STRING')
    cve_arr_clean.append(cve)
    
#subsitute CVE report labels
temp = cve_arr_clean[:]
cve_arr_clean = []
report_slot_map = []

for i in range(len(temp)):
    cve = temp[i]
    report_arr = re.findall(report_regex, cve)
    if len(report_arr) > 0:
        report_slot_map.append({'index': i, 'report_data': report_arr})
    
    for report in report_arr:
        cve = cve.replace(report,'REPORT_STRING')
    cve_arr_clean.append(cve)

### - Remove Stop Words, Numbers, and Punctuation

In [None]:
#I found more complete results without the additional preprocessing
'''
stop_words = set(stopwords.words('english')) 

temp = cve_arr_clean[:]
cve_arr_clean = []

for cve in temp:
    word_tokens = word_tokenize(cve) 
    filtered_sentence = [w for w in word_tokens if (not w in stop_words) and (w.isalpha())] 
    str_val = ' '
    str_val = str_val.join(filtered_sentence)
    cve_arr_clean.append(str_val)
'''

# Subject-Verb-Object Extraction

In [4]:
def generate_canindates(SVO_arr):
    canindates = []

    for tup in SVO_arr:

        #convert tuple to list of strings
        tup_as_list = []
        for val in tup:
            tup_as_list.append(str(val))

        #convert list of strings to single string
        list_as_str = " "
        list_as_str = list_as_str.join(tup_as_list)

        #create list of canindate threat actions for each cve
        canindates.append(list_as_str)

    return canindates

In [5]:
#Couldn't get Stanford Dependency Parser to work so I choose spaCy to extract the SVOs(Subject-Verb-Object)
nlp = spacy.load('en')

In [6]:
#List of CVEs
#Each CVEs has a list of SVOs strings for each sentence -> ['Sentence 1 SVO', 'Sentence 2 SVO', 'Sentence 3 SVO',....]
SVOs_for_each_CVE =  []

for clean_cve in cve_arr_clean:
    text = nlp(clean_cve)
    text_ext = textacy.extract.subject_verb_object_triples(text)
    SVO_arr = list(text_ext)
    SVOs_for_each_CVE.append(generate_canindates(SVO_arr))

# Calculate Similarity Score

In [7]:
def similarity_score(scorer,input_arr):
    score_arr = scorer.get_scores(input_arr).tolist()
    max_score = max(score_arr)
    max_idx = score_arr.index(max_score)
    max_val = threat_ontology[max_idx]
    return max(score_arr), max_val

In [8]:
#Use sliding window method with a size of 3
#Consider 3 sentences at a time
#Decide whether to group the sentences in the window or keep them seperate for each CVE

#Bag of Words representation of our cyber threat ontology 
threat_ontology = [
    ['phishing', 'spear', 'whale'],
    ['malware', 'ransomware', 'macro','virus','file', 'infectors', 'system', 'boot-record', 'polymorphic', 'sleath', 'trojan', 'horses', 'logic', 'bomb', 'worms', 'droppers', 'adware' , 'spyware'],
    ['web', 'cross', 'site', 'scripting'],
    ['ddos', 'TCP','SYN', 'flood', 'teardrop', 'smurf', 'ping', 'death', 'botnets'],
    ['mitm', 'session', 'hijacking', 'ip', 'spoofing', 'replay'],
    ['password', 'brute', 'force', 'dictionary'],
    ['eavesdropping', 'passive', 'active']
]

#train the ranking function on the bag of word representation of our cyber threat ontology
bm25 = BM25Okapi(threat_ontology)

#final output array
results = []

#sliding window
window_size = 3
for x in range(len(SVOs_for_each_CVE)):
    SVOs = SVOs_for_each_CVE[x]
    
    #iterate of the SVOs for each sentence in a single CVE
    #only consider grouping sentences of CVE that have at least 3 sentences
    if len(SVOs) >= window_size:
        i = 0
        last_idx = window_size - 1
        long_cve_result = []
        
        while i < (len(SVOs) - last_idx):
            current_window = SVOs[i:(i+window_size)]
            tokenized_current_window = [doc.split(" ") for doc in current_window]
            single_canindate = tokenized_current_window[0]
            multiple_canindates = tokenized_current_window[0] + tokenized_current_window[1] + tokenized_current_window[2]
            single_score, single_val = similarity_score(bm25, single_canindate)
            multi_score, multi_val = similarity_score(bm25, multiple_canindates)
            
            if single_score == 0 and multi_score == 0:
                long_cve_result.append([])
                i += 1
            else:
                if multi_score > single_score:
                    long_cve_result.append(multi_val)
                    i += 3
                else:
                    long_cve_result.append(single_val)
                    i += 1
        
        results.append(long_cve_result)
    elif len(SVOs) == 2:
        current_window = SVOs
        tokenized_current_window = [doc.split(" ") for doc in current_window]
        single_canindate = tokenized_current_window[0]
        multiple_canindates = tokenized_current_window[0] + tokenized_current_window[1]
        single_score, single_val = similarity_score(bm25, single_canindate)
        multi_score, multi_val = similarity_score(bm25, multiple_canindates)

        if single_score == 0 and multi_score == 0:
            results.append([])
        else:
            if multi_score > single_score:
                results.append(multi_val)
            else:
                results.append(single_val)
    elif len(SVOs) == 1:
        current_window = SVOs
        tokenized_current_window = [doc.split(" ") for doc in current_window]
        single_canindate = tokenized_current_window[0]
        single_score, single_val = similarity_score(bm25, single_canindate)
        if single_score == 0:
            results.append([])
        else:
            results.append(single_val)
    else:
        results.append([])

# Save Data

In [9]:
fields = ['Description', 'Ontology']
output_data = []
for a,b in zip(cve_arr, results):
    output_data.append([a,str(b)])

with open('cve_ontology.csv', 'w') as f: 
      
    write = csv.writer(f) 
    write.writerow(fields) 
    write.writerows(output_data)   

[] 

[] 

['web', 'cross', 'site', 'scripting'] 

['web', 'cross', 'site', 'scripting'] 

[] 

['password', 'brute', 'force', 'dictionary'] 

[] 

[] 

[] 

[] 

[] 

[] 

[] 

[[], []] 

[] 

[] 

[] 

[[]] 

[] 

[] 

[] 

[] 

[] 

[] 

[] 

[] 

[] 

[] 

[] 

[] 

[] 

[] 

[] 

[] 

[] 

[] 

[] 

[[], []] 

[] 

[] 

[] 

['web', 'cross', 'site', 'scripting'] 

[] 

[] 

[] 

['web', 'cross', 'site', 'scripting'] 

[] 

[] 

[] 

[] 

[] 

[] 

[['malware', 'ransomware', 'macro', 'virus', 'file', 'infectors', 'system', 'boot-record', 'polymorphic', 'sleath', 'trojan', 'horses', 'logic', 'bomb', 'worms', 'droppers', 'adware', 'spyware'], []] 

[] 

[] 

[[], []] 

[] 

[] 

[] 

[] 

[] 

[[], []] 

[] 

[] 

[] 

[] 

[] 

[] 

[] 

[] 

[] 

[] 

[] 

[[]] 

[] 

[] 

[] 

[] 

[] 

[] 

[] 

[] 

['web', 'cross', 'site', 'scripting'] 

[] 

[['malware', 'ransomware', 'macro', 'virus', 'file', 'infectors', 'system', 'boot-record', 'polymorphic', 'sleath', 'trojan', 'horses', 'l