<a href="https://colab.research.google.com/github/aiforsec22/IEEEEuroSP23/blob/main/notebooks/malware-similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Installing dependencies

In [None]:
# Fetch the LADDER repository; the following cell changes into its attack_pattern directory.
!git clone https://github.com/aiforsec/LADDER.git

In [None]:
# Work from LADDER/attack_pattern/ so the relative file names opened below resolve
# (presumably that is where the .txt inputs live -- confirm against the repository).
# The path is relative, so this cell is meant to run once per fresh kernel.
%cd LADDER/attack_pattern/

### Import modules

In [None]:
import numpy as np

from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

### Read all malware, threat actors, and triples

In [None]:
# Load the malware names, one per line.
# Empty lines are filtered out explicitly rather than slicing off the final
# element: with `[:-1]` the last name was silently lost whenever the file did
# not end with a newline.
with open('all_malware.txt', 'r', encoding='utf-8') as f:
    text = f.read()

malware = [line for line in text.split('\n') if line]

In [None]:
# Sanity check: how many malware names were loaded from all_malware.txt.
len(malware)

In [None]:
# Load the threat-actor names, one per line.
# Empty lines are filtered out explicitly rather than slicing off the final
# element: with `[:-1]` the last name was silently lost whenever the file did
# not end with a newline.
with open('all_threat_actors.txt', 'r', encoding='utf-8') as f:
    text = f.read()

actors = [line for line in text.split('\n') if line]

In [None]:
# Sanity check: how many threat-actor names were loaded from all_threat_actors.txt.
len(actors)

In [None]:
def read_triples(fname):
    """Parse a tab-separated triple file into a list of [e1, r, e2] lists.

    Parameters
    ----------
    fname : str
        Path of a UTF-8 text file with one triple per line; the three
        fields are separated by tab characters.

    Returns
    -------
    list of list of str
        One ``[e1, r, e2]`` entry per non-empty line, in file order.

    Raises
    ------
    ValueError
        If a non-empty line does not split into exactly three fields
        (raised by the tuple unpacking).
    """
    with open(fname, 'r', encoding='utf-8') as handle:
        rows = handle.read().split('\n')

    parsed = []
    for row in rows:
        # Empty lines (including the one after a trailing newline) are skipped.
        if not row:
            continue
        head, relation, tail = row.split('\t')
        parsed.append([head, relation, tail])
    return parsed

In [None]:
# Load the triples stored in 150_all.txt as [e1, r, e2] lists.
triples = read_triples('150_all.txt')

In [None]:
def get_malware_nodes(mal, triples):
    """Collect the (neighbour, relation) pairs of every triple that mentions `mal`.

    Parameters
    ----------
    mal : str
        Entity name to look up (it is compared against both ends of a triple).
    triples : iterable of (e1, r, e2)
        Triples to scan.

    Returns
    -------
    set of (str, str)
        ``(other_entity, relation)`` for each triple where `mal` is e1 or e2.
        When `mal` is e1 the other entity is e2; otherwise it is e1.
    """
    neighbours = set()
    for head, relation, tail in triples:
        if head != mal and tail != mal:
            continue
        # A head match takes precedence, so a self-loop yields (tail, relation).
        other = tail if head == mal else head
        neighbours.add((other, relation))
    return neighbours

In [None]:
def get_all_malware_nodes(triples):
    """Map each known malware name to its neighbour set.

    Iterates over the notebook-level ``malware`` list and looks each name up
    with ``get_malware_nodes``. Names with no neighbours in `triples` are
    left out of the result.

    Parameters
    ----------
    triples : iterable of (e1, r, e2)
        Triples to scan.

    Returns
    -------
    dict
        ``{malware_name: set of (neighbour, relation)}`` with non-empty sets only.
    """
    candidates = ((name, get_malware_nodes(name, triples)) for name in malware)
    return {name: found for name, found in candidates if found}

In [None]:
def get_all_actor_nodes(triples):
    """Map each known threat actor to an extended neighbour set.

    For every name in the notebook-level ``actors`` list, the direct
    (neighbour, relation) pairs are collected with ``get_malware_nodes``.
    For each direct neighbour linked by a 'hasAuthor' relation (presumably
    malware attributed to the actor -- confirm against the data), that
    neighbour's own pairs are merged in when their relation is one of
    'targets', 'uses', 'exploits', 'indicates', 'isA' or 'variantOf'.
    Actors whose resulting set is empty are left out.

    Parameters
    ----------
    triples : iterable of (e1, r, e2)
        Triples to scan.

    Returns
    -------
    dict
        ``{actor_name: set of (neighbour, relation)}`` with non-empty sets only.
    """
    inherited_relations = ('targets', 'uses', 'exploits', 'indicates', 'isA', 'variantOf')

    profiles = {}
    for actor in actors:
        profile = get_malware_nodes(actor, triples)
        # Iterate over a snapshot: `profile` grows while second-hop pairs are merged in.
        for neighbour, relation in tuple(profile):
            if relation != 'hasAuthor':
                continue
            for entry in get_malware_nodes(neighbour, triples):
                if entry[1] in inherited_relations:
                    profile.add(entry)
        if profile:
            profiles[actor] = profile
    return profiles

In [None]:
# Build the malware -> neighbour-set mapping from the triples currently held in `triples`.
malware_nodes = get_all_malware_nodes(triples)

In [None]:
def get_distance(node1, node2, type='jaccard'):
    """Return a set-based distance between two neighbour sets.

    Parameters
    ----------
    node1, node2 : set
        The sets to compare.
    type : str, default 'jaccard'
        Which metric to use (the name shadows the builtin, but is kept
        because callers may pass it by keyword):

        * 'intersect' -- ``1000 - |A & B|``
        * 'jaccard'   -- ``1 - |A & B| / |A | B|``
        * 'overlap'   -- ``1 - |A & B| / min(|A|, |B|)``

    Returns
    -------
    int or float
        The distance; smaller means more similar.

    Raises
    ------
    ValueError
        If `type` is not one of the supported metric names. Previously the
        function fell through and returned None, which only surfaced later
        as a confusing error when the distances were sorted.

    Notes
    -----
    Degenerate inputs no longer raise ZeroDivisionError. Two empty sets are
    treated as identical (distance 0.0). For 'overlap', one empty and one
    non-empty set are treated as maximally distant (1.0).
    """
    shared = len(node1.intersection(node2))

    if type == 'intersect':
        return 1000 - shared

    if type == 'jaccard':
        total = len(node1.union(node2))
        if total == 0:
            # Both sets are empty.
            return 0.0
        return 1 - shared / total

    if type == 'overlap':
        smaller = min(len(node1), len(node2))
        if smaller == 0:
            # At least one set is empty; only two empty sets count as identical.
            return 0.0 if len(node1) == len(node2) else 1.0
        return 1 - shared / smaller

    raise ValueError(
        "unknown distance type {!r}; expected 'intersect', 'jaccard' or 'overlap'".format(type)
    )

In [None]:
def find_most_similar_malware(mal, triple_fname, top_k=5):
    """Return the malware closest to `mal` by Jaccard distance over neighbour sets.

    Parameters
    ----------
    mal : str
        Name of the query malware.
    triple_fname : str
        Path of the tab-separated triple file to build neighbour sets from.
    top_k : int, default 5
        Maximum number of results to return.

    Returns
    -------
    list of [str, float]
        ``[name, distance]`` pairs sorted by ascending distance. The query
        itself is never included.

    Raises
    ------
    KeyError
        If `mal` has no neighbour set in the given triple file.
    """
    triples = read_triples(triple_fname)
    malware_nodes = get_all_malware_nodes(triples)

    if mal not in malware_nodes:
        raise KeyError(
            '{!r} has no neighbours in {!r} (or is not a known malware name)'.format(mal, triple_fname)
        )
    target_nodes = malware_nodes[mal]

    # Exclude the query by name instead of dropping the first sorted entry.
    # The sort is stable, so another malware with an identical neighbour set
    # (distance 0) that is listed earlier would sort ahead of the query; the
    # old `dist[1:6]` then discarded that match and returned the query itself.
    dist = [
        [name, get_distance(target_nodes, nodes, 'jaccard')]
        for name, nodes in malware_nodes.items()
        if name != mal
    ]
    dist.sort(key=lambda pair: pair[1])

    return dist[:top_k]

### Find the malware most similar to FluBot

In [None]:
# Nearest malware to FluBot by Jaccard distance, using the triples in 12k_all.txt.
find_most_similar_malware('FluBot', '12k_all.txt')

In [None]:
# should print the following
# [['TeaBot', 0.7906976744186046],
#  ['Medusa', 0.8064516129032258],
#  ['Gustuff', 0.8115942028985508],
#  ['Ghimob', 0.823943661971831],
#  ['Faketoken', 0.8260869565217391]]

In [None]:
# Rebind `triples` and `malware_nodes` to the 12k_all.txt data so the cell below
# can inspect individual neighbour sets (this replaces the values loaded from 150_all.txt).
triples = read_triples('12k_all.txt')
malware_nodes = get_all_malware_nodes(triples)

In [None]:
# Neighbour sets of the two malware being compared.
mal_node_i = malware_nodes['FluBot']
mal_node_j = malware_nodes['TeaBot']

# Show the (entity, relation) pairs the two have in common.
print(mal_node_i & mal_node_j)

In [None]:
def find_most_similar_threat_actor(act, triple_fname, top_k=5):
    """Return the threat actors closest to `act` by Jaccard distance over neighbour sets.

    Parameters
    ----------
    act : str
        Name of the query threat actor.
    triple_fname : str
        Path of the tab-separated triple file to build neighbour sets from.
    top_k : int, default 5
        Maximum number of results to return.

    Returns
    -------
    list of [str, float]
        ``[name, distance]`` pairs sorted by ascending distance. The query
        itself is never included.

    Raises
    ------
    KeyError
        If `act` has no neighbour set in the given triple file.
    """
    triples = read_triples(triple_fname)
    actor_nodes = get_all_actor_nodes(triples)

    if act not in actor_nodes:
        raise KeyError(
            '{!r} has no neighbours in {!r} (or is not a known threat actor name)'.format(act, triple_fname)
        )
    target_nodes = actor_nodes[act]

    # Exclude the query by name instead of dropping the first sorted entry.
    # The sort is stable, so another actor with an identical neighbour set
    # (distance 0) that is listed earlier would sort ahead of the query; the
    # old `dist[1:6]` then discarded that match and returned the query itself.
    dist = [
        [name, get_distance(target_nodes, nodes, 'jaccard')]
        for name, nodes in actor_nodes.items()
        if name != act
    ]
    dist.sort(key=lambda pair: pair[1])

    return dist[:top_k]

### Find the most similar threat actor to APT15

In [None]:
# Nearest threat actors to APT15 by Jaccard distance, using the triples in 12k_all.txt.
find_most_similar_threat_actor('APT15', '12k_all.txt')

In [None]:
# should print the following
# [['GREF', 0.5333333333333333],
#  ['Boyusec', 0.574468085106383],
#  ['Ke3chang', 0.5833333333333333],
#  ['APT-C-50', 0.8163265306122449],
#  ['Kitten', 0.8333333333333334]]

In [None]:
# Build the actor -> neighbour-set mapping from 12k_all.txt so the cell below
# can inspect individual neighbour sets.
triples = read_triples('12k_all.txt')
actor_nodes = get_all_actor_nodes(triples)

In [None]:
# Neighbour sets of the two threat actors being compared.
# NOTE: despite the `mal_` prefix, these variables hold actor neighbour sets here.
mal_node_i = actor_nodes['APT15']
mal_node_j = actor_nodes['Boyusec']

# Show the (entity, relation) pairs the two have in common.
print(mal_node_i & mal_node_j)