In [13]:
import sys
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

In [14]:
def ld_data (dir_name):
    """Load every function-listing file in ``dir_name`` into one DataFrame.

    Each regular file in the directory becomes one row. For every line of the
    file, the second space-separated token is taken and its first two
    characters and last character are dropped (NOTE(review): presumably this
    strips a ``b'...'`` wrapper around the name -- confirm against the data
    files). Names containing ``@`` or ``?`` are discarded, as are lines that
    have no second token.

    Parameters
    ----------
    dir_name : str
        Directory containing one text file per sample.

    Returns
    -------
    pandas.DataFrame
        Columns ``filename``, ``label`` and ``funcs``. ``label`` is
        ``'MALWARE'`` when the file name starts with ``'R-'`` and
        ``'GOODWARE'`` otherwise; ``funcs`` is the space-joined list of
        retained names. Rows are ordered by file name. The columns are
        present even when the directory yields no rows.
    """
    records = []
    # Sort so the row order does not depend on the filesystem's listing order.
    for filename in sorted(os.listdir(dir_name)):
        path = os.path.join(dir_name, filename)
        if not os.path.isfile(path):
            # Sub-directories (e.g. .ipynb_checkpoints) cannot be opened as samples.
            continue

        funcs = []
        with open(path, 'r') as f:
            for line in f.read().splitlines():
                tokens = line.strip().split(' ')
                if len(tokens) < 2:
                    # Blank or single-token line: there is no name to extract.
                    continue
                name = tokens[1][2:-1]
                if not name or '@' in name or '?' in name:
                    continue
                funcs.append(name)

        label = 'MALWARE' if filename.startswith('R-') else 'GOODWARE'
        records.append({'filename': filename, 'label': label, 'funcs': ' '.join(funcs)})

    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(records, columns=['filename', 'label', 'funcs'])


In [15]:
# Directories passed to ld_data for the training set and the test set.
train_dt_dir_name = '../data/executable-functions/'
test_dt_dir_name = '../data/executable-functions-test/'
# Alternative test directory, currently disabled.
# NOTE(review): this is an absolute path, presumably machine-specific -- confirm before enabling.
# test_dt_dir_name = '/archive/files/imports/'

In [16]:
# Load the labelled training and test frames from their directories, in that order.
train, test = (ld_data(directory) for directory in (train_dt_dir_name, test_dt_dir_name))

In [17]:
# Split the test frame by label; each subset is scored on its own later.
test_malware = test[test['label'] == 'MALWARE']
test_goodware = test[test['label'] == 'GOODWARE']

# Report total, malware and goodware sample counts, one per line.
print(len(test), len(test_malware), len(test_goodware), sep='\n')

97
53
44


In [18]:
# Encode every sample as a binary presence vector over function names.
# binary=True clamps term counts to 1, and disabling idf weighting and
# normalisation makes the output exactly 0/1, so no rounding of tf-idf
# weights is needed afterwards.
vectorizer = TfidfVectorizer(binary=True, use_idf=False, norm=None)
train_vectors = vectorizer.fit_transform(train['funcs'])
# The test subsets reuse the vocabulary fitted on the training set.
test_g_vectors = vectorizer.transform(test_goodware['funcs'])
test_m_vectors = vectorizer.transform(test_malware['funcs'])

print(vectorizer.get_feature_names_out())
print(train['filename'][0])
# Densify only the first row instead of the whole sparse training matrix.
print(train_vectors[0].toarray()[0])

['___lc_codepage_func' '___lc_collate_cp_func' '___lc_handle_func' ...
 'zwupdatewnfstatedata' 'zwwaitforsingleobject' 'zwwritefile']
R-LordEK.sample
[0. 0. 0. ... 0. 0. 0.]


In [26]:
# Train the classifier
# A fixed random_state makes the fitted tree reproducible: scikit-learn
# permutes the candidate features at every split, so without a seed the tree
# (and any node or feature index read from it) can change between runs.
classifier = DecisionTreeClassifier(random_state=42)

classifier.fit(train_vectors, train['label'])
# Optional: render the fitted tree to an SVG file.
# plot_tree(classifier)
# plt.savefig('decision_tree.svg')

In [27]:
# Predict labels for the goodware and malware test subsets separately.
pred_g, pred_m = map(classifier.predict, (test_g_vectors, test_m_vectors))

# Report the accuracy obtained on each subset.
print('Accuracy goodwares: {}'.format(accuracy_score(y_true=test_goodware['label'], y_pred=pred_g)))
print('Accuracy malwares: {}'.format(accuracy_score(y_true=test_malware['label'], y_pred=pred_m)))

Accuracy goodwares: 1.0
Accuracy malwares: 1.0


In [28]:
# Print the vocabulary term stored at a hard-coded column index.
# NOTE(review): 7333 is a magic number -- presumably a feature index read off
# the fitted tree; confirm. It is only meaningful for the vocabulary fitted
# above and may point at a different term if the training data changes.
print(vectorizer.get_feature_names_out()[7333])

rtlcapturecontext


In [30]:
# Collect the split test (feature name, threshold) of every internal node of
# the fitted tree, in the order the nodes are stored in classifier.tree_.
# This lists the individual tests only; it does not reconstruct the
# root-to-leaf paths or say which class each test leads to.
TREE_UNDEFINED = -2  # value stored in tree_.feature for nodes that do not split (leaves)
feature_names = vectorizer.get_feature_names_out()  # built once, not once per node
fitted_tree = classifier.tree_
rules = [
    (feature_names[feature_idx], threshold)
    for feature_idx, threshold in zip(fitted_tree.feature, fitted_tree.threshold)
    if feature_idx != TREE_UNDEFINED
]

print(rules)


[('rtllookupfunctionentry', 0.5), ('__c_specific_handler', 0.5), ('zwclose', 0.5), ('mmmapiospaceex', 0.5), ('disablethreadlibrarycalls', 0.5), ('widechartomultibyte', 0.5), ('mbstowcs', 0.5), ('rtlunwind', 0.5), ('__initenv', 0.5), ('getipnettable', 0.5), ('__std_exception_copy', 0.5), ('send', 0.5), ('tracemessage', 0.5), ('lockresource', 0.5), ('closeservicehandle', 0.5)]
