In [2]:
import os
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import numpy as np
from scipy import sparse
from sklearn import svm
from sklearn.neighbors import NearestNeighbors
import util

TRAIN_DIR = "train"

call_set = set([])

In [3]:
def add_to_set(tree):
    for el in tree.iter():
        call = el.tag
        call_set.add(call)

In [4]:
def create_data_matrix(start_index, end_index, direc="train"):
    X = None
    classes = []
    ids = [] 
    i = -1
    for datafile in os.listdir(direc):
        if datafile == '.DS_Store':
            continue

        i += 1
        if i < start_index:
            continue 
        if i >= end_index:
            break

        # extract id and true class (if available) from filename
        id_str, clazz = datafile.split('.')[:2]
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))

        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an "X" label
            assert clazz == "X"
            classes.append(-1)

        # parse file as an xml document
        tree = ET.parse(os.path.join(direc,datafile))
        add_to_set(tree)
        this_row = call_feats(tree)
        if X is None:
            X = this_row 
        else:
            X = np.vstack((X, this_row))

    return X, np.array(classes), ids


In [5]:
def call_feats(tree):
    good_calls = [
    'processes', 'query_value', 'read_section', 'read_section_names', 'read_value', 
        'recv_socket', 'remove_directory', 'revert_to_self', 'send_socket', 
        'set_file_attributes', 'set_file_time', 'set_system_time', 'set_thread_context', 
        'set_value', 'set_windows_hook', 'show_window', 'sleep', 'start_service', 'thread',
        'trimmed_bytes', 'unload_driver', 'vm_allocate', 'vm_mapviewofsection', 'vm_protect',
        'vm_read', 'vm_write', 'write_value']

    call_counter = {}
    for el in tree.iter():
        call = el.tag
        if call not in call_counter:
            call_counter[call] = 1
        else:
            call_counter[call] += 1
            
    call_feat_array = np.zeros(len(good_calls))
    for i in range(len(good_calls)):
        call = good_calls[i]
        call_feat_array[i] = 0
        if call in call_counter:
            call_feat_array[i] = call_counter[call]

    return call_feat_array




In [6]:
def calculate_score(prediction, actual):
    return sum([1 if prediction[i] == actual[i] else 0 for i in range(len(prediction))]) / float(len(prediction))

In [7]:
X_train, t_train, train_ids = create_data_matrix(0, 2777, TRAIN_DIR)
X_valid, t_valid, valid_ids = create_data_matrix(2777, 3086, TRAIN_DIR)

print 'Data matrix (training set):'
print X_train
print 'Classes (training set):'
print t_train

Data matrix (training set):
[[  1.00000000e+00   2.42000000e+02   0.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  1.00000000e+00   2.00400000e+03   1.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   3.00000000e+00]
 [  1.00000000e+00   7.00000000e+00   0.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 ..., 
 [  1.00000000e+00   5.34000000e+02   0.00000000e+00 ...,   0.00000000e+00
    5.00000000e+00   0.00000000e+00]
 [  1.00000000e+00   2.60000000e+02   0.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  1.00000000e+00   2.42000000e+02   0.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]]
Classes (training set):
[ 8  6 12 ..., 10  8  8]


In [8]:
# Try SVM
clf = svm.SVC()
clf.fit(X_train, t_train)
predictions = clf.predict(X_valid)
calculate_score(predictions, t_valid)

0.7993527508090615

In [9]:
# Try Nearest Neighbor
nn = NearestNeighbors(n_neighbors=1)
nn.fit(X_train, t_train)
nearest_neighbors = nn.kneighbors(X_valid, return_distance=False)
predictions = [t_train[nearest_neighbors[0]] for nearest_neighbor in nearest_neighbors]
calculate_score(predictions, t_valid)

0.18446601941747573

In [11]:
# Run this to see what call_feats returns
direc = "train"
for idx, datafile in enumerate(os.listdir(direc)):
    if idx == 1:
        print datafile
        tree = ET.parse(os.path.join(direc,datafile))
        feats = call_feats(tree)
        
#         for el in tree.iter():
#             print el.tag
        print feats

00278ec420236020d6121dffe0cc20034422e7228.Lipler.xml
[  1.00000000e+00   2.00400000e+03   1.00000000e+00   0.00000000e+00
   5.10000000e+01   2.11000000e+02   1.00000000e+00   0.00000000e+00
   1.70000000e+01   2.00000000e+01   1.80000000e+01   0.00000000e+00
   0.00000000e+00   7.00000000e+01   2.80000000e+01   4.00000000e+00
   2.54000000e+02   0.00000000e+00   4.10000000e+01   2.05000000e+02
   0.00000000e+00   0.00000000e+00   0.00000000e+00   3.65000000e+02
   0.00000000e+00   0.00000000e+00   3.00000000e+00]
