In [1]:
import os
import pandas as pd
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import numpy as np
from scipy import sparse
import util

In [2]:
# Write a function that takes an XML root and returns True/False,
# indicating whether or not the sample belongs to a given malware class.
def check_swizzor(root):
    """Return True when the XML tree shows the process pattern checked for swizzor.

    The ``filename`` attributes of ``root``'s direct children are collected in
    document order (children without that attribute are ignored).  The sample
    is flagged when some collected filename containing ``"iexplore.exe"`` is
    immediately followed by one containing ``"services.exe"`` or
    ``"svchost.exe"``.

    Parameters
    ----------
    root : iterable of elements
        Typically the root ``Element`` of a parsed XML file; each child is
        expected to expose an ``attrib`` mapping.

    Returns
    -------
    bool
        True if the adjacent-pair pattern is found, False otherwise
        (including when fewer than two filenames are present).
    """
    # Children lacking an ``attrib`` mapping (e.g. plain strings) are skipped,
    # which keeps the tolerance of the original type check.
    filenames = [
        child.attrib['filename']
        for child in root
        if 'filename' in getattr(child, 'attrib', ())
    ]

    # Walk consecutive (current, next) pairs of collected filenames.
    for first, second in zip(filenames, filenames[1:]):
        if "iexplore.exe" in first and ("services.exe" in second or "svchost.exe" in second):
            return True

    return False

In [3]:
# Just run this; you don't need to modify anything.
# This function applies the checking function to part
# or all of the training data (the files in the given directory).
def check_all(start_index, end_index, direc, func):
    """Apply ``func`` to the parsed XML root of a window of files in ``direc``.

    Files are visited in ``os.listdir`` order; ``.DS_Store`` is skipped and
    does not count towards the position.  Only files whose position ``i``
    satisfies ``start_index <= i < end_index`` are processed.  Each filename
    is split on ``'.'``: the first piece is the sample id and the second is
    its class label.

    Parameters
    ----------
    start_index : int
        Position of the first file to process (inclusive).
    end_index : int
        Position at which to stop (exclusive).
    direc : str
        Directory containing the XML files.
    func : callable
        Called with the root element of each parsed file; its return value
        becomes one row of ``X``.

    Returns
    -------
    X : numpy.ndarray or None
        The per-file results stacked with ``np.vstack`` (one row per file),
        or None when no file fell inside the window.
    classes : numpy.ndarray
        For each processed file, the index of its label in
        ``util.malware_classes``, or -1 when the label is ``"X"``.
    ids : list of str
        The id piece of each processed filename, in processing order.

    Raises
    ------
    AssertionError
        If a label is neither in ``util.malware_classes`` nor ``"X"``.
    """
    rows = []
    classes = []
    ids = []
    # NOTE: os.listdir order is OS-dependent, so which files land in the
    # [start_index, end_index) window is only stable for a given directory
    # on a given machine.
    position = -1
    for datafile in os.listdir(direc):
        if datafile == '.DS_Store':
            continue

        position += 1
        if position < start_index:
            continue
        if position >= end_index:
            break

        # extract id and true class (if available) from filename
        id_str, clazz = datafile.split('.')[:2]
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))
        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an "X" label
            assert clazz == "X"
            classes.append(-1)

        root = ET.parse(os.path.join(direc, datafile)).getroot()
        rows.append(func(root))

    # Stack once at the end instead of re-copying the whole matrix for every
    # file; this also gives a 2-D result even when only one file was processed.
    X = np.vstack(rows) if rows else None

    return X, np.array(classes), ids

In [9]:
# Checks the accuracy of your technique!
# Substitute the correct class integer and your checking function
# See below for an example
def accuracyRate(func, correctClass=0, n=100, direc='train'):
    """Print how well a True/False checker identifies one malware class.

    Runs ``func`` over the first ``n`` files of ``direc`` through
    ``check_all`` and compares each verdict with whether the file's true
    class equals ``correctClass``.

    Parameters
    ----------
    func : callable
        Checker taking an XML root and returning a truthy/falsy verdict.
    correctClass : int
        Class integer the checker is supposed to detect.
    n : int
        Maximum number of files to evaluate.
    direc : str
        Directory holding the labelled files (defaults to ``'train'``).

    Prints
    ------
    Three lines followed by a blank line.  Each figure is a count divided by
    the number of samples actually evaluated: samples flagged but not of the
    class, samples of the class but not flagged, and samples judged correctly.

    Raises
    ------
    ValueError
        If ``check_all`` processed no files.
    """
    X_train, y_train, ids = check_all(0, n, direc, func)
    if X_train is None:
        raise ValueError("no files were evaluated in %r" % (direc,))

    # The verdict is the first column of each row.  atleast_2d also copes with
    # a bare (unstacked) result for a single processed file.
    pred = np.atleast_2d(X_train)[:, 0].astype(bool)
    actual = np.asarray(y_train) == correctClass

    # Divide by the number of samples really evaluated; the directory may hold
    # fewer than ``n`` files, in which case dividing by ``n`` understates rates.
    total = float(len(pred))
    false_pos = int(np.count_nonzero(~actual & pred))
    false_neg = int(np.count_nonzero(actual & ~pred))
    correct = int(np.count_nonzero(actual == pred))

    # Single-argument print(...) emits the same text under Python 2 and 3.
    print("False Positives Rate: " + str(false_pos / total))
    print("False Negative Rate: " + str(false_neg / total))
    print("Correct ID Rate: " + str(correct / total))
    print("")

In [10]:
%%time
# Checks the accuracy of the function check_swizzor over the training data.
# correctClass=10: swizzor's class number is #10 (labels are compared as
# indices into util.malware_classes).
# n=3086: there are 3086 entries in the training matrix, so this evaluates all of them.
accuracyRate(func=check_swizzor, correctClass=10, n=3086)

False Positives Rate: 0.00810110174984
False Negative Rate: 0.00324044069994
Correct ID Rate: 0.98865845755
CPU times: user 20.9 s, sys: 1.49 s, total: 22.4 s
Wall time: 26.2 s
