In [97]:
import os
import pandas as pd
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import numpy as np
from scipy import sparse
import util
from matplotlib import pyplot as plt
%matplotlib inline

In [40]:
def viewProcess(datafile, direc='train'):
    """Print the tag of every element in one XML report, in document order.

    Parameters
    ----------
    datafile : str
        File name of the report inside ``direc``.
    direc : str, optional
        Directory that holds the report. Defaults to ``'train'``, the value
        that was previously hard-coded in the body.
    """
    tree = ET.parse(os.path.join(direc, datafile))
    for el in tree.iter():
        # Single-argument print(...) behaves identically on Python 2 and 3.
        print(el.tag)

In [41]:
# Lets you check the process file names recorded in one report.
# Pass the FULL file name, including the ".class.xml" suffix.
def viewProcessNames(datafile, direc='train'):
    """Print the ``filename`` attribute of each top-level element that has one.

    Parameters
    ----------
    datafile : str
        File name of the report inside ``direc``.
    direc : str, optional
        Directory that holds the report. Defaults to ``'train'``, the value
        that was previously hard-coded in the body.
    """
    root = ET.parse(os.path.join(direc, datafile)).getroot()

    for child in root:
        # Iterating an Element never yields a str, so the former
        # ``type(child) is not str`` guard was always true and is dropped.
        if 'filename' in child.attrib:
            # Single-argument print(...) behaves identically on Python 2 and 3.
            print(child.attrib['filename'])

In [None]:
# Counts the processes recorded in a parsed XML report.
def n_Processes(tree):
    """Return how many top-level elements carry a ``filename`` attribute.

    Parameters
    ----------
    tree : ElementTree
        Parsed report; only ``tree.getroot()`` is used.

    Returns
    -------
    int
        Number of direct children of the root that have a ``filename``
        attribute.
    """
    # Iterating an Element never yields a str, so no type guard is needed,
    # and the names themselves are not required just to count them.
    return sum(1 for child in tree.getroot() if 'filename' in child.attrib)

In [45]:
# Checking function: takes a parsed XML tree and returns True/False.
def check_swizzor(tree):
    """Return True when an iexplore.exe process is directly followed by
    services.exe or svchost.exe.

    The process names are the ``filename`` attributes of the root's direct
    children, in document order.

    Parameters
    ----------
    tree : ElementTree
        Parsed report; only ``tree.getroot()`` is used.

    Returns
    -------
    bool
        True if any adjacent pair of process names matches the pattern,
        otherwise False (including when fewer than two names exist).
    """
    process = [child.attrib['filename']
               for child in tree.getroot()
               if 'filename' in child.attrib]

    # Walk adjacent (current, next) pairs; zip stops one short of the end.
    for first, second in zip(process, process[1:]):
        if "iexplore.exe" in first and ("services.exe" in second or "svchost.exe" in second):
            return True

    return False

In [46]:
# Just run this, you don't need to modify anything.
# Applies a checking function to part or all of a directory of XML reports.
def check_all(start_index, end_index, direc, func):
    """Run ``func`` on the parsed tree of each selected report in ``direc``.

    Files are visited in sorted name order (``.DS_Store`` is ignored) and
    numbered from 0; files with ``start_index <= i < end_index`` are used.

    Parameters
    ----------
    start_index : int
        Index of the first file to process.
    end_index : int
        Index one past the last file to process.
    direc : str
        Directory containing files named ``<id>.<class>.<...>``.
    func : callable
        Called as ``func(tree)`` with the parsed ElementTree of each file.

    Returns
    -------
    X : numpy.ndarray or None
        The results of ``func`` stacked with ``np.vstack`` (one stacked entry
        per file, also when only one file is selected); ``None`` when no
        file was selected.
    classes : numpy.ndarray
        Index of each file's class in ``util.malware_classes``, or -1 for
        files labelled ``"X"``.
    ids : list of str
        The id part of each processed file name.

    Raises
    ------
    AssertionError
        If a class label is neither in ``util.malware_classes`` nor ``"X"``.
    """
    # os.listdir returns names in arbitrary order; sorting makes the index
    # window select the same files on every machine and every run.
    datafiles = sorted(name for name in os.listdir(direc) if name != '.DS_Store')

    rows = []
    classes = []
    ids = []
    for i, datafile in enumerate(datafiles):
        if i < start_index:
            continue
        if i >= end_index:
            break

        # extract id and true class (if available) from filename
        id_str, clazz = datafile.split('.')[:2]
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))
        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an "X" label
            assert clazz == "X"
            classes.append(-1)

        tree = ET.parse(os.path.join(direc, datafile))
        rows.append(func(tree))

    # Stack once at the end instead of re-copying the growing matrix on every
    # iteration; this also gives a single selected file the same stacked
    # layout as the multi-file case.
    X = np.vstack(rows) if rows else None

    return X, np.array(classes), ids

In [36]:
# Checks the accuracy of your technique!
# Substitute the correct class integer and your checking function.
def accuracyRate(func, correctClass=0, n=100):
    """Score a True/False checking function against one class label.

    Runs ``check_all(0, n, 'train', func)``, takes the first column of the
    result as the prediction, and compares it with ``class == correctClass``.
    Prints the false-positive, false-negative and correct rates as fractions
    of the files actually scored.

    Parameters
    ----------
    func : callable
        Checking function passed through to ``check_all``.
    correctClass : int, optional
        Class index that ``func`` is meant to detect.
    n : int, optional
        Upper bound on the number of files to score.

    Returns
    -------
    pandas.DataFrame
        One row per scored file with columns ``id``, ``pred``, ``actual``,
        ``realClass`` and ``correct``.

    Raises
    ------
    ValueError
        If ``check_all`` selected no files.
    """
    X_train, y_train, ids = check_all(0, n, 'train', func)
    if X_train is None:
        raise ValueError("no data files were found in 'train' to score")

    # First column holds the prediction. atleast_2d also accepts a lone,
    # unstacked result (e.g. a bare bool when only one file was scored).
    pred = np.atleast_2d(X_train)[:, 0].tolist()
    isActual = y_train == correctClass

    df = pd.DataFrame()
    df['id'] = ids
    df['pred'] = pred
    df['actual'] = isActual.tolist()
    df['realClass'] = y_train
    df['correct'] = df['actual'] == df['pred']

    # Divide by the number of files actually scored: when fewer than n files
    # exist, dividing by n would understate every rate.
    total = float(len(df))
    false_pos = len(df.loc[(df['actual'] == False) & (df['pred'] == True)])
    false_neg = len(df.loc[(df['actual'] == True) & (df['pred'] == False)])
    n_correct = len(df.loc[df['correct'] == True])

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("False Positives Rate: " + str(false_pos / total))
    print("False Negative Rate: " + str(false_neg / total))
    print("Correct ID Rate: " + str(n_correct / total))
    print("")
    return df

In [136]:
def viewFalseN(df):
    """Return the false-negative rows: ``actual`` is True but ``pred`` is False."""
    really_in_class = df['actual'] == True
    predicted_out = df['pred'] == False
    return df.loc[really_in_class & predicted_out]

def viewFalseP(df):
    """Return the false-positive rows: ``actual`` is False but ``pred`` is True."""
    really_out_of_class = df['actual'] == False
    predicted_in = df['pred'] == True
    return df.loc[really_out_of_class & predicted_in]

def viewCorrect(df):
    """Return the true-positive rows: both ``actual`` and ``pred`` are True.

    Rows where both are False are not included.
    """
    really_in_class = df['actual'] == True
    predicted_in = df['pred'] == True
    return df.loc[really_in_class & predicted_in]

In [226]:
%%time
# Checks the accuracy of the function check_swizzor on the training data.
# Swizzor's class number is 10.
# There are 3086 entries in the training matrix, so n=3086 scores all of them.
df = accuracyRate(func=check_swizzor, correctClass=10, n=3086)

False Positives Rate: 0.00810110174984
False Negative Rate: 0.00324044069994
Correct ID Rate: 0.98865845755

CPU times: user 20.8 s, sys: 1.24 s, total: 22.1 s
Wall time: 26 s


In [227]:
# Shapes of the false-positive and false-negative tables (rows, columns).
# Single-argument print(...) gives the same output on Python 2 and 3.
print(viewFalseP(df).shape)
print(viewFalseN(df).shape)

(25, 5)
(10, 5)


In [120]:
# Checking function: takes a parsed XML tree and returns True/False.
def check_VB2(tree):
    """Return True when the report has exactly one process and two
    consecutive elements whose tags both contain "destroy_window".

    Processes are the root's direct children that carry a ``filename``
    attribute; tags are taken from ``tree.iter()`` in document order.

    Parameters
    ----------
    tree : ElementTree
        Parsed report.

    Returns
    -------
    bool
    """
    root = tree.getroot()

    # The single-process requirement does not depend on the tag scan, so
    # settle it once up front instead of re-testing it inside the loop.
    n_process = sum(1 for child in root if 'filename' in child.attrib)
    if n_process != 1:
        return False

    tags = [el.tag for el in tree.iter()]

    # Look for two adjacent tags that both mention destroy_window.
    return any("destroy_window" in first and "destroy_window" in second
               for first, second in zip(tags, tags[1:]))

# Notes on VB:
# VB seems to be very similar to "None".
# This check is not very good so far.

In [141]:
%%time
# Scores the VB checking function against class number 12 with n=3086.
# NOTE(review): only check_VB2 is defined in the cells visible here, not
# check_VB -- confirm which function this call and its saved output refer to.
df = accuracyRate(func=check_VB, correctClass=12, n=3086)

False Positives Rate: 0.0777705767984
False Negative Rate: 0.0408295528192
Correct ID Rate: 0.881399870382

CPU times: user 20.6 s, sys: 1.31 s, total: 21.9 s
Wall time: 25.7 s


In [212]:
# Write a function which takes an XML root and returns True/False
# Should evaluate whether or not it is part of a malware class
def check_autorun(tree) :
    root = tree.getroot()
    
    tags = []
    process = []
    
    for child in root :
        if type(child) is not str :
            if 'filename' in child.attrib.keys() :
                process.append(child.attrib['filename'])
    
    keywords = ['taskkill']
    
    for i in range(len(process)) :
        for word in keywords :
            if word in process[i] :
                return True
                break
    
    return False

# Notes on AutoRun:
# Not bad: only 6 false positives, but about 5 times as many false negatives.
# There are as many false negatives as correct IDs.

In [213]:
%%time
# Checks the accuracy of the function check_autorun against class number 1
# with n=3086.
df = accuracyRate(func=check_autorun, correctClass=1, n=3086)

False Positives Rate: 0.00194426441996
False Negative Rate: 0.00745301360985
Correct ID Rate: 0.99060272197

CPU times: user 21.2 s, sys: 1.28 s, total: 22.4 s
Wall time: 26.2 s


In [215]:
# Print the process file names of a report labelled Virut; the saved output
# shows the .EX file, svchost.exe and a run of .NET ngen/mscorsvw commands.
viewProcessNames('2b5ac6a2cc219da8fbb3d29ebea207de7c6e0924b.Virut.xml')

c:\bd2a1362e59b1660859d82cad03db947.EX
C:\WINDOWS\system32\svchost.exe
C:\WINDOWS\Microsoft.NET\Framework\v2.0.50727\ngen.exe install MMCEx, Version=3.0.0.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35, processorArchitecture=MSIL /noDependencies
C:\WINDOWS\Microsoft.NET\Framework\v2.0.50727\mscorsvw.exe -UseCLSID {64D9463A-CB1F-4E7E-97B2-88EEC749D9EF} -Comment Dependency Analyzer
C:\WINDOWS\Microsoft.NET\Framework\v2.0.50727\mscorsvw.exe -UseCLSID {B27DB6E1-605C-47B5-8905-8BE908B7B1F1} -Comment Compile worker for MMCEx, Version=3.0.0.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35, processorArchitecture=MSIL
C:\WINDOWS\Microsoft.NET\Framework\v2.0.50727\ngen.exe install MMCFxCommon, Version=3.0.0.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35, processorArchitecture=MSIL /noDependencies
C:\WINDOWS\Microsoft.NET\Framework\v2.0.50727\mscorsvw.exe -UseCLSID {25478E7E-EF50-4BEA-BF8D-38EE08768225} -Comment Dependency Analyzer
C:\WINDOWS\Microsoft.NET\Framework\v2.0.50727\mscors

In [216]:
# Print the process file names of a second report labelled Virut; the saved
# output shows the .EX file followed by svchost.exe and services.exe.
viewProcessNames('5c1444dfae977f4c7d0262cb4bdb9778208df2edd.Virut.xml')

c:\dfd4d1666eb593b65b0cce0746775aca.EX
C:\WINDOWS\system32\svchost.exe
C:\WINDOWS\system32\services.exe


In [219]:
# Print the process file names of a third report labelled Virut; the saved
# output shows a single .EX file name.
viewProcessNames('9c5874e60c7f44755c71c3f5d99ce3978501a717a.Virut.xml')

c:\eea7219b677bd8d25316cbca7f1563f2.EX


In [225]:
# Print the process file names of a fourth report labelled Virut; the saved
# output again shows a single .EX file name.
viewProcessNames('b89c6244c441835ee0c8a0359a6de49a899323505.Virut.xml')

c:\dc7c7a0c7a90d2e310c95eb5d8b67a74.EX
