In [1]:
import os
import util
from scipy import sparse
import numpy as np
from collections import Counter
import pandas as py
from matplotlib import pyplot as plt

try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

def check_all(start_index, end_index, direc):
    classes = []
    ids = [] 
    processes = []
    i = -1
    for datafile in os.listdir(direc):
        if datafile == '.DS_Store':
            continue

        i += 1
        if i < start_index:
            continue 
        if i >= end_index:
            break
        
        # extract id and true class (if available) from filename
        id_str, clazz = datafile.split('.')[:2]
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))

        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an "X" label
            assert clazz == "X"
            classes.append(-1)

        tree = ET.parse(os.path.join(direc,datafile))
        
        processList = []
        for el in tree.iter() :
            processList.append(el.tag)
        processes.append(processList)

    return np.array(classes), ids, processes

In [192]:
import pandas as pd 

y, ids, processes = check_all(0, 3086, 'train')
lengths = [len(process) for process in processes]

df = pd.DataFrame()
df['class'] = y
df['ids'] = ids
df['lengths'] = lengths

In [235]:
def contains_sublist(lst, sublst):
    n = len(sublst)
    return any([(sublst == lst[i:i+n]) for i in xrange(len(lst)-n+1)])

def substring(n,m,minLength=5) :
    start = 0
    while start < len(n) - 5 :
        exists = True
        while exists == True and (start + minLength) < len(n):
            exists = contains_sublist(m, n[start:start+minLength])
            minLength += 1
        start += 1
    return minLength/float(len(n))

def most_common(lst):
    return max(set(lst), key=lst.count)

def capMask(vector, limit) :
    if np.sum(vector) > limit :
        falseMask = random.sample(np.where(np.array(mask) == True)[0].tolist(),np.sum(vector) - limit)
        for sample in falseMask :
            vector[sample] = False
        return vector
    else :
        return vector

def bestMatch(unknown, allProcesses, lengths, y, df, minLength, error) :
    u_length = len(unknown)
    mask = [True if (length-error) <= u_length <= (length+error) else False for length in lengths]
    
    if np.sum(mask) == 1 :
        return [el[1] for el in zip(mask,y) if el[0] == True][0]
    elif u_length < 1500 :
        mask = capMask(mask, 50)
        dftouse = df[mask]
        processtouse = [el[1] for el in zip(mask,allProcesses) if el[0] == True]

        withScores = dftouse.copy()
        withScores['scores'] = [substring(unknown, process) for process in processtouse]
        topClasses = withScores[withScores.scores > 0.8].sort(['scores'], ascending=False)['class'].tolist()
        return most_common(topClasses)
    else :
        return 'Too hard! :('

In [214]:
for i in range(100) :
    u_length = len(processes[i])
    mask = [True if (length-error) <= u_length <= (length+error) else False for length in lengths]
    print np.sum(mask)

(728, 2)
(12137, 1)
(97, 236)
(484, 1)
(1136, 3)
(23, 5)
(716, 24)
(1313, 2)
(713, 30)
(702, 99)
(702, 99)
(715, 21)
(77, 5)
(9926, 1)
(631, 8)
(28, 2)
(706, 132)
(936, 1)
(709, 44)
(117, 2)
(7301, 1)
(97, 236)
(2909, 1)
(716, 24)
(634, 11)
(1342, 1)
(706, 132)
(705, 63)
(2027, 1)
(7132, 1)
(40, 43)
(561, 1)
(704, 10)
(57, 7)
(97, 236)
(192, 11)
(3593, 1)
(709, 44)
(75, 4)
(702, 99)
(8251, 1)
(1456, 26)
(6756, 1)
(97, 236)
(294, 1)
(1456, 26)
(87, 5)
(560, 1)
(705, 63)
(358, 1)
(706, 132)
(2304, 1)
(714, 52)
(196, 2)
(637, 14)
(12985, 1)
(81, 5)
(89, 5)
(90, 3)
(710, 48)
(805, 1)
(1736, 6)
(9591, 1)
(717, 13)
(709, 44)
(722, 14)
(281, 1)
(190, 2)
(703, 32)
(1268, 2)
(39, 6)
(712, 37)
(97, 236)
(713, 30)
(714, 52)
(708, 25)
(1370, 1)
(292, 2)
(712, 37)
(6876, 1)
(707, 56)
(710, 48)
(97, 236)
(109, 14)
(723, 12)
(97, 236)
(635, 8)
(707, 56)
(1221, 1)
(9730, 1)
(708, 25)
(709, 44)
(710, 48)
(1503, 3)
(43, 1)
(504, 2)
(642, 7)
(1194, 2)
(4292, 1)
(707, 56)


In [236]:
for i in range(0,100) :
    print "Actual: " + str(y[i])
    print "Pred: " + str(bestMatch(processes[i], processes, lengths, y, df, minLength=5, error=0))
    print ""

Actual: 8
Pred: 8

Actual: 6
Pred: 6

Actual: 12


ValueError: sample larger than population

In [241]:
u_length = len(processes[2])
mask = [True if (length-error) <= u_length <= (length+error) else False for length in lengths]
new_mask = capMask(mask, 50)

[False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 

In [234]:
new_mask = capMask(mask, 50)
np.sum(new_mask)

50

In [50]:
processes[0:100]

[['processes',
  'process',
  'thread',
  'all_section',
  'load_image',
  'load_dll',
  'load_dll',
  'load_dll',
  'load_dll',
  'load_dll',
  'load_dll',
  'load_dll',
  'load_dll',
  'load_dll',
  'load_dll',
  'load_dll',
  'load_dll',
  'load_dll',
  'load_dll',
  'load_dll',
  'load_dll',
  'load_dll',
  'load_dll',
  'load_dll',
  'load_dll',
  'load_dll',
  'load_dll',
  'load_dll',
  'load_dll',
  'open_file',
  'get_windows_directory',
  'check_for_debugger',
  'load_dll',
  'get_system_directory',
  'open_key',
  'open_key',
  'query_value',
  'create_mutex',
  'create_mutex',
  'create_mutex',
  'create_mutex',
  'create_mutex',
  'open_key',
  'query_value',
  'query_value',
  'get_system_directory',
  'open_key',
  'query_value',
  'create_mutex',
  'set_windows_hook',
  'set_windows_hook',
  'open_key',
  'query_value',
  'load_dll',
  'get_system_directory',
  'load_dll',
  'get_system_directory',
  'load_dll',
  'load_dll',
  'get_system_directory',
  'create_window',