In [1]:
import sys
sys.version

'2.7.11 |Anaconda 2.4.1 (x86_64)| (default, Dec  6 2015, 18:57:58) \n[GCC 4.2.1 (Apple Inc. build 5577)]'

In [102]:
import os
import time
import math

import numpy as np
import scipy as sp
from scipy import sparse

from collections import Counter

try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix # added
from sklearn.cross_validation import LabelKFold
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier

import pickle
from sklearn.externals import joblib

# from tabulate import tabulate

%matplotlib inline
import matplotlib.pyplot as plt

import seaborn as sns
sns.set_style('white')
sns.set_context('notebook')

<div class="alert alert-info">
<p><strong>GENERAL SETUP</strong></p>
</div>

**Common functions**

In [69]:
def count_files(directory):
    '''
    Returns how many entries os.listdir reports for the given directory,
    leaving out the '.DS_Store' metadata file.
    '''
    # every listed name other than '.DS_Store' is counted as a data file
    return sum(1 for entry in os.listdir(directory) if entry != '.DS_Store')

In [70]:
def write_predictions(predictions, ids, outfile):
    '''
    Saves predictions as a two-column CSV with the header 'Id,Prediction'.

    One line is written per entry of ids, pairing ids[i] with predictions[i]
    (formatted as an integer), so predictions must be indexable and at least
    as long as ids. predictions[i] is expected to be the index of the
    predicted class within the malware_classes list.

    An existing outfile is overwritten.
    '''
    row_format = '%s,%d\n'

    with open(outfile, 'w+') as out:
        # header line
        out.write('Id,Prediction\n')

        # one row per id, in the order the ids are given
        for position, history_id in enumerate(ids):
            out.write(row_format % (history_id, predictions[position]))

In [71]:
def grid_search(model, parameters, train_x, train_y, folds=5, score_func=None):
    '''
    Runs a cross-validated grid search (GridSearchCV) over the given parameter
    grid and returns the search's best_estimator_.

    From SKLearn documentation:
    By default, parameter search uses the score function of the estimator to evaluate a parameter setting.
    These are the sklearn.metrics.accuracy_score for classification and sklearn.metrics.r2_score for regression.

    Inputs
    ----------
    model: Any Scikit Learn predictive model
    parameters: Dictionary containing the parameters to be searched over
    train_x: Training set X-variables
    train_y: Training set Y-variables
    folds: Value passed to GridSearchCV as `cv` (default=5)
    score_func: Passed to GridSearchCV as `scoring` when not None (default=None)
    '''

    # `scoring` is only forwarded when a custom score function was supplied,
    # otherwise GridSearchCV is left on its own default
    search_kwargs = {'param_grid': parameters, 'cv': folds}
    if score_func is not None:
        search_kwargs['scoring'] = score_func

    searcher = GridSearchCV(model, **search_kwargs)
    searcher.fit(train_x, train_y)

    return searcher.best_estimator_

In [103]:
def fit_model(model, train_x, train_y, test_x, test_y, use_test=True):
    '''
    Generic function that is used to fit a model and compute its training
    accuracy and, optionally, its accuracy on a held-out set.

    Inputs
    ----------
    model: Any Scikit Learn predictive model (with or without custom parameters)
    train_x: Training set X-variables
    train_y: Training set Y-variables
    test_x: Test/validation set X-variables (only used when use_test is True)
    test_y: Test/validation set Y-variables (only used when use_test is True)
    use_test: When True, the model is also scored on test_x/test_y and the
        confusion matrix, the model and both accuracies are printed.
        When False, nothing is printed and the returned test accuracy is None.

    Returns
    ----------
    Tuple (model, train_acc, test_acc): the fitted model, the training set
    accuracy and the test set accuracy (None when use_test is False).
    '''
    # fit model
    model.fit(train_x, train_y)

    # calculate training set accuracy
    train_acc = accuracy_score(train_y, model.predict(train_x))

    test_acc = None
    if use_test:
        # calculate test set accuracy
        y_pred = model.predict(test_x)
        test_acc = accuracy_score(test_y, y_pred)

        # single-argument print(...) produces the same output as the Python 2
        # print statement and is also valid Python 3
        print(confusion_matrix(test_y, y_pred))
        print(model)
        print('----------')
        print('Training set accuracy = %0.4f' % train_acc)
        print('Validation set accuracy = %0.4f' % test_acc)
        print('----------')

    # return fitted model and training/test accuracy
    return model, train_acc, test_acc

**Common variables**

In [73]:
# class labels (malware classes)
# the position of a name in this list is the integer class index:
# create_data_matrix stores malware_classes.index(<class in filename>)
malware_classes = ['Agent', 'AutoRun', 'FraudLoad', 'FraudPack', 'Hupigon', 'Krap',
                   'Lipler', 'Magania', 'None', 'Poison', 'Swizzor', 'Tdss',
                   'VB', 'Virut', 'Zbot']

In [74]:
# global collection of every call/tag name encountered while parsing XML
# (filled in by add_to_set)
call_set = set()

In [75]:
# file locations (relative paths, resolved against the notebook's working directory)
TRAIN_DIR = 'data/train'
TEST_DIR = 'data/test'

In [76]:
# number of files per directory
train_files = count_files(TRAIN_DIR)
test_files = count_files(TEST_DIR)
# 70% of the training file count, truncated to an integer
# NOTE(review): valid_files is not referenced in the part of the notebook visible here
valid_files = int(train_files * 0.7)
print train_files, test_files

3086 3724


<div class="alert alert-info">
<p><strong>FEATURE ENGINEERING</strong></p>
</div>

**Custom functions**

<div class="alert alert-success">
<p>&#9786; <strong>ADD FEATURE ENGINEERING FUNCTIONS HERE</strong> &#9786;</p>
</div>

### Pre-processing (run once)

In [None]:
# call/tag names found in the XML files, in alphabetical order (list generated during pre-processing)

opcodes = ['accept_socket',
 'add_netjob',
 'add_share',
 'all_section',
 'bind_socket',
 'change_service_config',
 'check_for_debugger',
 'com_create_instance',
 'com_createole_object',
 'com_get_class_object',
 'connect',
 'connect_share',
 'connect_socket',
 'control_service',
 'copy_file',
 'create_directory',
 'create_file',
 'create_interface',
 'create_key',
 'create_mailslot',
 'create_mutex',
 'create_namedpipe',
 'create_open_file',
 'create_process',
 'create_process_as_user',
 'create_process_nt',
 'create_service',
 'create_socket',
 'create_thread',
 'create_thread_remote',
 'create_window',
 'delete_file',
 'delete_key',
 'delete_service',
 'delete_share',
 'delete_value',
 'destroy_window',
 'download_file',
 'download_file_to_cache',
 'dump_line',
 'enum_handles',
 'enum_items',
 'enum_keys',
 'enum_modules',
 'enum_processes',
 'enum_services',
 'enum_share',
 'enum_subtypes',
 'enum_types',
 'enum_user',
 'enum_values',
 'enum_window',
 'exit_windows',
 'find_file',
 'find_window',
 'get_computer_name',
 'get_file_attributes',
 'get_host_by_addr',
 'get_host_by_name',
 'get_system_directory',
 'get_system_time',
 'get_userinfo',
 'get_username',
 'get_windows_directory',
 'impersonate_user',
 'kill_process',
 'listen_socket',
 'load_dll',
 'load_driver',
 'load_image',
 'logon_as_user',
 'message',
 'move_file',
 'open_file',
 'open_key',
 'open_mutex',
 'open_process',
 'open_scmanager',
 'open_service',
 'open_url',
 'ping',
 'process',
 'processes',
 'query_keyinfo',
 'query_value',
 'read_section',
 'read_section_names',
 'read_value',
 'recv_socket',
 'remove_directory',
 'revert_to_self',
 'send_socket',
 'set_file_attributes',
 'set_file_time',
 'set_system_time',
 'set_thread_context',
 'set_value',
 'set_windows_hook',
 'show_window',
 'sleep',
 'start_service',
 'thread',
 'trimmed_bytes',
 'unload_driver',
 'vm_allocate',
 'vm_mapviewofsection',
 'vm_protect',
 'vm_read',
 'vm_write',
 'write_value']

In [None]:
len(opcodes)

In [None]:
hex(0)

In [None]:
# map every opcode name to a two-character lowercase hex code derived from its
# position in `opcodes` ('00', '01', ..., '0a', ...); only the last two hex
# digits of the position are kept
hexrep = dict(
    (opcode, ('%02x' % position)[-2:])
    for position, opcode in enumerate(opcodes)
)

In [None]:
for i in xrange(len(opcodes)):
    print opcodes[i], hexrep[opcodes[i]]

In [None]:
# code used by create_call_string for any tag that has no entry of its own in hexrep
hexrep["unknown"]="ff"

In [None]:
def create_call_string(tree):
    '''
    Encodes the sequence of calls in a parsed XML tree as a string of
    hexadecimal digit pairs (e.g. "6a"), one pair per call, in the order
    tree.iter() yields the elements.

    Tags listed in `ignore` (section headers) are skipped. A tag with no entry
    in the global `hexrep` mapping is encoded as hexrep['unknown'].

    Returns the concatenated string ('' when no call is encoded).
    '''

    # ignore section headers
    ignore = frozenset(['processes', 'process', 'thread', 'all_section'])

    # collect the codes and join once at the end instead of growing a string
    # with += for every element
    codes = []

    # loop through all calls/tags in the XML file
    for el in tree.iter():

        # extract the call/tag name
        call = el.tag
        if call in ignore:
            continue

        # look up the hex code; the fallback is only read when it is needed
        code = hexrep.get(call)
        if code is None:
            code = hexrep['unknown']
        codes.append(code)

    return ''.join(codes)

In [None]:
# create strings and save
# NOTE(review): create_data_matrix is defined in a later cell of this notebook
# ("Generic functions for processing files"); that cell has to be run first
X_dev, Y_dev, dev_ids = create_data_matrix(
    fteng_fn=create_call_string, direc=TRAIN_DIR, start_index=0, end_index=train_files)

# frame indexed by file id; its only column holds the hex call string of each file
df = pd.DataFrame(X_dev, dev_ids)
df.to_csv('hexstr.csv')

In [None]:
# create strings and save (same as the previous cell, but for the test directory)
# NOTE(review): create_data_matrix is defined in a later cell of this notebook
X_dev2, Y_dev2, dev_ids2 = create_data_matrix(
    fteng_fn=create_call_string, direc=TEST_DIR, start_index=0, end_index=test_files)

# frame indexed by file id; its only column holds the hex call string of each file
# NOTE(review): the name df2 is reused for a different table in the
# "reload feature data" section
df2 = pd.DataFrame(X_dev2, dev_ids2)
df2.to_csv('hexstr_test.csv')

In [None]:
# one vocabulary dict per window length (2 to 8 calls); the keys will be the
# hex substrings that were seen, the values are placeholders
codes2, codes3, codes4, codes5, codes6, codes7, codes8 = (
    {} for _unused in range(7))

In [None]:
# scratch check: stepping by 2 and slicing [key: key+2] walks the string in
# consecutive, non-overlapping character pairs ("te", "st", "in", "gs")
test= "testings"
for key in range(0, len(test), 2):
    print test[key: key+2]
    

In [None]:
# (window length in calls, vocabulary dict) in increasing window length;
# every call is encoded as 2 hex characters, so a window of n calls spans
# 2 * n characters of the call string
ngram_vocabs = [(2, codes2), (3, codes3), (4, codes4), (5, codes5),
                (6, codes6), (7, codes7), (8, codes8)]

for i in range(df.shape[0]):
    s = df.iloc[i, 0]
    # the string must consist of whole 2-character codes
    assert (len(s) % 2) == 0

    # slide over the string one call (2 characters) at a time
    for key in range(0, len(s), 2):
        for n_calls, vocab in ngram_vocabs:
            width = 2 * n_calls
            if (key + width) > len(s):
                # windows are tried shortest first, so no longer one fits either
                break
            # record the window; only the key matters
            vocab[s[key: key + width]] = 1  # dummy value

# vocabulary sizes for window lengths 2..8, space separated on one line
print(' '.join(str(len(vocab)) for _, vocab in ngram_vocabs))

In [None]:
# zero-initialised count matrices: one row per row of df and one column per
# distinct window collected in codes2 / codes4 / codes6 / codes8
# NOTE(review): these are dense integer arrays, so memory grows with len(codesN)
seqdata2 = np.zeros((df.shape[0], len(codes2)), dtype=int)
seqdata4 = np.zeros((df.shape[0], len(codes4)), dtype=int)
seqdata6 = np.zeros((df.shape[0], len(codes6)), dtype=int)
seqdata8 = np.zeros((df.shape[0], len(codes8)), dtype=int)

In [None]:
# wrap each count matrix in a DataFrame whose column labels are the window
# strings, in the order codesN.keys() returns them
seqdf2 = pd.DataFrame(seqdata2, columns=codes2.keys())
seqdf4 = pd.DataFrame(seqdata4, columns=codes4.keys())
seqdf6 = pd.DataFrame(seqdata6, columns=codes6.keys())
seqdf8 = pd.DataFrame(seqdata8, columns=codes8.keys())

In [None]:
print seqdf2.shape, seqdf4.shape, seqdf6.shape, seqdf8.shape

In [None]:
# (window width in characters, count frame); 2 hex characters encode one
# call, so these are the 2-, 4-, 6- and 8-call windows
ngram_frames = [(4, seqdf2), (8, seqdf4), (12, seqdf6), (16, seqdf8)]

for i in range(df.shape[0]):
    s = df.iloc[i, 0]
    # the string must consist of whole 2-character codes
    assert (len(s) % 2) == 0

    for width, frame in ngram_frames:
        # tally every window of this width that starts on a call boundary and
        # fits inside the string (empty when the string is shorter than width)
        window_counts = Counter(
            s[key: key + width] for key in range(0, len(s) - width + 1, 2))

        # write each total with a single label-based access; chained indexing
        # (frame[ss][i] += 1) can end up assigning to a temporary copy
        for ss, count in window_counts.items():
            frame.at[i, ss] += count

In [None]:
# persist the window count tables; to_csv is called with its defaults, and the
# files read back in the next section contain an extra 'Unnamed: 0' column
seqdf2.to_csv('hex2seqs.csv')
seqdf4.to_csv('hex4seqs.csv')
seqdf6.to_csv('hex6seqs.csv')
seqdf8.to_csv('hex8seqs.csv')

### reload feature data

In [None]:
# reload the saved window count tables
# NOTE(review): df2 was previously bound to the test-set hex-string frame;
# that binding is replaced here
df2 = pd.read_csv("hex2seqs.csv")
df4 = pd.read_csv("hex4seqs.csv")
df6 = pd.read_csv("hex6seqs.csv")
df8 = pd.read_csv("hex8seqs.csv")

In [None]:
print df2.shape, df4.shape, df6.shape, df8.shape

In [None]:
df4.head()

In [None]:
# remove the 'Unnamed: 0' column that came back from the CSV files
# NOTE(review): the drops are in place, so re-running this cell without
# re-reading the CSVs presumably fails because the column is already gone
df2.drop('Unnamed: 0', axis=1, inplace=True)
df4.drop('Unnamed: 0', axis=1, inplace=True)
df6.drop('Unnamed: 0', axis=1, inplace=True)
df8.drop('Unnamed: 0', axis=1, inplace=True)
df4.head()

In [None]:
# put the four window count tables side by side (axis=1) into one feature table
newdf = pd.concat([df2, df4, df6, df8], axis=1)
print newdf.shape
newdf.head()

In [None]:
newdf.to_csv("bigtrain.csv")

### construct test data

In [3]:
# reload the combined training features; the file also yields an
# 'Unnamed: 0' column (first entry of colnames in the output below)
newdf = pd.read_csv("bigtrain.csv")
colnames = list(newdf)

In [4]:
colnames

['Unnamed: 0',
 '4e1a',
 '1e2c',
 '6654',
 '0f45',
 '0541',
 '585b',
 '4d5b',
 '3c4b',
 '3c4c',
 '3c4a',
 '0549',
 '1658',
 '3c4d',
 '473b',
 '473c',
 '473a',
 '072e',
 '1f24',
 '231f',
 '540c',
 '231c',
 '3b09',
 '4838',
 '2b14',
 '6355',
 '6157',
 '4114',
 '0720',
 '0724',
 '4735',
 '4738',
 '2312',
 '2310',
 '5854',
 '054d',
 '3c49',
 '054a',
 '3c43',
 '3c41',
 '5858',
 '3c47',
 '3c45',
 '0058',
 '6a2e',
 '244c',
 '244a',
 '6a2b',
 '0e5c',
 '6359',
 '3b3b',
 '3b3c',
 '3b3a',
 '3b3f',
 '3b3e',
 '481c',
 '204d',
 '1f4c',
 '204c',
 '481e',
 '204a',
 '6059',
 '0616',
 '0610',
 '2443',
 '2441',
 '2449',
 '0e57',
 '0e54',
 '0938',
 '4812',
 '4810',
 '4817',
 '2043',
 '2041',
 '061e',
 '061c',
 '061b',
 '3b33',
 '3b36',
 '3b37',
 '3b35',
 '3b38',
 '0937',
 '4b4b',
 '4b4c',
 '4b4a',
 '0f0f',
 '0f0e',
 '2b5c',
 '4c0f',
 '2c2b',
 '3b2b',
 '3512',
 '3510',
 '3517',
 '3516',
 '3514',
 '162c',
 '162b',
 '6206',
 '6207',
 '162e',
 '5d43',
 '5d41',
 '6208',
 '1436',
 '0f06',
 '0f07',
 '1437',
 '4b

In [5]:
# drop the 'Unnamed: 0' column so that only the window count columns remain
newdf.drop('Unnamed: 0', axis=1, inplace=True)
colnames = list(newdf)
# NOTE(review): this prints every column name, which produces a very long output
print colnames, len(colnames)

['4e1a', '1e2c', '6654', '0f45', '0541', '585b', '4d5b', '3c4b', '3c4c', '3c4a', '0549', '1658', '3c4d', '473b', '473c', '473a', '072e', '1f24', '231f', '540c', '231c', '3b09', '4838', '2b14', '6355', '6157', '4114', '0720', '0724', '4735', '4738', '2312', '2310', '5854', '054d', '3c49', '054a', '3c43', '3c41', '5858', '3c47', '3c45', '0058', '6a2e', '244c', '244a', '6a2b', '0e5c', '6359', '3b3b', '3b3c', '3b3a', '3b3f', '3b3e', '481c', '204d', '1f4c', '204c', '481e', '204a', '6059', '0616', '0610', '2443', '2441', '2449', '0e57', '0e54', '0938', '4812', '4810', '4817', '2043', '2041', '061e', '061c', '061b', '3b33', '3b36', '3b37', '3b35', '3b38', '0937', '4b4b', '4b4c', '4b4a', '0f0f', '0f0e', '2b5c', '4c0f', '2c2b', '3b2b', '3512', '3510', '3517', '3516', '3514', '162c', '162b', '6206', '6207', '162e', '5d43', '5d41', '6208', '1436', '0f06', '0f07', '1437', '4b49', '4b45', '4b43', '4b41', '3207', '3206', '620f', '1620', '5401', '1624', '5d4a', '4c5d', '4a16', '5d4d', '165b', '351c',

In [6]:
# hex call strings of the test files, as written to 'hexstr_test.csv' earlier
# in this notebook; per the preview below, 'Unnamed: 0' holds the file id and
# column '0' holds the call string
teststrings = pd.read_csv('hexstr_test.csv')
teststrings.head()

Unnamed: 0.1,Unnamed: 0,0
0,0015c8c9ff02fea9d0f45692b9eebfb4abff4e42f,4543434343434343434343434343434343434343434343...
1,001f298a534ae4b0db7f2707169250aa215c3b5f2,454343434343434343434343434343434343436a6a6a6a...
2,001f5fdaaa8bbe20303527198d09a30bb7ca3eb50,45434343434343434343434343434343433b384a544343...
3,002ca2c41b649f85c05ae30013436781a932fecc6,45434343434343434343434343431c6a6a6a6a6a6a6a6a...
4,003e109543b4ea22d2bcc1ec309bf2fd34e9a1a1d,454343434343434343434343434343434306433b4a4a54...


In [7]:
teststrings.shape

(3724, 2)

In [8]:
# zero count matrix for the test set: one row per test file and one column
# per column of newdf
# NOTE(review): dense integer array of shape (3724, 191855) per the output
# below -- confirm the memory footprint is acceptable
temp = np.zeros((teststrings.shape[0], newdf.shape[1]), dtype=int)
temp.shape

(3724, 191855)

In [9]:
# test-set count table with the same columns, in the same order, as newdf
testsetdf = pd.DataFrame(temp, columns=list(newdf))

In [10]:
testsetdf.shape

(3724, 191855)

In [11]:
# lookup table of the training column names, used for fast membership tests
# (the values are placeholders)
colset = dict.fromkeys(list(newdf), 0)
# a size mismatch would mean that newdf has duplicate column names
assert len(list(newdf)) == len(colset)

In [12]:
# window widths in characters: 2, 4, 6 and 8 calls at 2 hex characters each
ngram_widths = (4, 8, 12, 16)

for i in range(testsetdf.shape[0]):
    s = teststrings.iloc[i, 1]
    # the string must consist of whole 2-character codes
    assert (len(s) % 2) == 0
    print("processing row %i with string length %i" % (i, len(s)))

    # tally, for this row, every window that is also a training column;
    # windows of different widths have different lengths, so one counter
    # can hold all of them
    window_counts = Counter()
    for width in ngram_widths:
        # only start positions on a call boundary where the window fits
        for key in range(0, len(s) - width + 1, 2):
            ss = s[key: key + width]
            if ss in colset:
                window_counts[ss] += 1

    # write each total with a single label-based access; chained indexing
    # (testsetdf[ss][i] += 1) can end up assigning to a temporary copy
    for ss, count in window_counts.items():
        testsetdf.at[i, ss] += count

processing row 0 with string length 1390
processing row 1 with string length 1242
processing row 2 with string length 78
processing row 3 with string length 2640
processing row 4 with string length 154
processing row 5 with string length 1232
processing row 6 with string length 37828
processing row 7 with string length 72
processing row 8 with string length 186
processing row 9 with string length 406
processing row 10 with string length 158
processing row 11 with string length 74
processing row 12 with string length 358
processing row 13 with string length 2930
processing row 14 with string length 2724
processing row 15 with string length 102
processing row 16 with string length 2194
processing row 17 with string length 556
processing row 18 with string length 1120
processing row 19 with string length 376
processing row 20 with string length 1422
processing row 21 with string length 186
processing row 22 with string length 55862
processing row 23 with string length 22332
processing row

In [13]:
testsetdf.to_csv("bigtest.csv")

In [14]:
testsetdf.head()

Unnamed: 0,4e1a,1e2c,6654,0f45,0541,585b,4d5b,3c4b,3c4c,3c4a,0549,1658,3c4d,473b,473c,473a,072e,1f24,231f,540c,231c,3b09,4838,2b14,6355,6157,4114,0720,0724,4735,4738,2312,2310,5854,054d,3c49,054a,3c43,3c41,5858,3c47,3c45,0058,6a2e,244c,244a,6a2b,0e5c,6359,3b3b,3b3c,3b3a,3b3f,3b3e,481c,204d,1f4c,204c,481e,204a,6059,0616,0610,2443,2441,2449,0e57,0e54,0938,4812,4810,4817,2043,2041,061e,061c,061b,3b33,3b36,3b37,3b35,3b38,0937,4b4b,4b4c,4b4a,0f0f,0f0e,2b5c,4c0f,2c2b,3b2b,3512,3510,3517,3516,3514,162c,162b,6206,...,4a5407604a4a5454,07334a434a544a54,4938493849384343,373b353b35354935,433c434a5407434a,272766581024334a,544a0a4f630a4f4a,3535354a54331e3b,4a1254605460364a,324a4a54436a6a6a,434a4a172433413b,544a543c4a3c5454,3b3e4a4a5460544a,4a0707493c4a5407,3317413338380f10,666349633c4a5407,634a544a3f4a543b,351c454343434343,6a6a6a6a6a434333,1f351f1f1f353535,543b4a54074a4a54,3c3c43434343436a,434343434a4d1a64,4a5407434a4a5443,6a6a6a6a6a6a1c6a,3362364c33364c3b,3b064a5454434a4a,1e1e1e4a124a6043,633a1b43040c5827,364c3f3b1f4a543c,57574a54544a4a35,490720434a32493e,54073c33434a4a54,43331e3349494a4a,43353b3537404938,43634d634d634d63,105d38105d38105d,49434a4a544a3c3c,60604a2a4a544a54,54331e3358272727,37494949493e3749,3324332433454343,434a546358272727,381e1e1e62336233,1f101b040c3c5b27,3362364c33364c33,49351f381f0f4935,4a32204a4a4a4a4a,434a1260434a541b,434a4a2020202020,664343070707074a,544a540e1f49103b,4a4a0743074a4a4a,5409070907540709,272766334a544a54,4a544a54541e4937,364c3b333b24334a,0c3c631b0c3c0c3c,4a07074a5407544a,686c686c686c6868,0e49494a4a544949,381f1f1f1f1f0f10,434a545443144b49,4a074a540a0a4a54,4335432b432b4343,275b273b43434543,434a4a43434a5454,54071e43434a5435,582727275b272758,090707074a4a5454,0f350f350f5c5c5c,4a17494145434343,634a35634a2a3563,4b4a54544a4a143b,544a334a540a0a33,4a4a33631c3a4a54,1c1c4a54544a4a14,434943434a541b4a,433f074a544a5438,074949434307430a,3535383538160f38,4a545407434a5443,4c364c3324332449,686c686c686c686c,66634a54074a5407,1c3c4a54433c4a54,2a4a4a540949634a,546143
4a543b433b,601c1c1c4a54544a,1063481e36363636,4343433b3849434a,1e62434338164338,1e331e334a4a574a,5454094349383838,4a43434a4a4a5454,4a544a541e61614a,4f634a54073b4343,354b434343063838,5d5c10495d5c101f,1c1c254958272727
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,0,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,2,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [18]:
# file ids of the test set, taken from teststrings in row order
test_ids = teststrings["Unnamed: 0"].values
test_ids

array(['0015c8c9ff02fea9d0f45692b9eebfb4abff4e42f',
       '001f298a534ae4b0db7f2707169250aa215c3b5f2',
       '001f5fdaaa8bbe20303527198d09a30bb7ca3eb50', ...,
       'ff85866b215233b2fecdca2c2b8fda54ad24c86fd',
       'ff884224571e0476990574df5da76e0991db583af',
       'ffc47163a530c51ef2e6572d786aefbaed99890f2'], dtype=object)

In [19]:
test_ids.shape

(3724,)

In [85]:
test_idsdf = pd.DataFrame(test_ids)

In [87]:
test_idsdf.to_csv("testids.csv")

In [21]:
# class labels of the training files; the file is written by the
# df_y.to_csv("train_yvals.csv") cell further down in this notebook
get_trainy = pd.read_csv("train_yvals.csv")
get_trainy.head()

Unnamed: 0.1,Unnamed: 0,0
0,0,8
1,1,6
2,2,12
3,3,8
4,4,10


In [22]:
# drop the 'Unnamed: 0' column; the labels stay in column '0'
# NOTE(review): the array shown as this cell's output does not look like the
# result of an in-place drop -- presumably stale output from an earlier edit
get_trainy.drop('Unnamed: 0', axis=1, inplace=True)

array([[ 8],
       [ 6],
       [12],
       ..., 
       [ 8],
       [ 8],
       [ 3]])

In [46]:
# 1-D array of the training labels taken from column '0'
Y_train = np.array(get_trainy["0"])

In [47]:
Y_train

array([ 8,  6, 12, ...,  8,  8,  3])

**Sample function provided: counts 2 particular calls**

In [None]:
# get classification responses from training set
def gen_y(tree):
    '''
    Placeholder feature function: ignores the parsed tree and returns None.

    It is passed to create_data_matrix when only the class labels and file ids
    returned by that function are needed.
    '''
    return None

In [None]:
def call_feats(tree):
    '''
    Sample feature engineering function.

    Counts how often each tag occurs in the parsed XML tree and returns the
    counts of the calls listed in `good_calls`.

    Inputs
    ----------
    tree: Parsed XML document providing .iter() over its elements

    Returns
    ----------
    1-D float numpy array with one entry per call in `good_calls`
    (in that order); 0 for a call that never occurs.
    '''

    # these are the calls we are looking for
    good_calls = ['sleep', 'dump_line']

    # count every occurrence of every tag; the first occurrence counts as 1
    # (it used to be recorded as 0, which made every count one too low)
    call_counter = Counter(el.tag for el in tree.iter())

    # feature array (1 x D); a Counter yields 0 for tags that were never seen
    return np.array([call_counter[call] for call in good_calls], dtype=float)

**Generic functions for processing files (do not modify)**

In [None]:
def add_to_set(tree):
    '''
    Records the tag of every element of the parsed tree in the global call_set.
    '''
    # one bulk update over all calls/tags in the XML file
    call_set.update(node.tag for node in tree.iter())

In [None]:
def create_data_matrix(fteng_fn, direc='data/train', start_index=-1, end_index=4000):
    '''
    Creates X and Y matrices based on the files present in a given directory.

    Files are visited in os.listdir order ('.DS_Store' is skipped) and numbered
    from 0. Only files whose number lies in [start_index, end_index) are
    processed, so with the defaults at most the first 4000 files are used;
    pass a larger end_index for bigger directories. A subset can be specified
    for faster testing or for training-validation splits.

    Inputs
    ----------
    fteng_fn: Function that takes a parsed ElementTree and returns one row of features
    direc: Directory containing files named '<id>.<class>.<...>'
    start_index: Number of the first file to process (default=-1)
    end_index: Number of the first file that is no longer processed (default=4000)

    Returns
    ----------
    X: Feature rows stacked with np.vstack (N x D); the single row itself when
       only one row was produced; None when fteng_fn produced no rows
    classes: Numpy array with the index of each file's class in malware_classes
       (-1 for files labelled 'X')
    ids: List of the id strings of the processed files
    '''

    # feature rows, stacked once at the end; stacking inside the loop would
    # copy the whole matrix again for every file
    rows = []

    # initialize Y (class) matrix (N x 1)
    classes = []

    # keep track of all file id's processed
    ids = []

    # keep track of the index of the file that is being processed
    # only really relevant when using a sub-set of the data
    i = -1

    # loop through each file in the directory
    # NOTE(review): os.listdir order is used as-is (not sorted); the row order
    # of X, classes and ids follows it
    for datafile in os.listdir(direc):

        # move on - no data here
        if datafile == '.DS_Store':
            continue

        # check that we are in bounds
        i += 1
        if i < start_index:
            continue
        if i >= end_index:
            break

        # extract id and true class (if available) from filename
        id_str, clazz = datafile.split('.')[:2]

        # keep track of all file id's processed
        ids.append(id_str)

        # add target class if this is training data
        try:
            classes.append(malware_classes.index(clazz))

        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an 'X' label
            assert clazz == 'X'
            classes.append(-1)

        # parse file as an xml document
        tree = ET.parse(os.path.join(direc, datafile))

        # keep track of all calls/tags we have seen
        add_to_set(tree)

        # this is where features get created
        this_row = fteng_fn(tree)

        # a None row is only dropped while no row has been kept yet, so a
        # feature function that always returns None still gives X = None
        if rows or this_row is not None:
            rows.append(this_row)

    # assemble X: nothing, the lone row as returned by fteng_fn, or the stack
    if not rows:
        X = None
    elif len(rows) == 1:
        X = rows[0]
    else:
        X = np.vstack(rows)

    # return X (N x D), Y (N x 1), list of id's parsed
    return X, np.array(classes), ids

**Test feature extraction on a small number of files**

In [None]:
# use to test feature extraction on a small number of files

# extract data - modify fteng_fn
# (disabled variant: sample features for the first 5 training files)
#X_dev, Y_dev, dev_ids = create_data_matrix(
#    fteng_fn=call_feats, direc=TRAIN_DIR, start_index=0, end_index=5)

# active variant: gen_y returns None for every file, so X_dev carries no
# features; this pass is used for the class labels and the file ids
X_dev, Y_dev, dev_ids = create_data_matrix(
    fteng_fn=gen_y, direc=TRAIN_DIR, start_index=0, end_index=train_files)

# labels and ids as single-column frames (saved to CSV in the next cells)
df_y = pd.DataFrame(Y_dev)
df_trainids = pd.DataFrame(dev_ids)
#print X_dev
#print Y_dev

In [None]:
# persist the training labels to a CSV in the current working directory
# (pandas writes the row index as the first column by default)
df_y.to_csv("train_yvals.csv")

In [None]:
# persist the training file ids to a CSV in the current working directory
# (pandas writes the row index as the first column by default)
df_trainids.to_csv("train_ids.csv")

## Generate X_train, Y_train and X_test

In [48]:
# pull the training features out of the DataFrame as a plain numpy array
# NOTE(review): `newdf` is not created in this part of the notebook - it must already
# exist in the kernel before this cell runs
X_trainraw = newdf.values

In [49]:
# peek at the first 10 rows of the raw training matrix
X_trainraw[0:10]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [50]:
# (rows, columns) of the raw training matrix
X_trainraw.shape

(3086, 191855)

In [60]:
# Column-wise standardisation is currently disabled; the raw values are used as-is.
# X_train = (X_trainraw - X_trainraw.mean(axis=0)) / X_trainraw.std(axis=0, ddof=1)
# plain assignment: X_train is the same array object as X_trainraw, not a copy
X_train = X_trainraw

In [61]:
# sanity check: X_train aliases X_trainraw, so the shape is unchanged
X_train.shape

(3086, 191855)

In [None]:
# NOTE(review): this assignment is commented out, so the cells below that read
# Y_train depend on a Y_train that already exists in the kernel - confirm where it
# is meant to come from before relying on Restart & Run All
# Y_train = Y_dev

In [53]:
# number of training labels; compare against the row count of X_train
Y_train.shape

(3086,)

In [55]:
# pull the test features out of the DataFrame as a plain numpy array
# NOTE(review): `testsetdf` is not created in this part of the notebook - it must
# already exist in the kernel before this cell runs
X_testraw = testsetdf.values

In [56]:
# peek at the first 10 rows of the raw test matrix
X_testraw[0:10]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [62]:
# no scaling applied to the test set either; X_test is the same array object as X_testraw
X_test = X_testraw

In [63]:
# (rows, columns) of the test matrix; the column count must match X_train
X_test.shape

(3724, 191855)

In [64]:
# peek at the first 10 rows of the test matrix
X_test[0:10]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [65]:
# keep an independent copy of the full training data before X_train / Y_train are
# re-bound to the 70% subset by the train/validation split below
# NOTE(review): not idempotent - re-running this cell after the split would snapshot
# the subset instead of the full data
X_train_all = X_train.copy()
Y_train_all = Y_train.copy()

In [66]:
# sanity check: feature and label row counts of the full training data should agree
print X_train_all.shape, Y_train_all.shape

(3086, 191855) (3086,)


In [67]:
# random 70/30 train/validation split of the full training data; the fixed
# random_state makes the split reproducible. This re-binds X_train and Y_train
# to the 70% training portion.
X_train, X_valid, Y_train, Y_valid = train_test_split(X_train_all, Y_train_all, train_size=0.7, random_state=1004)

<div class="alert alert-info">
<p><strong>DATAFRAMES FOR PREDICTION</strong></p>
</div>

In [None]:
# modify with your favorite function!
chosen_fteng_fn = call_feats

**Create features for final predictions**  
We do this first to make sure that the call set is populated based on all the possible data.

In [None]:
# extract data for entire training set
X_train_all, Y_train_all, train_ids_all = create_data_matrix(
    fteng_fn=chosen_fteng_fn, direc=TRAIN_DIR, start_index=0, end_index=train_files)

# extract data for test set
X_test, Y_test, test_ids = create_data_matrix(
    fteng_fn=chosen_fteng_fn, direc=TEST_DIR, start_index=0, end_index=test_files)

**Create features for model tuning**

In [None]:
# extract data for training set
X_train, Y_train, train_ids = create_data_matrix(
    fteng_fn=chosen_fteng_fn, direc=TRAIN_DIR, start_index=0, end_index=valid_files)

# extract data for validation set
X_valid, Y_valid, valid_ids = create_data_matrix(
    fteng_fn=chosen_fteng_fn, direc=TRAIN_DIR, start_index=valid_files, end_index=train_files)

In [None]:
# visualize distribution of categories in training set
plt.hist(Y_train, normed=True, width=0.8, bins=15)
plt.xlim(xmin=-1, xmax=15)
plt.xlabel('Classification Label')
plt.ylabel('Frequency')
# plt.title('Distribution of Malaware Classifications - Training Set')
plt.show()

In [None]:
# visualize distribution of categories in validation set
plt.hist(Y_valid, normed=True, width=0.8, bins=15)
plt.xlim(xmin=-1, xmax=15)
plt.xlabel('Classification Label')
plt.ylabel('Frequency')
# plt.title('Distribution of Malaware Classifications - Validation Set')
plt.show()

<div class="alert alert-info">
<p><strong>PREDICTION MODELS</strong></p>
</div>

<div class="alert alert-danger">
<p><strong>Warning: the models below are set up to carry out a grid search over model parameters. Make sure you check the lists of parameters that are being passed before running!</strong></p>
</div>

**Logistic regression**

In [82]:
# Baseline logistic regression: tune on the training split, report validation
# accuracy, refit on the full training data and write test-set predictions.
# The recorded run of this cell took roughly 17 minutes.
start = time.time()

# update model class and/or parameters to search over here
model = LogisticRegression()
penalties = ['l1', 'l2'] # only 2 options
cs = [1.0]
# cs = [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0] # default = 1.0

# cross-validation on training set to identify optimal parameters
# (grid = 2 penalties x 1 value of C)
model = grid_search(model, {'penalty': penalties, 'C': cs}, X_train, Y_train)

# fit model on training set with optimal parameters
# check out-of-sample performance using validation set
model, train_acc, test_acc = fit_model(model, X_train, Y_train, X_valid, Y_valid)

# fit model on entire training set with optimal parameters and make predictions
# NOTE(review): this call re-binds train_acc / test_acc, so the validation figures
# from the call above are no longer available in variables afterwards
model, train_acc, test_acc = fit_model(model, X_train_all, Y_train_all, None, None, use_test=False)
Y_test_pred = model.predict(X_test)
# NOTE(review): test_ids is only assigned by the create_data_matrix cell for TEST_DIR;
# it must already be in the kernel, in the same row order as X_test
write_predictions(Y_test_pred, test_ids, 'results/baseline_logistic.csv')

print '%0.1f seconds runtime' % (time.time() - start)

# option to save fitted model
# joblib.dump(model, 'models/baseline_logistic.pkl');

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
----------
Training set accuracy = 0.9755
Validation set accuracy = 0.8715
----------
1020.1 seconds runtime


**Random forest classifier**

In [77]:
# Baseline random forest: tune on the training split, report validation accuracy,
# refit on the full training data and write test-set predictions.
# No random_state is set on the model, so results vary from run to run.
start = time.time()

# update model class and/or parameters to search over here
model = RandomForestClassifier()
estimators = [10] # default = 10
features = ['sqrt'] # default = 'sqrt'
depths = [None] # default = None (i.e. ignored)
samples = [1] # default = 1

# cross-validation on training set to identify optimal parameters
# (every list above has a single entry, so the grid holds one combination)
model = grid_search(model, {'n_estimators': estimators, 'max_features': features,
                            'max_depth': depths, 'min_samples_leaf': samples}, X_train, Y_train)

# fit model on training set with optimal parameters
# check out-of-sample performance using validation set
model, train_acc, test_acc = fit_model(model, X_train, Y_train, X_valid, Y_valid)

# fit model on entire training set with optimal parameters and make predictions
# NOTE(review): this call re-binds train_acc / test_acc, so the validation figures
# from the call above are no longer available in variables afterwards
model, train_acc, test_acc = fit_model(model, X_train_all, Y_train_all, None, None, use_test=False)
Y_test_pred = model.predict(X_test)
# NOTE(review): test_ids is only assigned by the create_data_matrix cell for TEST_DIR;
# it must already be in the kernel, in the same row order as X_test
write_predictions(Y_test_pred, test_ids, 'results/baseline_random_forest.csv')

print '%0.1f seconds runtime' % (time.time() - start)

# option to save fitted model
# joblib.dump(model, 'models/baseline_random_forest.pkl');

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
----------
Training set accuracy = 0.9889
Validation set accuracy = 0.8931
----------
90.1 seconds runtime


In [78]:
# read back the random forest predictions written by the cell above as a sanity check
preds = pd.read_csv("results/baseline_random_forest.csv")

In [79]:
# show only the first rows - the full frame has one row per test file
preds.head(10)

Unnamed: 0,Id,Prediction
0,0015c8c9ff02fea9d0f45692b9eebfb4abff4e42f,8
1,001f298a534ae4b0db7f2707169250aa215c3b5f2,10
2,001f5fdaaa8bbe20303527198d09a30bb7ca3eb50,8
3,002ca2c41b649f85c05ae30013436781a932fecc6,0
4,003e109543b4ea22d2bcc1ec309bf2fd34e9a1a1d,8
5,004070b468d6bb29848c76cfCd5887849c7bb648d,10
6,00461dd05c981edde167a5947c365472141e04bb1,5
7,005b95d2520C8621171566f5803437b0c443778e1,8
8,0071a3b818ed06d3865a24fdb31d4147c67fabfc5,12
9,007436715ec13cedd38344772a2144a3d79f3ea68,8


In [80]:
# row count should equal the number of test files (rows of X_test)
preds.shape

(3724, 2)

**Extra trees classifier**

In [90]:
# Extra trees baseline: tune on the training split, report validation accuracy,
# refit on the full training data and write test-set predictions.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so the
# predictions file below may not have been produced by this version of the cell.
start = time.time()

# update model class and/or parameters to search over here
model = ExtraTreesClassifier()
estimators = [20] # default = 10
features = ['log2', 'sqrt', 0.5, 0.7] # default = 'sqrt'
depths = [None] # default = None (i.e. ignored)
samples = [1] # default = 1
criterion = ['gini', 'entropy'] # default = 'gini'
njobs = [-1]  # default = 1, -1 is the number of cores

# cross-validation on training set to identify optimal parameters
# (grid = 4 max_features settings x 2 criteria = 8 combinations)
model = grid_search(model, {'n_estimators': estimators, 'max_features': features,
                            'max_depth': depths, 'min_samples_leaf': samples, 
                            'criterion': criterion, 'n_jobs': njobs}, X_train, Y_train)

# fit model on training set with optimal parameters
# check out-of-sample performance using validation set
model, train_acc, test_acc = fit_model(model, X_train, Y_train, X_valid, Y_valid)

# fit model on entire training set with optimal parameters and make predictions
# NOTE(review): this call re-binds train_acc / test_acc, so the validation figures
# from the call above are no longer available in variables afterwards
model, train_acc, test_acc = fit_model(model, X_train_all, Y_train_all, None, None, use_test=False)
Y_test_pred = model.predict(X_test)
# NOTE(review): test_ids is only assigned by the create_data_matrix cell for TEST_DIR;
# it must already be in the kernel, in the same row order as X_test
write_predictions(Y_test_pred, test_ids, 'results/baseline_extra_trees_2.csv')

print '%0.1f seconds runtime' % (time.time() - start)

# option to save fitted model
# joblib.dump(model, 'models/baseline_extra_trees.pkl');

KeyboardInterrupt: 

**AdaBoost classifier**

In [84]:
# AdaBoost baseline: tune on the training split, report validation accuracy,
# refit on the full training data and write test-set predictions.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so the
# predictions file below may not have been produced by this version of the cell.
start = time.time()

# update model class and/or parameters to search over here
model = AdaBoostClassifier()
estimators = [50] # default = 50
rates = [1.0] # default = 1.0

# cross-validation on training set to identify optimal parameters
# (both lists have a single entry, so the grid holds one combination)
model = grid_search(model, {'n_estimators': estimators, 'learning_rate': rates}, X_train, Y_train)

# fit model on training set with optimal parameters
# check out-of-sample performance using validation set
model, train_acc, test_acc = fit_model(model, X_train, Y_train, X_valid, Y_valid)

# fit model on entire training set with optimal parameters and make predictions
# NOTE(review): this call re-binds train_acc / test_acc, so the validation figures
# from the call above are no longer available in variables afterwards
model, train_acc, test_acc = fit_model(model, X_train_all, Y_train_all, None, None, use_test=False)
Y_test_pred = model.predict(X_test)
# NOTE(review): test_ids is only assigned by the create_data_matrix cell for TEST_DIR;
# it must already be in the kernel, in the same row order as X_test
write_predictions(Y_test_pred, test_ids, 'results/baseline_adaboost.csv')

print '%0.1f seconds runtime' % (time.time() - start)

# option to save fitted model
# joblib.dump(model, 'models/baseline_adaboost.pkl');

KeyboardInterrupt: 

### Fine tuning (extra trees)

In [None]:
# Extra trees fine tuning: a wider search than the baseline cell, followed by the
# same validate / refit-on-everything / write-predictions sequence.
# NOTE(review): the output recorded under this cell shows n_estimators=10 and a
# class_weight dict, neither of which this code passes - that output appears to come
# from an earlier version of the cell.
start = time.time()

# update model class and/or parameters to search over here
model = ExtraTreesClassifier()
estimators = [50] # [20] # default = 10
features = ['sqrt', 0.5] # [0.5] # default = 'sqrt', ['log2', 'sqrt', 0.5, 0.7]
depths = [None] # default = None (i.e. ignored)
samples = [1] # default = 1
criterion = ['gini', 'entropy'] # default = 'gini', other option = 'entropy'
# optional extras, currently disabled: parallel jobs and per-class weights
# njobs = [1]  # default = 1, -1 is the number of cores
# weights = [{0: 0.27, 1:0.61, 2:0.83, 3:0.97, 4:0.75, 5:0.79, 6:0.58, 
#           7:0.75, 8:0.02, 9:1.47, 10:0.057, 11:0.96, 12:0.08, 13:0.52, 
#           14:0.77}]
# , 'n_jobs': njobs, 'class_weight': weights

# cross-validation on training set to identify optimal parameters
# (grid = 2 max_features settings x 2 criteria = 4 combinations)
model = grid_search(model, {'n_estimators': estimators, 'max_features': features,
                            'max_depth': depths, 'min_samples_leaf': samples, 
                            'criterion': criterion}, X_train, Y_train)

# fit model on training set with optimal parameters
# check out-of-sample performance using validation set
model, train_acc, test_acc = fit_model(model, X_train, Y_train, X_valid, Y_valid)

# fit model on entire training set with optimal parameters and make predictions
# NOTE(review): this call re-binds train_acc / test_acc, so the validation figures
# from the call above are no longer available in variables afterwards
model, train_acc, test_acc = fit_model(model, X_train_all, Y_train_all, None, None, use_test=False)
Y_test_pred = model.predict(X_test)
# NOTE(review): test_ids is only assigned by the create_data_matrix cell for TEST_DIR;
# it must already be in the kernel, in the same row order as X_test
write_predictions(Y_test_pred, test_ids, 'results/50tuning.csv')

print '%0.1f seconds runtime' % (time.time() - start)

class_weight: weight given

ExtraTreesClassifier(bootstrap=False,
           class_weight={0: 0.27, 1: 0.61, 2: 0.83, 3: 0.97, 4: 0.75, 5: 0.79, 6: 0.58, 7: 0.75, 8: 0.02, 9: 1.47, 10: 0.057, 11: 0.96, 12: 0.08, 13: 0.52, 14: 0.77},
           criterion='gini', max_depth=None, max_features='sqrt',
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
----------
Training set accuracy = 0.9806
Validation set accuracy = 0.8866
----------
139.5 seconds runtime