In [6]:
import os
import pandas as pd
import numpy as np
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
from scipy import sparse
import util

In [7]:
TRAIN_DIR = "../data/train"

In [8]:
call_set = set([])

In [9]:
def add_to_set(tree):
    for el in tree.iter():
        call = el.tag
        call_set.add(call)

In [10]:
#creating a set of features counting the number of tags
def call_feats(tree, good_calls):
    #Inputs
    #tree - tree object for every file
    #good_calls - list of tags for which we create the features
    call_counter = {}
    for el in tree.iter():
        call = el.tag
        if call not in call_counter:
            call_counter[call] = 0
        else:
            call_counter[call] += 1

    call_feat_array = np.zeros(len(good_calls))
    for i in range(len(good_calls)):
        call = good_calls[i]
        call_feat_array[i] = 0
        if call in call_counter:
            call_feat_array[i] = call_counter[call]
    return call_feat_array

In [11]:
###Creating function for loading data
def create_matrix(start_index, end_index, tags, direc="../data/train"):
    X = None
    classes = []
    ids = []
    i = -1
    for datafile in os.listdir(direc):
        if datafile == '.DS_Store':
            continue
            
        i += 1
        if i < start_index:
            continue
        if i >= end_index:
            break
        id_str, clas = datafile.split('.')[:2]
        ids.append(id_str)
        #adding target class to training data
        try:
            classes.append(util.malware_classes.index(clas))
        except ValueError:
            assert clas == "X"
            classes.append(-1)
            
        #parse file as an xml document
        tree = ET.parse(os.path.join(direc,datafile))
        add_to_set(tree)
        this_row = call_feats(tree, tags)
        if X is None:
            X = this_row 
        else:
            X = np.vstack((X, this_row))
            
    return X, np.array(classes), ids    

#### Data Exploration

In [12]:
#Data exploration
#Loading the files
fileList = os.listdir(TRAIN_DIR)
numFiles = len(fileList)

In [13]:
#List of unique tags
tags = []
for idx in range(numFiles):
    tree = ET.parse(os.path.join(TRAIN_DIR,fileList[idx]))
    for el in tree.iter():
        call = el.tag
        tags.append(call)
    tags = list(np.unique(tags))
unique_tags = np.unique(tags)

In [14]:
#Converting all tags to 'str' from  numpy.string_
unique_tags = [str(tag) for tag in unique_tags]

## After looking through the log files manually, a few ideas for features came to me.  I add these below:

In [15]:
# I noticed that all the Swizzor files have this really characteristic dump line of evenly spaced characters.  I
#   tried taking the average length of each dump line.  
dump_line_len = np.zeros(len(fileList))
for idx in range(numFiles):
    dump_lens=[]
    tree = ET.parse(os.path.join(TRAIN_DIR,fileList[idx]))
    for el in tree.iter():
        call = el.tag
        if call == 'dump_line':
            dump_lens.append(len(el.items()[1][1]))
    dump_lens_mean = np.mean(dump_lens)
    if np.isnan(dump_lens_mean): 
        dump_line_len[idx]=0
    else:
        dump_line_len[idx]=dump_lens_mean
Mean_Dump_Line_Length = pd.Series(dump_line_len,name="Mean_Dump_Line_Length")



In [16]:
## Total amount of time spent sleeping in each file
total_sleep_time = np.zeros(len(fileList))
for idx in range(numFiles):
    sleep_sum=[]
    tree = ET.parse(os.path.join(TRAIN_DIR,fileList[idx]))
    for el in tree.iter():
        call = el.tag
        if call == 'sleep':
            sleep_sum.append(np.int(el.items()[0][1]))
    total_sleep_time[idx]= np.sum(sleep_sum)
    
Total_Sleep_Time = pd.Series(total_sleep_time,name="Total_Sleep_Time")

In [17]:
def time_parser(time):
    ## Returns time in seconds from a time string
    ## Takes in a time in the format 'mm:ss.ddd' and returns a float of seconds
    ## ex: time_parser('02:13.250') = 133.25
    minutes = float(time[0:2])
    seconds = float(time[-6:])
    return 60*minutes+seconds

In [18]:
## Total run-time for each file.  
total_run_time = np.zeros(numFiles)
for idx in range(numFiles):
    runtimes=[]
    tree = ET.parse(os.path.join(TRAIN_DIR,fileList[idx]))
    for el in tree.iter():
        end=0
        start=0
        call = el.tag
        if call == 'process':
            for item in el.items():
                if item[0]=='starttime':
                    start=time_parser(item[1])
                if item[0]=='terminationtime':
                    end=time_parser(item[1])
        runtimes.append(end - start)
    total_run_time[idx] = sum(runtimes)
Total_Run_Time = pd.Series(total_run_time,name="Total_Run_Time")           
            

In [24]:
## Total run-time for each file.  
# total_run_time = np.zeros(numFiles)
num_failed_createprocesses = np.zeros(numFiles)
mean_failed_createprocesses = np.zeros(numFiles)
for idx in range(numFiles):
    runtimes=[]
    tree = ET.parse(os.path.join(TRAIN_DIR,fileList[idx]))
    successfuls=[]
    for el in tree.iter():
        end=0
        start=0
        call = el.tag
        if call == 'create_process':
            for item in el.items():
                if item[0]=='successful': 
                    successfuls.append(int(item[1]))
    if not np.isnan(np.mean(successfuls)):
        mean_failed_createprocesses[idx] = np.mean(successfuls)
    else: mean_failed_createprocesses[idx]=0
    num_failed_createprocesses[idx] = sum(np.array(successfuls)==0)
                    

Mean_Failed_CreateProcesses = pd.Series(mean_failed_createprocesses,name="Mean_Failed_Createprocesses")  
Num_Failed_CreateProcesses = pd.Series(num_failed_createprocesses,name="Num_Failed_Createprocesses")
            

## I'll put these three features onto the DataFrame

In [25]:
features_df = pd.read_csv('../outputs/features_v1.csv')

In [26]:
features_df['Total_Run_Time']=Total_Run_Time

In [27]:
features_df['Total_Sleep_Time']=Total_Sleep_Time

In [28]:
features_df['Mean_Dump_Line_length'] =Mean_Dump_Line_Length

In [29]:
features_df['Mean_Failed_CreateProcesses']=Mean_Failed_CreateProcesses
features_df['Num_Failed_CreateProcesses']=Num_Failed_CreateProcesses

## Now I save the dataframe with the new features

In [30]:
features_df.to_csv('../outputs/features_vJunge1.csv')

## Now I'll add the same features to the Test set

In [31]:
features_test_df = pd.read_csv('../outputs/features_test_v1.csv')

In [32]:
TEST_DIR = "../data/test"
testFileList = os.listdir(TEST_DIR)
numTestFiles = len(testFileList)

In [33]:
dump_line_len = np.zeros(numTestFiles)
for idx in range(numTestFiles):
    dump_lens=[]
    tree = ET.parse(os.path.join(TEST_DIR,testFileList[idx]))
    for el in tree.iter():
        call = el.tag
        if call == 'dump_line':
            dump_lens.append(len(el.items()[1][1]))
    dump_lens_mean = np.mean(dump_lens)
    if np.isnan(dump_lens_mean): 
        dump_line_len[idx]=0
    else:
        dump_line_len[idx]=dump_lens_mean
Mean_Dump_Line_Length = pd.Series(dump_line_len,name="Mean_Dump_Line_Length")

In [34]:
## Total amount of time spent sleeping in each file
total_sleep_time = np.zeros(numTestFiles)
for idx in range(numTestFiles):
    sleep_sum=[]
    tree = ET.parse(os.path.join(TEST_DIR,testFileList[idx]))
    for el in tree.iter():
        call = el.tag
        if call == 'sleep':
            sleep_sum.append(np.int(el.items()[0][1]))
    total_sleep_time[idx]= np.sum(sleep_sum)
    
Total_Sleep_Time = pd.Series(total_sleep_time,name="Total_Sleep_Time")

In [286]:
## Total run-time for each file.  
total_run_time = np.zeros(numTestFiles)
for idx in range(numTestFiles):
    runtimes=[]
    tree = ET.parse(os.path.join(TEST_DIR,testFileList[idx]))
    for el in tree.iter():
        end=0
        start=0
        call = el.tag
        if call == 'process':
            for item in el.items():
                if item[0]=='starttime':
                    start=time_parser(item[1])
                if item[0]=='terminationtime':
                    end=time_parser(item[1])
        runtimes.append(end - start)
    total_run_time[idx] = sum(runtimes)
Total_Run_Time = pd.Series(total_run_time,name="Total_Run_Time")           
      

In [35]:
## Total run-time for each file.  
# total_run_time = np.zeros(numFiles)
num_failed_createprocesses = np.zeros(numTestFiles)
mean_failed_createprocesses = np.zeros(numTestFiles)
for idx in range(numTestFiles):
    runtimes=[]
    tree = ET.parse(os.path.join(TEST_DIR,testFileList[idx]))
    successfuls=[]
    for el in tree.iter():
        end=0
        start=0
        call = el.tag
        if call == 'create_process':
            for item in el.items():
                if item[0]=='successful': 
                    successfuls.append(int(item[1]))
    if not np.isnan(np.mean(successfuls)):
        mean_failed_createprocesses[idx] = np.mean(successfuls)
    else: mean_failed_createprocesses[idx]=0
    num_failed_createprocesses[idx] = sum(np.array(successfuls)==0)
                    

Mean_Failed_CreateProcesses = pd.Series(mean_failed_createprocesses,name="Mean_Failed_Createprocesses")  
Num_Failed_CreateProcesses = pd.Series(num_failed_createprocesses,name="Num_Failed_Createprocesses")
            

In [289]:
features_test_df['Num_Failed_CreateProcesses']=Num_Failed_CreateProcesses
features_test_df['Mean_Failed_CreateProcesses']=Mean_Failed_CreateProcesses
features_test_df['Mean_Dump_Line_Length']=Mean_Dump_Line_Length
features_test_df['Total_Run_Time']=Total_Run_Time
features_test_df['Total_Sleep_Time']=Total_Sleep_Time

In [36]:
features_test_df.to_csv('../outputs/features_test_vJunge1.csv')

## Below is some exploration I did looking at the features class by class.  I did all this before generating the features above, and nothing really useful came of it.  

In [16]:
grouped = features_df.groupby('class')

In [21]:
feature_means = grouped.mean()

In [27]:
feature_means.head()

Unnamed: 0_level_0,Unnamed: 0,accept_socket,add_netjob,all_section,bind_socket,change_service_config,check_for_debugger,com_create_instance,com_createole_object,com_get_class_object,...,start_service,thread,trimmed_bytes,unload_driver,vm_allocate,vm_mapviewofsection,vm_protect,vm_read,vm_write,write_value
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1405.77193,0,0,12.596491,2.526316,0.008772,2.552632,28.947368,0,7.017544,...,0.210526,12.842105,39.105263,0,5.754386,0,1099.078947,48.438596,12.114035,5.833333
1,1493.28,0,0,9.24,4.04,0.0,0.9,8.76,0,0.18,...,0.0,9.34,4.04,0,6.28,0,138.7,0.02,6.0,0.48
2,1689.891892,0,0,16.297297,8.351351,0.0,2.810811,109.243243,0,5.810811,...,0.0,16.972973,89.351351,0,0.108108,0,295.756757,0.216216,0.0,0.027027
3,1701.21875,0,0,9.0,25.90625,0.0,0.1875,215.71875,0,7.9375,...,0.0,9.0,154.25,0,0.0,0,200.5,0.0,0.0,0.0
4,1633.878049,0,0,4.487805,1.268293,0.0,0.853659,9.04878,0,0.0,...,0.0,4.756098,4.707317,0,0.0,0,44.0,0.0,0.073171,0.0


In [39]:
# Helper function to tell us if the means of any feature values has only 2 unique values, in other words
#    are all of the values 0 except for one class.  
def has_unique(series):
    return len(np.unique(series))==2

In [44]:
unique_features = []
for i in unique_tags: 
    if has_unique(feature_means[i]): unique_features.append(feature_means[i].name)

In [45]:
unique_features

['accept_socket',
 'create_interface',
 'create_process_as_user',
 'delete_share',
 'get_userinfo',
 'unload_driver',
 'vm_mapviewofsection']

In [46]:
feature_means[unique_features]

Unnamed: 0_level_0,accept_socket,create_interface,create_process_as_user,delete_share,get_userinfo,unload_driver,vm_mapviewofsection
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.051282,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.006837,0.000622,0.0,0.0,0.000622,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0


So, if any of the data contains one of these features at all, we can automatically assign it to the class that is the only one that every has that feature.  Unfortunately, most of these features have very low average values, so they don't occur frequently in the data.

In [47]:
feature_means

Unnamed: 0_level_0,Unnamed: 0,accept_socket,add_netjob,all_section,bind_socket,change_service_config,check_for_debugger,com_create_instance,com_createole_object,com_get_class_object,...,start_service,thread,trimmed_bytes,unload_driver,vm_allocate,vm_mapviewofsection,vm_protect,vm_read,vm_write,write_value
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1405.77193,0.0,0.0,12.596491,2.526316,0.008772,2.552632,28.947368,0,7.017544,...,0.210526,12.842105,39.105263,0.0,5.754386,0.0,1099.078947,48.438596,12.114035,5.833333
1,1493.28,0.0,0.0,9.24,4.04,0.0,0.9,8.76,0,0.18,...,0.0,9.34,4.04,0.0,6.28,0.0,138.7,0.02,6.0,0.48
2,1689.891892,0.0,0.0,16.297297,8.351351,0.0,2.810811,109.243243,0,5.810811,...,0.0,16.972973,89.351351,0.0,0.108108,0.0,295.756757,0.216216,0.0,0.027027
3,1701.21875,0.0,0.0,9.0,25.90625,0.0,0.1875,215.71875,0,7.9375,...,0.0,9.0,154.25,0.0,0.0,0.0,200.5,0.0,0.0,0.0
4,1633.878049,0.0,0.0,4.487805,1.268293,0.0,0.853659,9.04878,0,0.0,...,0.0,4.756098,4.707317,0.0,0.0,0.0,44.0,0.0,0.073171,0.0
5,1686.128205,0.0,0.0,7.589744,2.974359,0.0,0.948718,34.384615,0,2.384615,...,0.102564,7.641026,32.487179,0.051282,1.410256,0.0,124.410256,0.025641,1.051282,0.0
6,1644.566038,0.0,0.0,35.490566,12.867925,0.0,2.622642,263.509434,0,173.660377,...,0.0,35.490566,453.075472,0.0,0.056604,0.0,236.90566,0.0,0.018868,5.867925
7,1373.195122,0.0,0.0,4.219512,0.02439,0.02439,0.02439,1.97561,0,0.04878,...,0.04878,4.219512,0.0,0.0,0.0,0.0,58.365854,0.0,0.0,0.0
8,1537.185208,0.006837,0.001243,3.698571,0.303294,0.002486,0.865755,6.969546,0,1.162834,...,0.006215,3.712865,10.543816,0.0,0.060907,0.0,31.362958,0.039776,0.054692,1.555003
9,1564.285714,0.0,0.0,2.761905,0.0,0.0,0.619048,1.190476,0,0.333333,...,0.0,2.761905,0.0,0.0,59.380952,0.0,106.666667,0.0,34.952381,0.0


In [None]:
## Below is a closer examination of some of the existing features

In [117]:
grouped['dump_line'].max()

class
0     13976
1      2345
2     22092
3     14312
4      2363
5      9965
6     22106
7         1
8     57669
9         2
10     3376
11     2466
12     1226
13       53
14      761
Name: dump_line, dtype: float64

In [128]:
grouped['dump_line'].apply(lambda x: np.percentile(x,33))

class
0        0.00
1        0.00
2        0.00
3       38.36
4        0.00
5        0.00
6     1525.36
7        0.00
8        0.00
9        0.00
10      30.00
11       0.00
12       0.00
13       0.00
14       0.00
Name: dump_line, dtype: float64

In [106]:
feature_means['dump_line']

class
0      664.833333
1      105.620000
2     1490.810811
3     2611.406250
4       82.634146
5      587.256410
6     7521.471698
7        0.024390
8      174.270976
9        0.095238
10     893.832103
11     158.812500
12       5.489362
13       2.033898
14      93.425000
Name: dump_line, dtype: float64

In [107]:
feature_means['sleep']

class
0      355.157895
1      100.980000
2      962.837838
3      100.187500
4       10.341463
5     1789.435897
6      201.150943
7        5.731707
8       61.876321
9        5.761905
10      11.457565
11       6.468750
12      26.385638
13       0.762712
14     419.875000
Name: sleep, dtype: float64

In [108]:
feature_means['create_process']

class
0      2.456140
1      9.360000
2      2.945946
3      0.281250
4      0.658537
5      1.307692
6      2.471698
7      0.073171
8      0.205718
9      0.523810
10    12.354244
11     0.500000
12    24.409574
13     0.271186
14     0.075000
Name: create_process, dtype: float64

In [109]:
feature_means['query_value']

class
0      447.631579
1       62.640000
2     1175.891892
3      670.875000
4       66.951220
5      503.846154
6     1987.886792
7      209.878049
8      206.079553
9      113.666667
10     319.476015
11      38.218750
12      70.936170
13     110.559322
14     220.000000
Name: query_value, dtype: float64

In [111]:
feature_means['delete_file']

class
0      49.412281
1       0.140000
2       4.756757
3       0.312500
4       0.707317
5       1.717949
6      18.641509
7       1.829268
8      20.950901
9       0.476190
10    173.225092
11      1.937500
12      1.625000
13      0.474576
14      1.250000
Name: delete_file, dtype: float64

In [249]:
grouped.count().id

class
0      114
1       50
2       37
3       32
4       41
5       39
6       53
7       41
8     1609
9       21
10     542
11      32
12     376
13      59
14      40
Name: id, dtype: int64

In [60]:
feature_stds = grouped.std()

In [61]:
feature_stds

Unnamed: 0_level_0,Unnamed: 0,accept_socket,add_netjob,all_section,bind_socket,change_service_config,check_for_debugger,com_create_instance,com_createole_object,com_get_class_object,...,start_service,thread,trimmed_bytes,unload_driver,vm_allocate,vm_mapviewofsection,vm_protect,vm_read,vm_write,write_value
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,850.033529,0.0,0.0,26.02576,10.320707,0.093659,8.364582,93.84169,0,34.023153,...,1.222747,26.40246,133.929473,0.0,24.681175,0.0,6934.397338,516.049202,63.46482,43.026009
1,961.077,0.0,0.0,5.981468,19.992815,0.0,0.614452,15.339106,0,1.272792,...,0.0,6.492099,19.992815,0.0,21.73499,0.0,523.380154,0.141421,19.892569,1.668924
2,913.579225,0.0,0.0,15.062582,19.327896,0.0,3.152053,155.400273,0,8.188196,...,0.0,15.919043,249.467345,0.0,0.657596,0.0,295.275333,1.315192,0.0,0.164399
3,914.245044,0.0,0.0,7.22451,41.399984,0.0,0.396558,351.038205,0,10.413817,...,0.0,7.22451,251.386991,0.0,0.0,0.0,102.165268,0.0,0.0,0.0
4,893.3023,0.0,0.0,4.000762,1.597254,0.0,1.108174,56.987697,0,0.0,...,0.0,4.247237,22.768667,0.0,0.0,0.0,54.890801,0.0,0.468521,0.0
5,904.066369,0.0,0.0,8.848782,9.99865,0.0,0.944478,54.214785,0,3.476352,...,0.446912,8.877788,125.915776,0.223456,5.204846,0.0,162.859274,0.160128,4.154571,0.0
6,887.419856,0.0,0.0,27.579691,11.293357,0.0,2.021294,201.110247,0,143.387713,...,0.0,27.579691,435.705699,0.0,0.412082,0.0,162.516894,0.0,0.137361,30.186623
7,925.899023,0.0,0.0,2.444506,0.156174,0.156174,0.156174,5.354847,0,0.218085,...,0.312348,2.444506,0.0,0.0,0.0,0.0,39.945435,0.0,0.0,0.0
8,888.198176,0.27423,0.04986,5.85133,3.943317,0.078821,1.526915,57.513493,0,28.162368,...,0.179721,5.976183,133.78664,0.0,1.224245,0.0,46.938909,1.002315,0.848318,22.708752
9,792.778658,0.0,0.0,4.526641,0.0,0.0,1.687489,3.5017,0,0.966092,...,0.0,4.526641,0.0,0.0,255.253889,0.0,393.308827,0.0,147.210895,0.0


In [66]:
features_df[features_df['class']==9]['vm_allocate']

283     1173
544        0
632        0
651        0
666        0
1009       0
1144       0
1246       0
1295       0
1356       0
1364      20
1743      17
1753       0
1925      14
2057      17
2086       0
2275       6
2479       0
2485       0
2856       0
3001       0
Name: vm_allocate, dtype: float64

I looked through several features that had mean values for a particular class that were very far from the mean of that feature for the data as a whole, and found that they were typically thrown off by a few extreme values for a particular data point.  

In [70]:
feature_means['dump_line']

class
0      664.833333
1      105.620000
2     1490.810811
3     2611.406250
4       82.634146
5      587.256410
6     7521.471698
7        0.024390
8      174.270976
9        0.095238
10     893.832103
11     158.812500
12       5.489362
13       2.033898
14      93.425000
Name: dump_line, dtype: float64

In [71]:
features_df['dump_line'][features_df['class']==6]

1        3433
104         0
192     10690
253         0
265         0
362     13568
410     16103
517     10578
606     19240
625      4387
827         0
877         0
878         0
895      3970
922      3467
1120    11683
1148    15004
1153     4354
1157    15891
1359     3580
1412    16665
1484     4459
1521        0
1549    13404
1563     6737
1592        0
1755        0
1770    18277
1817     9449
1855    15627
1876     4509
1932     4412
2031    11757
2061        0
2191        0
2213     3577
2234    10418
2258        0
2273    15451
2319    22106
2395        0
2470        0
2551        0
2593    16582
2659    11290
2716     1162
2806     7557
2827    11022
2851        0
2914    16315
2932        0
2999    22006
3072    19908
Name: dump_line, dtype: float64

In [76]:
feature_means.sum(axis=1)

class
0      5771.307018
1      3393.300000
2      9158.000000
3      7636.187500
4      2237.731707
5      9551.230769
6     18718.226415
7      2048.585366
8      2734.945308
9      2156.952381
10     4126.044280
11     2133.750000
12     2447.505319
13     2101.525424
14    10549.500000
dtype: float64

#### Random forest classifiers

In [37]:
features_df.head()

Unnamed: 0.1,Unnamed: 0,accept_socket,add_netjob,all_section,bind_socket,change_service_config,check_for_debugger,com_create_instance,com_createole_object,com_get_class_object,...,vm_read,vm_write,write_value,class,id,Total_Run_Time,Total_Sleep_Time,Mean_Dump_Line_length,Mean_Failed_CreateProcesses,Num_Failed_CreateProcesses
0,0,0,0,5,0,0,1,1,0,0,...,0,0,0,8,00269ea50001a6c699d0222032d45b74b2e7e8be9,16.623,252,0,1.0,0
1,1,0,0,40,5,0,3,338,0,207,...,0,0,2,6,00278ec420236020d6121dffe0cc20034422e7228,443.124,305540,48,0.75,1
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,12,002d5615d19c851934dc481c607b6a74a6e9e536e,2.515,0,0,0.0,0
3,3,0,0,1,0,0,0,0,0,0,...,0,0,0,8,006be5Dc265600c19728c9747fb4c7bc9e8d6f106,234.516,0,0,0.0,0
4,4,0,0,6,0,0,1,4,0,1,...,0,4,0,10,0089453df77890cae95ce7d9130a4ef85eaea36e8,237.125,60500,48,0.5,1


In [38]:
X_train = np.array(features_df.drop(['class','id'],axis=1))

In [39]:

t_train = np.array(features_df['class'])

In [40]:
from sklearn.ensemble import RandomForestClassifier

In [56]:
RF = RandomForestClassifier(n_estimators=50, oob_score=True, n_jobs=-1, min_samples_leaf = 1, warm_start = True)

In [57]:
RF.fit(X_train, t_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [58]:
print(np.max(RF.feature_importances_))
RF.feature_importances_
RF.oob_score_
#RF.n_features_

0.048733553605


0.89338950097213221

In [339]:
# Note that in the test set, the id's are mislabeled as 'class'
X_test = np.array(features_test_df.drop(['class'],axis=1))

In [343]:
test_ids = features_test_df['class']

In [340]:
pred = RF.predict(X_test)

In [355]:
out_df=pd.DataFrame(pred,columns=['Prediction'])

In [357]:
out_df['Id']=test_ids

In [361]:
out_df = out_df.set_index('Id')

In [350]:
# out_df = pd.DataFrame(test_ids, columns=['Id'])
# out_df['Prediction'] = pred
# out_df = out_df.set_index('Id')

In [362]:
out_df.to_csv('../outputs/RF_prediction_Junge1.csv')

In [376]:
## I submitted this and our score went down...