In [1]:
import os
import pandas as pd
import numpy as np
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
from scipy import sparse
import util

In [2]:
TRAIN_DIR = "../data/train"

In [3]:
call_set = set([])

In [4]:
def add_to_set(tree):
    for el in tree.iter():
        call = el.tag
        call_set.add(call)

In [5]:
#creating a set of features counting the number of tags
def call_feats(tree, good_calls):
    #Inputs
    #tree - tree object for every file
    #good_calls - list of tags for which we create the features
    call_counter = {}
    for el in tree.iter():
        call = el.tag
        if call not in call_counter:
            call_counter[call] = 0
        else:
            call_counter[call] += 1

    call_feat_array = np.zeros(len(good_calls))
    for i in range(len(good_calls)):
        call = good_calls[i]
        call_feat_array[i] = 0
        if call in call_counter:
            call_feat_array[i] = call_counter[call]
    return call_feat_array

In [6]:
###Creating function for loading data
def create_matrix(start_index, end_index, tags, direc="../data/train"):
    X = None
    classes = []
    ids = []
    i = -1
    for datafile in os.listdir(direc):
        if datafile == '.DS_Store':
            continue
            
        i += 1
        if i < start_index:
            continue
        if i >= end_index:
            break
        id_str, clas = datafile.split('.')[:2]
        ids.append(id_str)
        #adding target class to training data
        try:
            classes.append(util.malware_classes.index(clas))
        except ValueError:
            assert clas == "X"
            classes.append(-1)
            
        #parse file as an xml document
        tree = ET.parse(os.path.join(direc,datafile))
        add_to_set(tree)
        this_row = call_feats(tree, tags)
        if X is None:
            X = this_row 
        else:
            X = np.vstack((X, this_row))
            
    return X, np.array(classes), ids    

#### Data Exploration

In [13]:
#Data exploration
#Loading the files
fileList = os.listdir(TRAIN_DIR)
numFiles = len(fileList)

In [19]:
#List of unique tags
tags = []
for idx in range(numFiles):
    tree = ET.parse(os.path.join(TRAIN_DIR,fileList[idx]))
    for el in tree.iter():
        call = el.tag
        tags.append(call)
    tags = list(np.unique(tags))
unique_tags = np.unique(tags)

In [20]:
#Converting all tags to 'str' from  numpy.string_
unique_tags = [str(tag) for tag in unique_tags]

In [83]:
X_train, t_train, train_ids = create_matrix(0, numFiles,\
                                            unique_tags, TRAIN_DIR)

In [22]:
features_df = pd.DataFrame(X_train,columns=unique_tags)
features_df['class'] = t_train
features_df['id'] = train_ids
#Saving the features dataframe as a new file
features_df.to_csv('../outputs/features_v1.csv')

#### Cleaning and transforming the test data

In [126]:
TEST_DIR = "../data/test"
testFileList = os.listdir(TEST_DIR)
numTestFiles = len(testFileList)
X_test_real, t_test, test_ids = create_matrix(0, numTestFiles,\
                                            unique_tags, TEST_DIR)

In [125]:
#Ignoring t_train since there is no response variable 
features_test_df = pd.DataFrame(X_test,columns=unique_tags)
features_test_df['class'] = test_ids

In [25]:
features_test_df.head()

Unnamed: 0,accept_socket,add_netjob,all_section,bind_socket,change_service_config,check_for_debugger,com_create_instance,com_createole_object,com_get_class_object,connect,...,thread,trimmed_bytes,unload_driver,vm_allocate,vm_mapviewofsection,vm_protect,vm_read,vm_write,write_value,class
0,0,0,4,0,0,1,1,0,0,0,...,4,0,0,0,0,35,0,0,0,0015c8c9ff02fea9d0f45692b9eebfb4abff4e42f
1,0,0,5,1,0,1,0,0,0,0,...,5,0,0,7,0,254,0,4,0,001f298a534ae4b0db7f2707169250aa215c3b5f2
2,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,001f5fdaaa8bbe20303527198d09a30bb7ca3eb50
3,0,0,5,4,0,1,0,0,0,0,...,5,12,0,0,0,71,0,0,0,002ca2c41b649f85c05ae30013436781a932fecc6
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,003e109543b4ea22d2bcc1ec309bf2fd34e9a1a1d


In [26]:
features_test_df.to_csv('../outputs/features_test_v1.csv')

#### Random forest classifiers

In [127]:
from sklearn.ensemble import RandomForestClassifier

In [128]:
RF = RandomForestClassifier(n_estimators=40, oob_score=True, n_jobs=-1, 
                            min_samples_leaf = 1, warm_start = False, class_weight = 'balanced_subsample')

In [129]:
RF.fit(X_train1, Y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [118]:
print(np.max(RF.feature_importances_))
RF.feature_importances_
RF.oob_score_
#RF.n_features_

0.0616244584318


0.87936622254231189

In [119]:
pred = RF.predict(X_test)

In [120]:
print(accuracy_score(Y_test, pred))

0.873786407767


In [136]:
out_df = pd.DataFrame(test_ids, columns=['Id'])
out_df['Prediction'] = LR_pred
out_df = out_df.set_index('Id')

In [137]:
out_df.to_csv('../outputs/RF_prediction_V_LR.csv')

### Logistic Regression

In [130]:
from sklearn.cross_validation import train_test_split
from harness import RMSE, train_test
from sklearn.metrics import accuracy_score

In [131]:
X_train1, X_test_train, Y_train, Y_test = train_test_split(X_train, t_train, test_size=0.10, random_state=1)

In [132]:
from sklearn.linear_model import LogisticRegression

In [133]:
LR = LogisticRegression(penalty='l1', multi_class='ovr', 
                        n_jobs = -1, warm_start = True, solver = 'liblinear') 
LR.fit(X_train1, Y_train)
LR_pred = LR.predict(X_test_train)

In [92]:
LR.coef_ 

array([[ 0.        , -0.13040704,  0.00469179, ...,  0.1231976 ,
         0.0188992 ,  0.0110911 ],
       [ 0.        ,  0.        , -0.0093725 , ...,  0.        ,
         0.08925144, -0.00697102],
       [ 0.        ,  0.        , -0.80817745, ...,  0.        ,
        -2.2747892 , -0.57638944],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
        -0.05801849, -0.25976336],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         1.06828763, -0.73372729],
       [ 0.        ,  0.        , -0.31271708, ...,  0.        ,
        -0.20380483,  0.        ]])

In [93]:
LR.intercept_

array([-2.72103557, -4.37856669, -3.98507669, -4.11493566, -5.07022979,
       -2.1809449 , -3.6603954 , -2.20111445, -2.20274042, -1.24999822,
       -5.59980941,  0.        , -2.97100373, -0.64484034, -0.61372374])

In [134]:
print(accuracy_score(Y_test, LR_pred))


0.844660194175


LR = LogisticRegression(penalty='l1', multi_class='ovr', 
                        n_jobs = -1, warm_start = True, solver = 'liblinear')    

0.844660194175    

LR = LogisticRegression(penalty='l1', class_weight='balanced', multi_class='ovr', 
                        n_jobs = -1, warm_start = True, solver = 'liblinear')  
0.799352750809
LR = LogisticRegression(penalty='l2', class_weight='balanced', multi_class='ovr', 
                        n_jobs = -1, warm_start = True, solver = 'liblinear')    
0.637540453074   