In [30]:
import numpy as np
from sklearn import svm
from sklearn import datasets
from sklearn import metrics
from sklearn import model_selection
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.svm import NuSVC

import time

import pandas as pd
import os


def read_from_csv(s):
    """Load a CSV file from the local ``./DataSets/`` folder.

    Parameters
    ----------
    s : str
        File name relative to ``./DataSets/`` (for example ``'Malware.csv'``).

    Returns
    -------
    pandas.DataFrame
        The parsed file with every missing value replaced by 0.
    """
    # os.path.join with a single argument returns it unchanged; it is kept so
    # the resolved path stays exactly what it has always been.
    csv_location = os.path.join('./DataSets/' + s)
    return pd.read_csv(csv_location).fillna(0)


def transfer_to_TrainArray(df):
    """Return the feature matrix of ``df`` as a float NumPy array.

    Every column except the last one is treated as a feature; the last
    column (the class label) is left out.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set whose last column is the class label and whose remaining
        columns are convertible to ``float``.

    Returns
    -------
    numpy.ndarray
        ``float64`` array of shape ``(len(df), df.shape[1] - 1)``. It is a
        fresh copy, so writing to it never touches ``df``.

    Raises
    ------
    ValueError
        If a feature value cannot be converted to ``float``.
    """
    # One vectorised conversion instead of copying the frame row by row with
    # a Python-level ``iloc`` loop; copy=True keeps the result independent of
    # the DataFrame's internal buffers.
    return df.iloc[:, :-1].to_numpy(dtype=float, copy=True)


def checkInf(df):
    """Drop every row that has an infinite value in a feature column.

    All columns except the last one (the class label) are inspected. A row
    is removed when any of those cells equals ``+inf`` or ``-inf``. The
    shape of the frame is printed before and after the filtering.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set whose last column is the class label.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows. Original index labels are
        preserved (the index is not reset) and ``df`` itself is not modified.
    """
    print(df.shape)
    # Build the mask in one pass. Dropping rows while iterating by position
    # shifts every later position, so rows get skipped, the wrong label gets
    # dropped, and the loop eventually indexes past the end of the frame.
    infinities = [float('inf'), float('-inf')]
    has_inf = df.iloc[:, :-1].isin(infinities).any(axis=1).to_numpy()
    # Positional boolean mask: works for any index, including non-default or
    # duplicated labels.
    df = df.loc[~has_inf]
    print(df.shape)
    return df


def transfer_to_TestArray(df):
    """Encode the class column of ``df`` as a binary target vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set whose last column is the class label.

    Returns
    -------
    numpy.ndarray
        1-D ``float64`` array with one entry per row: ``1.0`` where the
        label is exactly ``'benign'`` and ``0.0`` for anything else.
    """
    # Only the last column (the class label) is read. One vectorised
    # comparison replaces the per-row ``iloc`` lookup.
    is_benign = df.iloc[:, -1] == 'benign'
    return is_benign.to_numpy(dtype=float)

def fittime_precision_recall(classifier, trainx, trainy, xtest, ytest):
    """Fit ``classifier`` and report fit time, precision and miss rate.

    Parameters
    ----------
    classifier : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    trainx, trainy
        Training features and labels passed to ``fit``.
    xtest, ytest
        Held-out features and their true labels.

    Returns
    -------
    tuple
        ``(fit_seconds, precision, miss_rate)`` where ``fit_seconds`` is the
        elapsed time of the ``fit`` call only, ``precision`` is computed for
        the positive class (label ``1``) and ``miss_rate`` is
        ``1 - recall`` for that same class.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way time.time() can.
    start = time.perf_counter()
    classifier.fit(trainx, trainy)
    fit_seconds = time.perf_counter() - start

    # Predict once and reuse the result for both metrics; inference can be
    # expensive, so running it twice doubles the evaluation cost.
    predictions = classifier.predict(xtest)
    precision = precision_score(ytest, predictions, pos_label=1)
    miss_rate = 1 - recall_score(ytest, predictions, pos_label=1)
    return fit_seconds, precision, miss_rate

# Load the malware data set and run it through checkInf.
test_df = checkInf(read_from_csv('Malware.csv'))

# Features come from every column but the last; labels from the last column.
trainSet = transfer_to_TrainArray(test_df)
testSet = transfer_to_TestArray(test_df)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    trainSet, testSet, test_size=0.2, random_state=1
)

(14493, 80)
(14492, 80)


In [38]:
# Candidate models; constructor arguments are unchanged.
adaClassifier = AdaBoostClassifier(algorithm='SAMME.R', n_estimators=200)
rfClassifier = RandomForestClassifier(max_features='sqrt', oob_score=True, random_state=100)
svcClassifier = make_pipeline(StandardScaler(), SVC(gamma='auto'))
# NOTE(review): gamma presumably has no effect with kernel='linear', and this
# model is the only SVM here without a StandardScaler step — confirm both.
linearsvmClassifier = svm.SVC(C=0.1, kernel='linear', gamma=0.1, decision_function_shape='ovo')
nusvcClassifier = make_pipeline(StandardScaler(), NuSVC(decision_function_shape='ovo'))

# One printed tuple per model, in declaration order:
# (fit seconds, precision, miss rate) on the held-out split.
for _model in (adaClassifier, rfClassifier, svcClassifier, linearsvmClassifier, nusvcClassifier):
    print(fittime_precision_recall(_model, x_train, y_train, x_test, y_test))

(5.407882213592529, 0.9776643267389917, 0.009696186166774368)
(1.499380350112915, 0.9967616580310881, 0.005171299288946329)
(2.773630380630493, 0.9688295165394402, 0.015513897866838988)
(27.072172164916992, 0.8757539203860072, 0.061409179056237884)
(12.486751556396484, 0.8522792844777842, 0.04524886877828049)
