In [1]:
import warnings
warnings.filterwarnings("ignore")  # "error", "ignore", "always", "default", "module" or "once"

In [2]:
import numpy as np
import pandas as pd
import random
import os
#from imutils import paths
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
from sklearn.metrics import f1_score as f1

from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from xgboost.sklearn import XGBClassifier

import autokeras as ak
import keras
from keras.models import Sequential
from keras.layers import Dense
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Input, ReLU, Dropout
from tensorflow.keras.models import Model

In [3]:
#declear path to your data
drebin_data_path = 'data\drebin.csv'
columns = list(pd.read_csv('data\dataset-features-categories.csv', header = None).iloc[:,0])
# Importing the dataset
Drebin_data = pd.read_csv(drebin_data_path, names = columns)

X = Drebin_data.iloc[:,range(0,Drebin_data.shape[1]-1)].values
y = Drebin_data.iloc[:, -1].values

lb = LabelBinarizer()
y = lb.fit_transform(y)

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y.astype(int), test_size = 0.3, random_state = 0)

# Feature Scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Train-Test

In [4]:
classifiers = {"SVM": SVC(kernel = 'linear', degree=3), "MLP": MLPClassifier(random_state=1, max_iter=300), "XGB": XGBClassifier(), "LGBM": LGBMClassifier()} 

for classifier_pair in classifiers.items():
    print("---------------------------")
    print(classifier_pair[0])
    
    classifier = classifier_pair[1]
    classifier.fit(X_train, y_train)
    
    # Predicting the Test set results
    y_pred = classifier.predict(X_test)

    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    print('Confusion Matrix', cm)

    #compute accuracy_score
    accuracy = acc(y_test, y_pred)
    print('accuracy', accuracy)

    #compute precision score
    precision_score = precision(y_test, y_pred, average='micro')
    print('precision', precision_score)

    #compute recall score
    recall_score = recall(y_test, y_pred)
    print('recall', recall_score)

    #compute f1 score
    f1_score = f1(y_test, y_pred)
    print('f1', f1_score)
    
print("---------------------------")

---------------------------
SVM
Confusion Matrix [[2851   47]
 [  45 1568]]
accuracy 0.9796054090002216
precision 0.9796054090002216
recall 0.972101673899566
f1 0.9714993804213135
---------------------------
MLP
Confusion Matrix [[2880   18]
 [  29 1584]]
accuracy 0.9895810241631567
precision 0.9895810241631567
recall 0.9820210787352759
f1 0.9853810264385693
---------------------------
XGB
Confusion Matrix [[2882   16]
 [  27 1586]]
accuracy 0.9904677455109732
precision 0.9904677455109732
recall 0.9832610043397396
f1 0.9866251944012442
---------------------------
LGBM
Confusion Matrix [[2884   14]
 [  29 1584]]
accuracy 0.9904677455109732
precision 0.9904677455109732
recall 0.9820210787352759
f1 0.9866085331672375
---------------------------


# AutoKeras Model

In [5]:
saved_model=keras.models.load_model('structured_data_classifier\best_model', compile=True)
print(saved_model.summary())

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 215)]             0         
                                                                 
 multi_category_encoding (Mu  (None, 215)              0         
 ltiCategoryEncoding)                                            
                                                                 
 normalization (Normalizatio  (None, 215)              431       
 n)                                                              
                                                                 
 dense (Dense)               (None, 256)               55296     
                                                                 
 re_lu (ReLU)                (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 32)                8224  

In [6]:
def autoKerasModel(path_best_model):
    saved_model = keras.models.load_model(path_best_model, compile=True)
    input_layer = Input(shape=(215,))
    x = saved_model.layers[1](input_layer)
    x = saved_model.layers[2](x)
    x = Dense(units=256)(x)
    x = ReLU()(x)
    x = Dense(units=32)(x)
    x = ReLU()(x)
    x = Dense(units=1)(x)
    x = saved_model.layers[-1](x)
    new_model = Model(inputs=input_layer, outputs=x)
    return new_model

In [7]:
autokeras_model=autoKerasModel('structured_data_classifier/best_model')
autokeras_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

autokeras_model.fit(X_train, y_train, epochs=15)
results = autokeras_model.evaluate(X_test, y_test)

print("test loss, test acc:", results)
print(autokeras_model.summary())

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
test loss, test acc: [0.08261679857969284, 0.9884726405143738]
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 215)]             0         
                                                                 
 multi_category_encoding (Mu  (None, 215)              0         
 ltiCategoryEncoding)                                            
                                                                 
 normalization (Normalizatio  (None, 215)              431       
 n)                                                              
                                                                 
 dense (Dense)               (None, 256)               55296     
                         

# Ensemble Learning - Majority Voting

In [8]:
def majority_voting(classifiers, autokeras_model):
    y_preds = np.ndarray(shape=(5,len(y_test)))
    i=0
    for classifier_pair in classifiers.items():
        classifier = classifier_pair[1]
        # Predicting the Test set results
        y_preds[i] = classifier.predict(X_test)
        i += 1
    y_preds[i] = np.transpose(autokeras_model.predict(X_test))
    y_preds = np.transpose(y_preds)
    y_pred = []
    for preds in y_preds:
        if sum(preds) >= 3:
            y_pred.append(1)
        else:
            y_pred.append(0)
    #compute accuracy_score
    accuracy = acc(y_test, y_pred)
    print('accuracy', accuracy)
    return accuracy

In [9]:
majority_voting(classifiers, autokeras_model)

accuracy 0.9906894258479273


0.9906894258479273