In [2]:
import numpy as np
import pandas as pd
import random
import os
#from imutils import paths
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
from sklearn.metrics import f1_score as f1

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from xgboost.sklearn import XGBClassifier

import autokeras as ak
import keras
from keras.models import Sequential
from keras.layers import Dense
import tensorflow as tf

In [3]:
import warnings
warnings.filterwarnings("ignore")  # "error", "ignore", "always", "default", "module" or "once"

# Binary Classification

In [4]:
#declear path to your data
krono_data_path1 = 'data/kronodroid.csv'
# Importing the dataset
Krono_data = pd.read_csv(krono_data_path1)
Krono_data = Krono_data.sample(frac = 1)
X = Krono_data.iloc[:,range(1,Krono_data.shape[1]-1)].values
y = Krono_data.iloc[:, -1].values

lb = LabelBinarizer()
y = lb.fit_transform(y)

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y.astype(int), test_size = 0.3, random_state = 0)

# Feature Scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [9]:
# "GaussianNB": GaussianNB(), "DT": DecisionTreeClassifier(), "MLP": MLPClassifier(random_state=1, max_iter=300),
# 47.9%, 65.8%, 79%,  
classifiers = {"RFC": RandomForestClassifier(), "SVM": SVC(kernel = 'poly', degree=3), "SGD": SGDClassifier(), "XGB": XGBClassifier(), "LGBM": LGBMClassifier()} 

for classifier_pair in classifiers.items():
    print("---------------------------")
    print(classifier_pair[0])
    
    classifier = classifier_pair[1]
    classifier.fit(X_train, y_train)
    
    # Predicting the Test set results
    y_pred = classifier.predict(X_test)
    print(y_pred)

    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    print('Confusion Matrix', cm)

    #compute accuracy_score
    accuracy = acc(y_test, y_pred)
    print('accuracy', accuracy)

    #compute precision score
    precision_score = precision(y_test, y_pred, average='micro')
    print('precision', precision_score)

    #compute recall score
    recall_score = recall(y_test, y_pred)
    print('recall', recall_score)

    #compute f1 score
    f1_score = f1(y_test, y_pred)
    print('f1', f1_score)
    
print("---------------------------")

---------------------------
RFC
[0 1 0 ... 1 0 1]
Confusion Matrix [[10744   303]
 [  811 11584]]
accuracy 0.9524784574694992
precision 0.9524784574694992
recall 0.9345703912868092
f1 0.9541223951898525
---------------------------
SVM
[0 1 0 ... 1 0 0]
Confusion Matrix [[10875   172]
 [ 1959 10436]]
accuracy 0.9090947871342036
precision 0.9090947871342036
recall 0.8419524001613554
f1 0.9073599095770117
---------------------------
SGD
[0 1 0 ... 1 0 1]
Confusion Matrix [[ 9966  1081]
 [  717 11678]]
accuracy 0.9233000597218667
precision 0.9233000597218667
recall 0.9421540943929003
f1 0.9285203148604596
---------------------------
XGB
[0 1 0 ... 1 0 1]
Confusion Matrix [[10365   682]
 [  474 11921]]
accuracy 0.9506868014674515
precision 0.9506868014674515
recall 0.9617587736990723
f1 0.9537563005040404
---------------------------
LGBM
[0 1 0 ... 1 0 1]
Confusion Matrix [[ 9895  1152]
 [  369 12026]]
accuracy 0.9351164576401331
precision 0.9351164576401331
recall 0.9702299314239613
f1 0.9

# Multiclass

In [5]:
#declear path to your data
krono_data_path2 = 'data/real_legitimate_v1.csv'
krono_data_path3 = 'data/real_malware_v1.csv'

# Importing the dataset
dfs = []
Krono_data_ori_2 = pd.read_csv(krono_data_path2)
benign_labels = ['Benign']*len(Krono_data_ori_2)
Krono_data_ori_2['MalFamily'] = benign_labels
Krono_data_ori_3 = pd.read_csv(krono_data_path3)
mal_labels = set(Krono_data_ori_3['MalFamily'])

dfs.append(Krono_data_ori_2)
dfs.append(Krono_data_ori_3)
Krono_data_ori = pd.concat(dfs, ignore_index=True)

diff = []
for c in Krono_data_ori.columns:
    if c not in Krono_data.columns:
        diff.append(c)
diff.remove('MalFamily')
Krono_data_ori = Krono_data_ori.drop(diff, axis='columns')
        
X_ori = Krono_data_ori.iloc[:,range(1,Krono_data_ori.shape[1]-1)].values
y_ori = Krono_data_ori.iloc[:, -1].values
    

for i in range(len(y_ori)):
    if pd.isnull(y_ori[i]):
        y_ori[i] = 'Unknown'

y_ori = LabelBinarizer().fit_transform(y_ori)

# Splitting the dataset into the Training set and Test set
X_train_ori, X_test_ori, y_train_ori, y_test_ori = train_test_split(X_ori, y_ori.astype(int), test_size = 0.3, random_state = 0)

# Feature Scaling
X_train_ori = StandardScaler().fit_transform(X_train_ori)
X_test_ori = StandardScaler().fit_transform(X_test_ori)

In [6]:
classifiers_ori = {"RFC": RandomForestClassifier(), "DT": DecisionTreeClassifier(), "MLP": MLPClassifier(random_state=1, max_iter=300), "XGB": XGBClassifier()} 

for classifier_pair in classifiers_ori.items():
    print("---------------------------")
    print(classifier_pair[0])
    
    classifier = classifier_pair[1]
    classifier.fit(X_train_ori, y_train_ori)
    
    # Predicting the Test set results
    y_pred_ori = classifier.predict(X_test_ori)

    #compute accuracy_score
    accuracy = acc(y_test_ori, y_pred_ori)
    print('accuracy', accuracy)

    #compute precision score
    precision_score = precision(y_test_ori, y_pred_ori, average='micro')
    print('precision', precision_score)

    """#compute recall score
    recall_score = recall(y_test_ori, y_pred_ori)
    print('recall', recall_score)

    #compute f1 score
    f1_score = f1(y_test_ori, y_pred_ori)
    print('f1', f1_score)"""
    
print("---------------------------")

---------------------------
RFC
accuracy 0.6621875266615477
precision 0.9599876314162028
---------------------------
DT
accuracy 0.4122941728521457
precision 0.41386545625829657
---------------------------
MLP
accuracy 0.5399283337599181
precision 0.643802966101695
---------------------------
XGB
accuracy 0.7605153143929699
precision 0.9305390459455269
---------------------------


In [10]:
# Initialize the structured data classifier.
clf = ak.StructuredDataClassifier(
    overwrite=True, max_trials=15
)  # It tries different models.
# Feed the structured data classifier with training data.
clf.fit(
    x=X_train,
    y=y_train,
    epochs=1,
)

# Evaluate the best model with testing data.
print(clf.evaluate(x=X_test, y=y_test))

Trial 15 Complete [00h 08m 28s]
val_accuracy: 0.8348749876022339

Best val_accuracy So Far: 0.9527429342269897
Total elapsed time: 02h 36m 28s
INFO:tensorflow:Oracle triggered exit
INFO:tensorflow:Assets written to: .\structured_data_classifier\best_model\assets
[321648.875, 0.528751790523529]


In [11]:
model = clf.export_model()
model.summary()


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 462)]             0         
                                                                 
 multi_category_encoding (Mu  (None, 462)              0         
 ltiCategoryEncoding)                                            
                                                                 
 normalization (Normalizatio  (None, 462)              925       
 n)                                                              
                                                                 
 dense (Dense)               (None, 32)                14816     
                                                                 
 re_lu (ReLU)                (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 512)               16896 

In [13]:
results = model.evaluate(X_test, y_test)
print("test loss, test acc:", results)

test loss, test acc: [321648.875, 0.528751790523529]


## Poisoning

In [9]:
from secml.ml.classifiers.sklearn.c_classifier_logistic import CClassifierLogistic
from secml.adv.attacks.poisoning.c_attack_poisoning_logistic_regression import CAttackPoisoningLogisticRegression
from secml.data.c_dataset import CDataset
from secml.ml.peval.metrics import CMetricAccuracy

lb, ub = 0., 1.  # Bounds of the attack space. Can be set to `None` for unbounded
n_poisoning_points = 15  # Number of poisoning points to generate

# Should be chosen depending on the optimization problem
solver_params = {
    'eta': 0.25,
    'eta_min': 2.0,
    'eta_max': None,
    'max_iter': 100,
    'eps': 1e-6
}

dataset = CDataset(X_train, y_train)
metric = CMetricAccuracy()

# train SVM in the dual space, on a linear kernel, as needed for poisoning
c_classifier_LR = CClassifierLogistic()



In [10]:
print("Training of c_classifier_LR...")
c_classifier_LR.fit(X_train, y_train)

# Predicting the Test set results
y_pred = c_classifier_LR.predict(X_test)
print(y_pred)


pois_attack = CAttackPoisoningLogisticRegression(classifier=c_classifier_LR,
                                training_data=dataset,
                                val=dataset)
pois_attack.n_points = n_poisoning_points

# Run the poisoning attack
print("Attack started...")
pois_y_pred, _, pois_points_ds, _ = pois_attack.run(X_test, y_test)
print("Attack complete!")

# Evaluate the accuracy of the original classifier
acc = metric.performance_score(y_true=y_test, y_pred=c_classifier_LR.predict(X_test))
# Evaluate the accuracy after the poisoning attack
pois_acc = metric.performance_score(y_true=y_test, y_pred=pois_y_pred)

print("Original accuracy on test set: {:.2%}".format(acc))
print("Accuracy after attack on test set: {:.2%}".format(pois_acc))

print("\n\n---------------------------")
print("END OF LR POISONING")
print("---------------------------\n\n")

Training of c_classifier_SVM...
CArray([1 1 0 ... 1 1 1])


TypeError: 'NoneType' object is not subscriptable

## Training with Poisoned Data

from secml.ml.classifiers.sklearn.c_classifier_sklearn import CClassifierSkLearn
from secml.adv.attacks import CAttackPoisoningSVM
from secml.data.c_dataset import CDataset
from secml.ml.peval.metrics import CMetricAccuracy

lb, ub = 0., 1.  # Bounds of the attack space. Can be set to `None` for unbounded
n_poisoning_points = 15  # Number of poisoning points to generate

# Should be chosen depending on the optimization problem
solver_params = {
    'eta': 0.25,
    'eta_min': 2.0,
    'eta_max': None,
    'max_iter': 100,
    'eps': 1e-6
}

classifiers = {"RFC": RandomForestClassifier(), "SVM": SVC(kernel = 'poly', degree=3), "SGD": SGDClassifier(), "XGB": XGBClassifier(), "LGBM": LGBMClassifier()} 

dataset = CDataset(X_train, y_train)

metric = CMetricAccuracy()

for classifier_pair in classifiers.items():
    print("---------------------------")
    print(classifier_pair[0])
    
    classifier = classifier_pair[1]
    classifier.fit(X_train, y_train)
    
    # Predicting the Test set results
    y_pred = classifier.predict(X_test)
    print(y_pred)

    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    print('Confusion Matrix', cm)

    #compute accuracy_score
    accuracy = acc(y_test, y_pred)
    print('accuracy', accuracy)

    #compute precision score
    precision_score = precision(y_test, y_pred, average='micro')
    print('precision', precision_score)

    #compute recall score
    recall_score = recall(y_test, y_pred)
    print('recall', recall_score)

    #compute f1 score
    f1_score = f1(y_test, y_pred)
    print('f1', f1_score)

    print("---------------------------")
    print("POISONING")
    print("---------------------------")

    classifier = CClassifierSkLearn(classifier)

    pois_attack = CAttackPoisoningSVM(classifier=classifier,
                                  training_data=dataset,
                                  val=dataset,
                                  lb=lb, ub=ub,
                                  solver_params=solver_params)
    pois_attack.n_points = n_poisoning_points

    # Run the poisoning attack
    print("Attack started...")
    pois_y_pred, _, pois_points_ds, _ = pois_attack.run(X_test, y_test)
    print("Attack complete!")

    # Evaluate the accuracy of the original classifier
    acc = metric.performance_score(y_true=y_test, y_pred=classifier.predict(X_test))
    # Evaluate the accuracy after the poisoning attack
    pois_acc = metric.performance_score(y_true=y_test, y_pred=pois_y_pred)

    print("Original accuracy on test set: {:.2%}".format(acc))
    print("Accuracy after attack on test set: {:.2%}".format(pois_acc))

    print("\n\n---------------------------")
    print("END OF MODEL")
    print("\n\n---------------------------")


print("---------------------------")