# Machine Learning Poisonong Attacks in Malware Detection

## Prestart
### Import libraries

In [1]:
import csv

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import layers

### Files
Store the paths for our files in variables

In [2]:
file_arm_training = "files\\arm_datasets\\arm_training.csv"
file_arm_validation = "files\\arm_datasets\\arm_validation.csv"
file_arm_test = "files\\arm_datasets\\arm_test.csv"

## Creating the arm dataset
$ 8:1:1 $ - Training:Test:Validation

#### Reading in
In this state the csv contains the filename and the label

In [3]:
arm_training = pd.read_csv(file_arm_training, header=None, index_col=False)
arm_validation = pd.read_csv(file_arm_validation, header=None, index_col=False)
arm_test = pd.read_csv(file_arm_test, header=None, index_col=False)

#### Dataset
Creating the pure dataset, only features

In [4]:
dataset_arm_training = np.asarray(arm_training.drop(columns=arm_training.columns[-2:]))
dataset_arm_validation = np.asarray(arm_validation.drop(columns=arm_validation.columns[-2:]))
dataset_arm_test = np.asarray(arm_test.drop(columns=arm_test.columns[-2:]))

#### Labels for the datasets
Creating the labels for the datasets
*Malware* - $ 0 $, *Benign* - $ 1 $
-**Trainining**: 800 malware, 800 benign
-**Validation**: 100 malware, 100 benign
-**Test**: 100 malware, 100 benign

In [5]:
labels_arm_training = np.asarray(arm_training[arm_training.columns[-1]])
labels_arm_validation = np.asarray(arm_validation[arm_validation.columns[-1]])
labels_arm_test = np.asarray(arm_test[arm_test.columns[-1]])

#### Filenames
Getting the filenames separated just for the fun of it

In [6]:
names_arm_training = arm_training[arm_training.columns[-2]]
names_arm_validation = arm_validation[arm_validation.columns[-2]]
names_arm_test = arm_test[arm_test.columns[-2]]

#### Generate

## Models for arm Datasets
In this section we will build the models with the exact same struture
Neural network with one *hidden layer* with a $sigmoid$ *activation function*

### Model
Building, fitting and evaluating the Model with the datasets

In [7]:
base_model = keras.Sequential(
    [
        layers.Dense(1, input_shape=(131,), activation="sigmoid")
    ]
)
base_model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                    metrics=[tf.keras.metrics.BinaryAccuracy()],
                   run_eagerly=True)
base_model.fit(dataset_arm_training, labels_arm_training, epochs=10,
                validation_data=(dataset_arm_validation, labels_arm_validation))

weights = base_model.get_weights()
print("weights: ", weights)

[_,binary_accuracy_base] = base_model.evaluate(dataset_arm_test, labels_arm_test)
print(binary_accuracy_base)
base_model.save("models\\base_model")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
weights:  [array([[-3.15496465e-03],
       [ 7.62875611e-03],
       [ 2.06429767e-03],
       [ 2.29038700e-01],
       [ 2.48572100e-02],
       [ 3.53671730e-01],
       [ 2.89823383e-01],
       [ 1.09943762e-01],
       [ 6.26035482e-02],
       [ 4.04426688e-03],
       [-1.71043471e-01],
       [-1.99176356e-01],
       [ 2.12940965e-02],
       [ 8.00802186e-02],
       [ 2.38971293e-01],
       [-6.88804910e-02],
       [-1.06205516e-01],
       [-1.32855132e-01],
       [ 9.19404104e-02],
       [ 2.83191085e-01],
       [ 1.62535578e-01],
       [ 5.08078396e-01],
       [ 3.92034985e-02],
       [-5.08300848e-02],
       [-2.04617918e-01],
       [ 1.13875084e-01],
       [-3.18594456e-01],
       [ 1.51515201e-01],
       [-4.42191780e-01],
       [-4.98126782e-02],
       [-1.21764362e-01],
       [ 3.71846586e-01],
       [-9.81989279e-02],
       [-8.77242014e

In [8]:
df_arm_malware_forpoison = pd.read_csv("files\\poison_data\\arm_malware_forpoison.csv", header=None, index_col=False)
df_arm_malware_forpoison = df_arm_malware_forpoison.drop(columns=df_arm_malware_forpoison.columns[-2:])
topredict = np.asarray([df_arm_malware_forpoison.iloc[0],])
print(topredict)

[[210  14   1   3   1   0   0   1   1   2   0   2   2   2   2   3   3   2
    1   1   1   1   2   3   0   0   2   0   3   0   0   2   0   2   0   0
    3   1   2   3   2   3   3   0   3   3   2   3   1   2   2   3   0   1
    3   0   3   2   3   1   3   1   3   3   1   1   3   2   3   1   1   2
    3   0   1   3   1   1   3   2   2   1   2   0   2   0   2   2   0   2
    2   1   2   0   0   1   0   0   0   1   3   3   2   2   1   1   2   3
    0   2   2   1   0   0   0   3   1   0   0   0   0   2   3   1   1   0
    3   1   1   0   3]]


In [9]:
[[predict_base]] = base_model.predict(topredict)
print(predict_base)

0.19168939


## POISON IT

### Files

In [10]:
file_poison_arm_BM = "files\\poison_data\\iterative\\poisoned_benign_malware_1000.csv"
#file_poison_arm_MB = "files\\poison_data\\poisoned_malware_benign.csv"

### Poisoned Dataset

In [11]:
poisoned_arm_training = pd.read_csv(file_poison_arm_BM, index_col=False, header=None)
poisoned_arm_training_base = poisoned_arm_training.sample(frac=0.05)
poisoned_arm_training_new = arm_training.append(poisoned_arm_training, ignore_index=True).sample(frac=1)
dataset_poisoned_arm_training_base = np.asarray(poisoned_arm_training_base.drop(columns=poisoned_arm_training_base.columns[-2:]))
dataset_poisoned_arm_training_new = np.asarray(poisoned_arm_training_new.drop(columns=poisoned_arm_training_new.columns[-2:]))
labels_poisoned_arm_training_base = np.asarray(poisoned_arm_training_base[poisoned_arm_training_base.columns[-1]])
labels_poisoned_arm_training_new = np.asarray(poisoned_arm_training_new[poisoned_arm_training_new.columns[-1]])

  poisoned_arm_training_new = arm_training.append(poisoned_arm_training, ignore_index=True).sample(frac=1)


### Model

#### Append model
Base model trained with the poisoned data

In [12]:
base_model = keras.models.load_model("models\\base_model")
base_model.fit(dataset_poisoned_arm_training_base, labels_poisoned_arm_training_base, epochs=10,
                validation_data=(dataset_arm_validation, labels_arm_validation))
[_, binary_accuracy_appended] = base_model.evaluate(dataset_arm_test, labels_arm_test)
print(binary_accuracy_appended)
base_model.save("models\\base_model_modified")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.8444444537162781
INFO:tensorflow:Assets written to: models\base_model_modified\assets


In [13]:
[[predict_appended]] = base_model.predict(topredict)
print(predict_appended)

0.8372544


#### New model
Newly trained model, with all training data + poisoned data

In [14]:
poison_model = keras.Sequential(
    [
        layers.Dense(1, input_shape=(131,), activation="sigmoid")
    ]
)
poison_model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                    metrics=[tf.keras.metrics.BinaryAccuracy()])
poison_model.fit(dataset_poisoned_arm_training_new, labels_poisoned_arm_training_new, epochs=10,
                validation_data=(dataset_arm_validation, labels_arm_validation))
[_, binary_accuracy_new] = poison_model.evaluate(dataset_arm_test, labels_arm_test)
print(binary_accuracy_appended)
poison_model.save("models\\poison_model")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.8444444537162781
INFO:tensorflow:Assets written to: models\poison_model\assets


In [15]:
[[predict_new]] = poison_model.predict(topredict)
print(predict_new)

0.23490192


In [16]:
results = [file_poison_arm_BM.split("_")[-1].split(".")[0],
           binary_accuracy_base,
           predict_base,
           binary_accuracy_appended,
           predict_appended,
           binary_accuracy_new,
           predict_new]
print(results)
with open("files\\results\\results.csv", "a") as f:
    csv_writer = csv.writer(f, lineterminator="\n")
    csv_writer.writerow(results)

['1000', 0.9777777791023254, 0.19168939, 0.8444444537162781, 0.8372544, 0.9916666746139526, 0.23490192]
