# Machine Learning Poisoning Attacks in Malware Detection

## Prestart
### Import libraries

In [22]:
import csv

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import layers


import filenames

## Creating the arm dataset
$ 8:1:1 $ - Training:Test:Validation

#### Reading in
At this stage each CSV row holds the feature columns followed by the filename and the label (the last two columns).

In [23]:
# Load the three ARM splits in one pass. The files are read without a header
# row (header=None), so pandas numbers the columns 0..N-1, and index_col=False
# keeps the first column as data instead of turning it into the index.
arm_training, arm_validation, arm_test = (
    pd.read_csv(split_path, header=None, index_col=False)
    for split_path in (
        filenames.arm_training,
        filenames.arm_validation,
        filenames.arm_test,
    )
)

#### Dataset
Creating the pure dataset, only features

In [28]:
# Feature matrices: every column except the last two, which the later cells
# read as the filename (position -2) and the label (position -1).
dataset_arm_training = arm_training.iloc[:, :-2].to_numpy()
dataset_arm_validation = arm_validation.iloc[:, :-2].to_numpy()
dataset_arm_test = arm_test.iloc[:, :-2].to_numpy()

[[ 99  15   9 ...   1   1   2]
 [ 18   7   3 ...   2   2   3]
 [227   1  10 ...   1   0   0]
 ...
 [ 97   0  13 ...   0   3   2]
 [210  14   0 ...   0   2   2]
 [244   3   9 ...   0   0   1]]
[[0.9636206  0.14600312 0.08760187 ... 0.00973354 0.00973354 0.01946708]
 [0.62592133 0.24341385 0.10432022 ... 0.06954681 0.06954681 0.10432022]
 [0.99472465 0.00438205 0.04382047 ... 0.00438205 0.         0.        ]
 ...
 [0.97014553 0.         0.1300195  ... 0.         0.0300045  0.020003  ]
 [0.99293277 0.06619552 0.         ... 0.         0.0094565  0.0094565 ]
 [0.99551212 0.0122399  0.03671971 ... 0.         0.         0.00407997]]


#### Labels for the datasets
Creating the labels for the datasets
*Malware* - $ 0 $, *Benign* - $ 1 $
- **Training**: 800 malware, 800 benign
- **Validation**: 100 malware, 100 benign
- **Test**: 100 malware, 100 benign

In [None]:
# Label vectors: the last column of each split, as a NumPy array.
labels_arm_training = arm_training.iloc[:, -1].to_numpy()
labels_arm_validation = arm_validation.iloc[:, -1].to_numpy()
labels_arm_test = arm_test.iloc[:, -1].to_numpy()

#### Filenames
Getting the filenames separated just for the fun of it

In [None]:
# Filename column: second-to-last column of each split, kept as a pandas Series.
names_arm_training = arm_training.iloc[:, -2]
names_arm_validation = arm_validation.iloc[:, -2]
names_arm_test = arm_test.iloc[:, -2]

#### Generate

## Models for arm Datasets
In this section we will build the models with the exact same structure:
a neural network consisting of a single *dense layer* (one unit) with a $sigmoid$ *activation function*

### Model
Building, fitting and evaluating the Model with the datasets

In [None]:
# Baseline model: one Dense unit with a sigmoid activation over 131 inputs.
base_model = keras.Sequential(
    [layers.Dense(1, input_shape=(131,), activation="sigmoid")]
)

# No optimizer is passed, so Keras falls back to its default.
# run_eagerly=True executes the training step eagerly instead of as a compiled graph.
base_model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.BinaryAccuracy()],
    run_eagerly=True,
)

# Train on the clean training split and monitor the validation split each epoch.
base_model.fit(
    dataset_arm_training,
    labels_arm_training,
    epochs=10,
    validation_data=(dataset_arm_validation, labels_arm_validation),
)

# evaluate() yields [loss, binary_accuracy]; only the accuracy is kept.
_, binary_accuracy_base = base_model.evaluate(dataset_arm_test, labels_arm_test)
print(binary_accuracy_base)

# Persist the trained baseline so the poisoning cells can reload it.
base_model.save(filenames.base_model)

In [None]:
# Read the malware samples reserved for poisoning and keep only the feature
# columns (everything except the last two columns).
df_arm_malware_forpoison = pd.read_csv(
    filenames.forpoison_arm_malware, header=None, index_col=False
).iloc[:, :-2]

# The first sample, shaped (1, n_features), is the probe that every model is
# asked to score.
topredict = np.asarray(df_arm_malware_forpoison.iloc[0])[np.newaxis, :]
print(topredict)

In [None]:
# Score the probe sample with the baseline model. The nested unpacking requires
# predict() to return exactly one row holding exactly one value.
((predict_base,),) = base_model.predict(topredict)
print(predict_base)

## POISON IT

### Files

In [None]:
# Path of the poisoned CSV used in this run. Forward slashes are used because
# they resolve on Windows as well as POSIX systems, whereas the previous
# backslash-separated literal only worked on Windows.
# The trailing "_<tag>.csv" part of the name is parsed later when the results
# row is written, so keep that naming pattern when switching files.
file_poison_arm_BM = "files/poison_data/iterative/poisoned_benign_malware_1000.csv"
#file_poison_arm_MB = "files/poison_data/poisoned_malware_benign.csv"

### Poisoned Dataset

In [None]:
# Poisoned samples, read with the same header-less layout as the clean splits.
poisoned_arm_training = pd.read_csv(file_poison_arm_BM, index_col=False, header=None)

# "base" set: a random 5% of the poisoned rows, used to continue training the
# already-fitted baseline model. NOTE: the sample is unseeded, so it differs
# between runs.
poisoned_arm_training_base = poisoned_arm_training.sample(frac=0.05)

# "new" set: clean training rows plus ALL poisoned rows, shuffled (frac=1).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
poisoned_arm_training_new = pd.concat(
    [arm_training, poisoned_arm_training], ignore_index=True
).sample(frac=1)

# Split both frames into features (all but the last two columns) and labels
# (last column), mirroring how the clean splits are prepared.
dataset_poisoned_arm_training_base = np.asarray(
    poisoned_arm_training_base.drop(columns=poisoned_arm_training_base.columns[-2:])
)
dataset_poisoned_arm_training_new = np.asarray(
    poisoned_arm_training_new.drop(columns=poisoned_arm_training_new.columns[-2:])
)
labels_poisoned_arm_training_base = np.asarray(
    poisoned_arm_training_base[poisoned_arm_training_base.columns[-1]]
)
labels_poisoned_arm_training_new = np.asarray(
    poisoned_arm_training_new[poisoned_arm_training_new.columns[-1]]
)

### Model

#### Append model
Base model, reloaded from disk and further trained on a random 5% sample of the poisoned data

In [None]:
# Reload the saved baseline and continue training it on the sampled poisoned rows.
base_model = keras.models.load_model(filenames.base_model)
base_model.fit(
    dataset_poisoned_arm_training_base,
    labels_poisoned_arm_training_base,
    epochs=10,
    validation_data=(dataset_arm_validation, labels_arm_validation),
)

# Measure accuracy on the clean test split after the extra training.
_, binary_accuracy_appended = base_model.evaluate(dataset_arm_test, labels_arm_test)
print(binary_accuracy_appended)

# Store the updated model under its own path.
base_model.save(filenames.modified_model)

In [None]:
# Score the probe sample with the model that was further trained on poisoned
# rows. The nested unpacking requires exactly one row holding one value.
((predict_appended,),) = base_model.predict(topredict)
print(predict_appended)

#### New model
Newly trained model, with all training data + poisoned data

In [None]:
# Fresh model with the same single-unit sigmoid architecture as the baseline,
# trained from scratch on the clean training rows plus all poisoned rows.
poison_model = keras.Sequential(
    [
        layers.Dense(1, input_shape=(131,), activation="sigmoid")
    ]
)
poison_model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                     metrics=[tf.keras.metrics.BinaryAccuracy()])
poison_model.fit(dataset_poisoned_arm_training_new, labels_poisoned_arm_training_new, epochs=10,
                 validation_data=(dataset_arm_validation, labels_arm_validation))

# evaluate() yields [loss, binary_accuracy]; only the accuracy is kept.
[_, binary_accuracy_new] = poison_model.evaluate(dataset_arm_test, labels_arm_test)
# Report THIS model's accuracy (the cell previously printed
# binary_accuracy_appended, which belongs to the other model).
print(binary_accuracy_new)
poison_model.save(filenames.poison_model)

In [None]:
# Score the probe sample with the model trained from scratch on clean and
# poisoned data. The nested unpacking requires exactly one row holding one value.
((predict_new,),) = poison_model.predict(topredict)
print(predict_new)

In [None]:
# One summary row for this run. The first field is the text between the last
# "_" and the first following "." of the poisoned CSV path (e.g. "1000"),
# followed by accuracy / probe-prediction pairs for the baseline, the
# further-trained model and the freshly trained model.
results = [
    file_poison_arm_BM.split("_")[-1].split(".")[0],
    binary_accuracy_base,
    predict_base,
    binary_accuracy_appended,
    predict_appended,
    binary_accuracy_new,
    predict_new,
]
print(results)

# Append (mode "a") so rows from successive runs accumulate in the same file.
with open(filenames.results, "a") as results_file:
    csv.writer(results_file, lineterminator="\n").writerow(results)