# Machine Learning Poisonong Attacks in Malware Detection

## Prestart
### Import libraries

In [78]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import layers

### Files
Store the paths for our files in variables

In [79]:
file_arm_attacker_training = "files\\arm_datasets\\arm_attacker_training.csv"
file_arm_attacker_validation = "files\\arm_datasets\\arm_attacker_validation.csv"
file_arm_attacker_test = "files\\arm_datasets\\arm_attacker_test.csv"

file_arm_victim_training = "files\\arm_datasets\\arm_victim_training.csv"
file_arm_victim_validation = "files\\arm_datasets\\arm_victim_validation.csv"
file_arm_victim_test = "files\\arm_datasets\\arm_victim_test.csv"

## Creating the arm datasets
$ 8:1:1 $ - Training:Test:Validation

### Attacker dataset
For the first model - we call it *Clean* - we will train on clean a clean dataset

#### Reading in
In this state the csv contains the filename and the label

In [80]:
arm_attacker_training = pd.read_csv(file_arm_attacker_training, header=None, index_col=False)
arm_attacker_validation = pd.read_csv(file_arm_attacker_validation, header=None, index_col=False)
arm_attacker_test = pd.read_csv(file_arm_attacker_test, header=None, index_col=False)

#### Dataset
Creating the pure dataset, only features

In [81]:
dataset_arm_attacker_training = np.asarray(arm_attacker_training.drop(columns=arm_attacker_training.columns[-2:]))
dataset_arm_attacker_validation = np.asarray(arm_attacker_validation.drop(columns=arm_attacker_validation.columns[-2:]))
dataset_arm_attacker_test = np.asarray(arm_attacker_test.drop(columns=arm_attacker_test.columns[-2:]))

#### Labels for the attacker datasets
Creating the labels for the attacker datasets
*Malware* - $ 0 $, *Benign* - $ 1 $
-**Trainining**: 800 malware, 800 benign
-**Validation**: 100 malware, 100 benign
-**Test**: 100 malware, 100 benign

In [82]:
labels_arm_attacker_training = np.asarray(arm_attacker_training[arm_attacker_training.columns[-1]])
labels_arm_attacker_validation = np.asarray(arm_attacker_validation[arm_attacker_validation.columns[-1]])
labels_arm_attacker_test = np.asarray(arm_attacker_test[arm_attacker_test.columns[-1]])

#### Filenames
Getting the filenames separated just for the fun of it

In [83]:
names_arm_attacker_training = arm_attacker_training[arm_attacker_training.columns[-2]]
names_arm_attacker_validation = arm_attacker_validation[arm_attacker_validation.columns[-2]]
names_arm_attacker_test = arm_attacker_test[arm_attacker_test.columns[-2]]

In [None]:
names_arm_attacker_training.to_csv("files\\filenames\\arm_attacker")

### Victim dataset
For the second model - we call it *Victim* - we will train on clean a poisoned dataset

#### Reading in
In this state the csv contains the filename and the label

In [84]:
arm_victim_training = pd.read_csv(file_arm_victim_training, header=None, index_col=False)
arm_victim_validation = pd.read_csv(file_arm_victim_validation, header=None, index_col=False)
arm_victim_test = pd.read_csv(file_arm_victim_test, header=None, index_col=False)

#### Dataset
Creating the pure dataset, only features

In [85]:
dataset_arm_victim_training = np.asarray(arm_victim_training.drop(columns=arm_victim_training.columns[-2:]))
dataset_arm_victim_validation = np.asarray(arm_victim_validation.drop(columns=arm_victim_validation.columns[-2:]))
dataset_arm_victim_test = np.asarray(arm_victim_test.drop(columns=arm_victim_test.columns[-2:]))

#### Labels for the victim datasets
Creating the labels for the victim datasets
*Malware* - $ 0 $, *Benign* - $ 1 $
-**Trainining**: 800 malware, 800 benign
-**Validation**: 100 malware, 100 benign
-**Test**: 100 malware, 100 benign

In [86]:
labels_arm_victim_training = np.asarray(arm_victim_training[arm_victim_training.columns[-1]])
labels_arm_victim_validation = np.asarray(arm_victim_validation[arm_victim_validation.columns[-1]])
labels_arm_victim_test = np.asarray(arm_victim_test[arm_victim_test.columns[-1]])

#### Filenames
Getting the filenames separated just for the fun of it

In [87]:
names_arm_victim_training = arm_victim_training[arm_victim_training.columns[-2]]
names_arm_victim_validation = arm_victim_validation[arm_victim_validation.columns[-2]]
names_arm_victim_test = arm_victim_test[arm_victim_test.columns[-2]]

## Models for arm Datasets
In this section we will build the models with the exact same struture
Neural network with one *hidden layer* with a $sigmoid$ *activation function*

### Model for Attacker Data
Building, fitting and evaluating the Model with the *attacker* datasets

In [88]:
model_attacker = keras.Sequential(
    [
        layers.Dense(1, input_shape=(131,), activation="sigmoid")
    ]
)
model_attacker.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                    metrics=[tf.keras.metrics.BinaryAccuracy()])
model_attacker.fit(dataset_arm_attacker_training, labels_arm_attacker_training, epochs=10,
                validation_data=(dataset_arm_attacker_validation, labels_arm_attacker_validation))

model_attacker.evaluate(dataset_arm_attacker_test, labels_arm_attacker_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.1481858491897583, 0.9549999833106995]

### Model for Victim Data
Building, fitting and evaluating the Model with the *victim* datasets

In [89]:
model_victim = keras.Sequential(
    [
        layers.Dense(1, input_shape=(131,), activation="sigmoid")
    ]
)
model_victim.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                    metrics=[tf.keras.metrics.BinaryAccuracy()])
model_victim.fit(dataset_arm_victim_training, labels_arm_victim_training, epochs=10,
                validation_data=(dataset_arm_victim_validation, labels_arm_victim_validation))

model_victim.evaluate(dataset_arm_victim_test, labels_arm_victim_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.10751333832740784, 0.9750000238418579]

## POISON IT

### Files

In [90]:
file_poison_arm_BM = "files\\poison_data\\poisoned_benign_malware.csv"
file_poison_arm_MB = "files\\poison_data\\poisoned_malware_benign.csv"

### Poisoned Datasets

#### Attacker dataset

In [91]:
poisoned_arm_attacker_training = pd.read_csv(file_poison_arm_BM, index_col=False, header=None)
poisoned_arm_attacker_training = arm_attacker_training.append(poisoned_arm_attacker_training, ignore_index=True).sample(frac=1)
dataset_poisoned_arm_attacker_training = np.asarray(poisoned_arm_attacker_training.drop(columns=poisoned_arm_attacker_training.columns[-2:]))
labels_poisoned_arm_attacker_training = np.asarray(poisoned_arm_attacker_training[poisoned_arm_attacker_training.columns[-1]])

  poisoned_arm_attacker_training = arm_attacker_training.append(poisoned_arm_attacker_training, ignore_index=True).sample(frac=1)


### Models

#### Attacker model

In [92]:
model_attacker = keras.Sequential(
    [
        layers.Dense(1, input_shape=(131,), activation="sigmoid")
    ]
)
model_attacker.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                    metrics=[tf.keras.metrics.BinaryAccuracy()])
model_attacker.fit(dataset_poisoned_arm_attacker_training, labels_poisoned_arm_attacker_training, epochs=10,
                validation_data=(dataset_arm_attacker_validation, labels_arm_attacker_validation))

model_attacker.evaluate(dataset_arm_attacker_test, labels_arm_attacker_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.14977072179317474, 0.949999988079071]