# Machine Learning Poisonong Attacks in Malware Detection

## Prestart
### Import libraries

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import layers

### Files
Store the paths for our files in variables

In [2]:
file_arm_malware = "files\\int_malware_arm.csv"
file_arm_benign = "files\\int_benign_arm.csv"
file_mips_malware = "files\\int_malware_mips.csv"
file_mips_benign = "files\\int_benign_mips.csv"

### Criterias
When reading the datasets, we will need criterias to separate our datasets into smaller portions

In [3]:
def criteria_clean_training(x):
    return x % 20 >= 10 or x % 20 == 0 or x % 20 == 5


def criteria_poisoned_training(x):
    return x % 20 < 10 or x % 20 == 10 or x % 20 == 15


def criteria_clean_validation(x):
    return x % 20 != 0


def criteria_poisoned_validation(x):
    return x % 20 != 5


def criteria_clean_test(x):
    return x % 20 != 10


def criteria_poisoned_test(x):
    return x % 20 != 15

## Creating the arm datasets
$ 8:1:1 $ - Training:Test:Validation

### Clean dataset
For the first model - we call it *Clean* - we will train on clean a clean dataset

In [4]:
dataset_clean_training_arm_malware = pd.read_csv(
    file_arm_malware,
    skiprows=lambda x: criteria_clean_training(x),
    index_col=False,
    header=None
)

dataset_clean_validation_arm_malware = pd.read_csv(
    file_arm_malware,
    skiprows=lambda x: criteria_clean_validation(x),
    index_col=False,
    header=None
)
dataset_clean_test_arm_malware = pd.read_csv(
    file_arm_malware,
    skiprows=lambda x: criteria_clean_test(x),
    index_col=False,
    header=None
)

dataset_clean_training_arm_benign = pd.read_csv(
    file_arm_benign,
    skiprows=lambda x: criteria_clean_training(x),
    index_col=False,
    header=None
)
dataset_clean_validation_arm_benign = pd.read_csv(
    file_arm_benign,
    skiprows=lambda x: criteria_clean_validation(x),
    index_col=False,
    header=None
)
dataset_clean_test_arm_benign = pd.read_csv(
    file_arm_benign,
    skiprows=lambda x: criteria_clean_test(x),
    index_col=False,
    header=None
)

dataset_clean_training_arm = dataset_clean_training_arm_malware.append(dataset_clean_training_arm_benign,
                                                                       ignore_index=True)
dataset_clean_validation_arm = dataset_clean_validation_arm_malware.append(dataset_clean_validation_arm_benign,
                                                                           ignore_index=True)
dataset_clean_test_arm = dataset_clean_test_arm_malware.append(dataset_clean_test_arm_benign, ignore_index=True)

  dataset_clean_training_arm = dataset_clean_training_arm_malware.append(dataset_clean_training_arm_benign,
  dataset_clean_validation_arm = dataset_clean_validation_arm_malware.append(dataset_clean_validation_arm_benign,
  dataset_clean_test_arm = dataset_clean_test_arm_malware.append(dataset_clean_test_arm_benign, ignore_index=True)


#### Labels for the clean datasets
Creating the labels for the clean datasets
*Malware* - $ 0 $, *Benign* - $ 1 $
-**Trainining**: 800 malware, 800 benign
-**Validation**: 100 malware, 100 benign
-**Test**: 100 malware, 100 benign

In [5]:
labels_clean_training_arm = ([0] * 800) + ([1] * 800)
labels_clean_validation_arm = ([0] * 100) + ([1] * 100)
labels_clean_test_arm = ([0] * 100) + ([1] * 100)

#### Shuffle the *clean* dataframe
First we append the labels to the Dataframe, then shuffle the whole Dataframe and finally separate the labels from the data

In [6]:
dataset_clean_training_arm["labels"] = labels_clean_training_arm
dataset_clean_validation_arm["labels"] = labels_clean_validation_arm
dataset_clean_test_arm["labels"] = labels_clean_test_arm

dataset_clean_training_arm = dataset_clean_training_arm.sample(frac=1)
dataset_clean_validation_arm = dataset_clean_validation_arm.sample(frac=1)
dataset_clean_test_arm = dataset_clean_test_arm.sample(frac=1)

labels_clean_training_arm = np.asarray(dataset_clean_training_arm["labels"])
labels_clean_validation_arm = np.asarray(dataset_clean_validation_arm["labels"])
labels_clean_test_arm = np.asarray(dataset_clean_test_arm["labels"])

dataset_clean_training_arm = np.asarray(dataset_clean_training_arm.drop(columns=["labels"]))
dataset_clean_validation_arm = np.asarray(dataset_clean_validation_arm.drop(columns=["labels"]))
dataset_clean_test_arm = np.asarray(dataset_clean_test_arm.drop(columns=["labels"]))

KeyError: '[-1] not found in axis'

### Poisoned dataset
For the second model - we call it *Poisoned* - we will train on clean a poisoned dataset

In [None]:
dataset_poisoned_training_arm_malware = pd.read_csv(
    file_arm_malware,
    skiprows=lambda x: criteria_poisoned_training(x),
    index_col=False,
    header=None
)
dataset_poisoned_validation_arm_malware = pd.read_csv(
    file_arm_malware,
    skiprows=lambda x: criteria_poisoned_validation(x),
    index_col=False,
    header=None
)
dataset_poisoned_test_arm_malware = pd.read_csv(
    file_arm_malware,
    skiprows=lambda x: criteria_poisoned_test(x),
    index_col=False,
    header=None
)

dataset_poisoned_training_arm_benign = pd.read_csv(
    file_arm_benign,
    skiprows=lambda x: criteria_poisoned_training(x),
    index_col=False,
    header=None
)
dataset_poisoned_validation_arm_benign = pd.read_csv(
    file_arm_benign,
    skiprows=lambda x: criteria_poisoned_validation(x),
    index_col=False,
    header=None
)
dataset_poisoned_test_arm_benign = pd.read_csv(
    file_arm_benign,
    skiprows=lambda x: criteria_poisoned_test(x),
    index_col=False,
    header=None
)

dataset_poisoned_training_arm = dataset_poisoned_training_arm_malware.append(dataset_poisoned_training_arm_benign,
                                                                             ignore_index=True)
dataset_poisoned_validation_arm = dataset_poisoned_validation_arm_malware.append(dataset_poisoned_validation_arm_benign,
                                                                                 ignore_index=True)
dataset_poisoned_test_arm = dataset_poisoned_test_arm_malware.append(dataset_poisoned_test_arm_benign,
                                                                     ignore_index=True)

#### Labels for the poisoned datasets
Creating the labels for the poisoned datasets
*Malware* - $ 0 $, *Benign* - $ 1 $
-**Trainining**: 800 malware, 800 benign
-**Validation**: 100 malware, 100 benign
-**Test**: 100 malware, 100 benign

In [None]:
labels_poisoned_training_arm = ([0] * 800) + ([1] * 800)
labels_poisoned_validation_arm = ([0] * 100) + ([1] * 100)
labels_poisoned_test_arm = ([0] * 100) + ([1] * 100)

#### Shuffle the *poisoned* dataframe
First we append the labels to the Dataframe, then shuffle the whole Dataframe and finally separate the labels from the data

In [None]:
dataset_poisoned_training_arm["labels"] = labels_poisoned_training_arm
dataset_poisoned_validation_arm["labels"] = labels_poisoned_validation_arm
dataset_poisoned_test_arm["labels"] = labels_poisoned_test_arm

dataset_poisoned_training_arm = dataset_poisoned_training_arm.sample(frac=1)
dataset_poisoned_validation_arm = dataset_poisoned_validation_arm.sample(frac=1)
dataset_poisoned_test_arm = dataset_poisoned_test_arm.sample(frac=1)

labels_poisoned_training_arm = np.asarray(dataset_poisoned_training_arm["labels"])
labels_poisoned_validation_arm = np.asarray(dataset_poisoned_validation_arm["labels"])
labels_poisoned_test_arm = np.asarray(dataset_poisoned_test_arm["labels"])

dataset_poisoned_training_arm = np.asarray(dataset_poisoned_training_arm.drop(columns=["labels"]))
dataset_poisoned_validation_arm = np.asarray(dataset_poisoned_validation_arm.drop(columns=["labels"]))
dataset_poisoned_test_arm = np.asarray(dataset_poisoned_test_arm.drop(columns=["labels"]))

## Models for arm Datasets
In this section we will build the models with the exact same struture
Neural network with one *hidden layer* with a $sigmoid$ *activation function*

### Model for Clean Data
Building, fitting and evaluating the Model with the *clean* datasets

In [None]:
model_clean = keras.Sequential(
    [
        layers.Dense(1, input_shape=(131,), activation="sigmoid")
    ]
)
model_clean.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                    metrics=[tf.keras.metrics.BinaryAccuracy()])
model_clean.fit(dataset_clean_training_arm, labels_clean_training_arm, epochs=10,
                validation_data=(dataset_clean_validation_arm, labels_clean_validation_arm))

model_clean.evaluate(dataset_clean_test_arm, labels_clean_test_arm)

### Model for Poisoned Data
Building, fitting and evaluating the Model with the *poisoned* datasets

In [None]:
model_poisoned = keras.Sequential(
    [
        layers.Dense(1, input_shape=(131,), activation="sigmoid")
    ]
)
model_poisoned.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                       metrics=[tf.keras.metrics.BinaryAccuracy()])
model_poisoned.fit(dataset_poisoned_training_arm, labels_poisoned_training_arm, epochs=10,
                   validation_data=(dataset_poisoned_validation_arm, labels_poisoned_validation_arm))

model_poisoned.evaluate(dataset_poisoned_test_arm, labels_poisoned_test_arm)