In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

## Read the training and test data sets, previously split and stored as separate CSV files.
## The last column, labelled '86', is the 0/1 label indicating benign/malware for each sample row.

# Load the pre-split train/test CSVs (paths are relative to the notebook's
# working directory).
df_train = pd.read_csv("data/train_data.csv")
df_test = pd.read_csv("data/test_data.csv")

# The label index below is derived from the training frame and then reused for
# the test frame, so both frames must have the same width; otherwise the test
# features/labels could be sliced from the wrong columns.
if df_test.shape[1] != df_train.shape[1]:
    raise ValueError(
        "train/test column count mismatch: {} vs {}".format(
            df_train.shape[1], df_test.shape[1]
        )
    )

last_column = df_train.shape[1] - 1 ## index of the label column (the last one)

# Features are every column before the label; the label is the last column.
X_train = df_train.iloc[:, :last_column]
y_train = df_train.iloc[:, last_column]

X_test = df_test.iloc[:, :last_column]
y_test = df_test.iloc[:, last_column]

In [2]:
import tensorflow as tf
from tensorflow import keras



In [3]:
# Display the (rows, columns) shape of the training feature frame.
X_train.shape

(20532, 74)

In [4]:
def create_model(learning_rate=0.00010, input_dim=74, hidden_units=(148, 74, 37)):
    """Build and compile a fully connected binary classifier.

    The network is a stack of ReLU ``Dense`` layers followed by a single
    sigmoid output unit. It is compiled with the Adam optimizer and binary
    cross-entropy loss, and tracks accuracy as a metric. With the default
    arguments the layer widths are 74 -> 148 -> 74 -> 37 -> 1.

    Parameters
    ----------
    learning_rate : float, optional
        Learning rate passed to the Adam optimizer.
    input_dim : int, optional
        Number of input features per sample (width of the input layer).
        Defaults to 74.
    hidden_units : sequence of int, optional
        Width of each hidden ReLU layer, in order. An empty sequence yields
        a model with only the sigmoid output layer.

    Returns
    -------
    The compiled ``keras.models.Sequential`` model.

    Notes
    -----
    As a side effect the layer summary is printed via ``model.summary()``.
    """
    model = keras.models.Sequential()
    model.add(keras.Input(shape=(input_dim,)))

    # Hidden stack: one ReLU Dense layer per requested width.
    for units in hidden_units:
        model.add(keras.layers.Dense(units, activation='relu'))

    # Single sigmoid unit -> one probability-like score per sample, which is
    # what BinaryCrossentropy expects with its default from_logits=False.
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    model.summary()

    opt = keras.optimizers.Adam(learning_rate=learning_rate)
    loss = keras.losses.BinaryCrossentropy()
    model.compile(optimizer=opt, loss=loss, metrics=['accuracy'])
    return model


In [5]:
# Seed NumPy and TensorFlow before the model is built so that weight
# initialisation and batch shuffling are repeatable across "Restart & Run All".
# (Some GPU kernels may still be non-deterministic.)
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

model = create_model(learning_rate=0.00010)

# Stop once the validation loss has not improved for 3 consecutive epochs and
# roll back to the best-scoring weights; epochs=200 is only an upper bound.
callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)

# Keep the History object returned by fit() instead of discarding it, so the
# per-epoch losses can be inspected later. validation_split=0.2 holds out 20%
# of the training data to compute val_loss.
history = model.fit(x=X_train, y=y_train, verbose=0, validation_split=0.2, epochs=200, callbacks=[callback])

# Last expression of the cell: displays [test loss, test accuracy].
model.evaluate(x=X_test, y=y_test)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 148)               11100     
_________________________________________________________________
dense_1 (Dense)              (None, 74)                11026     
_________________________________________________________________
dense_2 (Dense)              (None, 37)                2775      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 38        
Total params: 24,939
Trainable params: 24,939
Non-trainable params: 0
_________________________________________________________________
Restoring model weights from the end of the best epoch.
Epoch 00020: early stopping


[0.10730338096618652, 0.9649999737739563]

In [6]:
# Decision threshold applied to the model's output scores.
THRESHOLD = 0.5

# Keep the raw scores under their own name instead of overwriting them:
# threshold-free metrics (ROC / precision-recall curves) need the scores,
# not the hard 0/1 decisions.
y_prob = model.predict(X_test)
yp = y_prob > THRESHOLD

print('acc', accuracy_score(y_test, yp))
print('recall', recall_score(y_test, yp))
print('precision', precision_score(y_test, yp))
print('F1', f1_score(y_test, yp))

# scikit-learn convention: rows are the true classes, columns the predicted ones.
pd.DataFrame(confusion_matrix(y_test, yp))

acc 0.965
recall 0.9584456424079065
precision 0.9719817767653759
F1 0.9651662519791903


Unnamed: 0,0,1
0,4225,123
1,185,4267
