### Malware Analysis Using Neural Networks

In this implementation, we will be leveraging the Nvidia Malconv Architecture [1] to build the model for predicting the Malware.

<img src="Malconv-arc.png" alt="Malconv Architecture" height="300" width="500"/>

In [1]:
#Importing required libraries
import re
import collections
from os import listdir
from os.path import isfile, join
import binascii
import numpy as np
import pandas as pd
import keras
from keras.utils import to_categorical
import datetime

Using TensorFlow backend.


In [2]:
print (datetime.datetime.now())

2019-12-01 11:18:25.379445


### Data Engineering
We will clean the byte sequences for any unknown characters and do the embedding by converting the input to a numerical format - each byte into an eight dimensional vector.

In [3]:
#Input files Paths
TrainLabelspath = "/mnt/disks/MLProject/data/r_TrainLabels.csv"
TestLabelspath = "/mnt/disks/MLProject/data/r_Testlabels.csv" 

TraindatasetPath = "/mnt/disks/MLProject/data/r_train_dataset/"
TestdatasetPath = "/mnt/disks/MLProject/data/r_test_dataset/"

In [4]:
#Method for Cleaning the bytes
def cleanByteSequence(byteSequence):
    dataStr = str(byteSequence)
    dataStr = dataStr[2:]
    dataStr = dataStr.replace("\\r\\n", " ")
    dataStr2 = re.sub(r'\b\w{8}\b','',dataStr)
    dataStr3 = re.sub(r'\s+'," ", dataStr2)
    return dataStr3

def readFile(filePath):
    with open(filePath, "rb") as binary_file:
        data = binary_file.read()
        return data

#Method of Embedding bytes
def embedByte(hex_string):
    scale = 16
    num_of_bits = 8
    if(hex_string[0] =="?"):
        hex_string = "00"
    if(hex_string[1] == "?"):
        hex_string[1] = "00"
        
    binary_string = bin(int(hex_string, scale)) [2:].zfill(num_of_bits)
    vec = np.zeros(8)
    for i in range(8):
        if(binary_string[i]=="1"):
            vec[i]=float(1)/16
        else:
            vec[i]=-float(1)/16
            
    return vec

In [5]:
#Embedding - Converting the input to a numerical format.
#Embed each byte into a eight dimensional vector.
# 1 -> 1/16 and 0 -> -1/16
hex_string = "A6"
scale = 16
num_of_bits = 8
binary_string = bin(int(hex_string, scale)) [2:].zfill(num_of_bits)
print(binary_string)
print(embedByte(hex_string))

10100110
[ 0.0625 -0.0625  0.0625 -0.0625 -0.0625  0.0625  0.0625 -0.0625]


In [7]:
print(datetime.datetime.now())

2019-12-01 11:18:25.899135


In [6]:
samples = [f for f in listdir(TraindatasetPath) if isfile(join(TraindatasetPath, f))]
data = pd.read_csv(TrainLabelspath)
targetDict = data.set_index('Id').T.to_dict('records')[0]

In [8]:
#Reading Samples
max_size = 30000      #Number of bytes to read per sample
num_samples = len(samples)
Train_X = np.zeros((num_samples, 8, max_size))
Train_Y = np.zeros(num_samples)
fileNum = 0
for file in samples:
    filePath = join(TraindatasetPath, file)
    sampleByteSequence = readFile(filePath)
    #print(fileNum)
    cleanedByteSequence = cleanByteSequence(sampleByteSequence)
    splitByteSequence = cleanedByteSequence.strip().split(" ")
    Train_Y[fileNum] = targetDict[file.split(".")[0]]       #Stores the labels
    
    for i in range(min(max_size, len(splitByteSequence))):
        Train_X[fileNum,:,i] = embedByte(splitByteSequence[i]) #All embeded data 
    fileNum += 1

In [9]:
print(datetime.datetime.now())

2019-12-01 12:45:10.420769


In [10]:
#Convert lables to categorical form
Train_Y_one_hot = to_categorical(Train_Y-1)
print(Train_Y_one_hot)
print(Train_Y_one_hot.shape)

[[0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]
(6520, 9)


### Building the Model
Using Keras functional API, we will take two convolutions - conv1 and conv2 and then activate one of the convolutions, say conv2, sigmoid. We then multiply the Activated convolution and  non activated convolution and activate the final result - relu.

In [11]:
num_classes = 9
from keras import optimizers
from keras import Input
from keras.layers import Conv1D
from keras.layers import Activation 
from keras.layers import multiply
from keras.layers import GlobalMaxPool1D
from keras.layers import Dense
from keras import Model

In [12]:
opt = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)

In [13]:
inputs = Input(shape=(8,max_size)) 

In [14]:
conv1 = Conv1D(kernel_size=(128), filters=32, strides=(128), padding='same')(inputs)

In [15]:
conv2 = Conv1D(kernel_size=(128), filters=32, strides=(128), padding='same')(inputs)

In [16]:
a = Activation('sigmoid', name='sigmoid')(conv2)  #Activation

In [17]:
mul = multiply([conv1, a]) #Multiplying Activated one and Non-Activated one.

In [18]:
a = Activation('relu', name='relu')(mul) #Activating the result - mul

In [19]:
p = GlobalMaxPool1D()(a)

In [20]:
d = Dense(16)(p)

In [21]:
predictions = Dense(num_classes, activation = 'sigmoid') (d)

In [22]:
model = Model(inputs=inputs, outputs=predictions)

In [23]:
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['acc'])

##### Model summary:

In [24]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 8, 30000)     0                                            
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 1, 32)        122880032   input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 1, 32)        122880032   input_1[0][0]                    
__________________________________________________________________________________________________
sigmoid (Activation)            (None, 1, 32)        0           conv1d_2[0][0]                   
____________________________________________________________________________________________

### Training the Model with Train Dataset

In [26]:
batch_size = 16   #Dividing the dataset into multiple batches
num_batches = int(num_samples/batch_size)

for batch_num in range(num_batches):
    batch = Train_X[batch_num * batch_size:(batch_num+1)*batch_size]
    model.train_on_batch(batch, Train_Y_one_hot[batch_num * batch_size:(batch_num+1)*batch_size])

In [27]:
pred = model.predict(Train_X)

In [28]:
pred

array([[3.2068312e-01, 5.6424928e-01, 2.1557412e-01, ..., 1.1670318e-01,
        2.1745884e-01, 7.0136219e-01],
       [8.1635594e-02, 9.9390745e-04, 4.7683716e-07, ..., 4.0829182e-06,
        1.2341142e-04, 1.4496446e-03],
       [2.1457672e-06, 0.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        4.1770387e-01, 1.4901161e-07],
       ...,
       [1.7415881e-03, 2.3207068e-04, 1.4215708e-05, ..., 1.5127659e-04,
        5.6892633e-04, 6.1125910e-01],
       [7.2476268e-04, 6.4472193e-01, 0.0000000e+00, ..., 8.9406967e-08,
        2.0861626e-07, 5.5789948e-05],
       [1.9618064e-02, 8.4722340e-03, 1.1920929e-07, ..., 1.1026859e-06,
        3.7610531e-05, 5.0544739e-04]], dtype=float32)

In [29]:
pred_label = []
for row in pred:
    pred_label.append(np.argmax(row))

In [30]:
Train_Y_np = np.asarray(Train_Y-1)
print(Train_Y_np)

[5. 0. 7. ... 8. 1. 1.]


In [31]:
pred_label

[5,
 0,
 7,
 6,
 2,
 8,
 2,
 1,
 7,
 2,
 1,
 2,
 1,
 0,
 7,
 8,
 1,
 1,
 8,
 2,
 7,
 0,
 7,
 0,
 7,
 2,
 5,
 2,
 8,
 1,
 8,
 2,
 8,
 7,
 2,
 7,
 2,
 0,
 6,
 1,
 2,
 8,
 7,
 0,
 1,
 0,
 8,
 7,
 7,
 0,
 6,
 1,
 2,
 1,
 2,
 8,
 7,
 7,
 5,
 2,
 3,
 1,
 2,
 2,
 7,
 1,
 5,
 2,
 5,
 2,
 2,
 2,
 0,
 0,
 1,
 2,
 1,
 1,
 8,
 0,
 1,
 0,
 8,
 0,
 2,
 0,
 2,
 8,
 2,
 1,
 0,
 0,
 0,
 0,
 7,
 0,
 8,
 1,
 1,
 2,
 2,
 1,
 2,
 2,
 5,
 8,
 7,
 8,
 1,
 1,
 6,
 2,
 1,
 8,
 1,
 2,
 0,
 2,
 2,
 3,
 2,
 0,
 2,
 8,
 7,
 2,
 5,
 2,
 2,
 2,
 0,
 1,
 0,
 2,
 1,
 8,
 2,
 8,
 2,
 0,
 1,
 0,
 1,
 1,
 0,
 2,
 8,
 0,
 1,
 1,
 0,
 1,
 2,
 0,
 2,
 0,
 5,
 7,
 3,
 1,
 2,
 2,
 8,
 8,
 0,
 3,
 1,
 2,
 2,
 2,
 1,
 0,
 1,
 1,
 1,
 2,
 1,
 2,
 8,
 3,
 2,
 2,
 1,
 7,
 8,
 8,
 0,
 7,
 0,
 7,
 2,
 0,
 2,
 8,
 2,
 5,
 1,
 1,
 0,
 1,
 1,
 8,
 7,
 1,
 1,
 2,
 5,
 5,
 3,
 0,
 0,
 1,
 2,
 7,
 1,
 0,
 1,
 1,
 0,
 1,
 8,
 2,
 5,
 0,
 7,
 1,
 1,
 1,
 0,
 2,
 1,
 2,
 1,
 0,
 2,
 5,
 2,
 2,
 2,
 7,
 2,
 1,
 1,
 5,
 2,
 2,
 1,
 5,
 2,
 8,


In [32]:
str(sum(pred_label == Train_Y_np)/len(Train_Y_np)*100)+"%"

'96.74846625766871%'

In [33]:
print(datetime.datetime.now())

2019-12-01 13:14:47.307690


### Testing the Model

In [34]:
testSamples = [f for f in listdir(TestdatasetPath) if isfile(join(TestdatasetPath, f))]
data_test = pd.read_csv(TestLabelspath)
targetDict_test = data_test.set_index('Id').T.to_dict('records')[0]

In [35]:
Test_X = np.zeros((num_samples, 8, max_size))
Test_Y = np.zeros(num_samples)
fileNum = 0

for file in testSamples:
    filePath = join(TestdatasetPath, file)
    sampleByteSequence = readFile(filePath)
    #print(fileNum)
    cleanedByteSequence = cleanByteSequence(sampleByteSequence)
    splitByteSequence = cleanedByteSequence.strip().split(" ")
    Test_Y[fileNum] = targetDict_test[file.split(".")[0]]
    for i in range(min(max_size, len(splitByteSequence))):
        Test_X[fileNum,:,i] = embedByte(splitByteSequence[i])
    fileNum += 1

In [36]:
Test_Y_np = np.asarray(Test_Y-1)
Test_Y_pred = model.predict(Test_X)
Test_Y_pred_label = []
for row in Test_Y_pred:
    Test_Y_pred_label.append(np.argmax(row))

In [37]:
str(sum(Test_Y_pred_label == Test_Y_np)/len(Test_Y_np)*100)+"%"

'58.11349693251534%'

In [38]:
print(datetime.datetime.now())

2019-12-01 14:11:40.916807


### Predict on Test Dataset
Running the model to predict the classification on the "given" Test dataset, which don't have labels.

In [None]:
o_TestLabelspath = "/mnt/disks/MLProject/data/testLabels.csv" 
o_TestdatasetPath = "/mnt/disks/MLProject/data/test_dataset"

testSamples = [f for f in listdir(o_TestdatasetPath) if isfile(join(o_TestdatasetPath, f))]
data_test = pd.read_csv(o_TestLabelspath)
targetDict_test_o = data_test.set_index('Id').T.to_dict('records')[0]

testSamples = testSamples[:5000]

Testo_X = np.zeros((num_samples, 8, max_size))
Testo_Y = np.zeros(num_samples)
fileNum = 0

for file in testSamples:
    filePath = join(o_TestdatasetPath, file)
    sampleByteSequence = readFile(filePath)
    cleanedByteSequence = cleanByteSequence(sampleByteSequence)
    splitByteSequence = cleanedByteSequence.strip().split(" ")
    Testo_Y[fileNum] = targetDict_test_o[file.split(".")[0]]
    for i in range(min(max_size, len(splitByteSequence))):
        Testo_X[fileNum,:,i] = embedByte(splitByteSequence[i])
    fileNum += 1

Testo_Y_np = np.asarray(Testo_Y-1)
Testo_Y_pred = model.predict(Testo_X)
Testo_Y_pred_label = []

for row in Testo_Y_pred:
    Testo_Y_pred_label.append(np.argmax(row))
  


In [None]:
print(datetime.datetime.now())

In [None]:
Testo_Y_pred_label

### References:  
[1] Raff, E., Barker, J., Sylvester, J., Brandon, R., Catanzaro, B., & Nicholas, C.K. (2017). Malware Detection by Eating a Whole EXE. ArXiv, abs/1710.09435.  
[2] Dataset: https://www.kaggle.com/c/malware-classification/data 