### Malware Analysis Using Neural Networks
##### Using Keras and Tensorflow
##### Using Malconv architecture


In [1]:
import re
import collections
from os import listdir
from os.path import isfile, join
import binascii
import numpy as np
import pandas as pd
import keras
from keras.utils import to_categorical
import datetime

Using TensorFlow backend.


In [2]:
print (datetime.datetime.now())

2019-11-18 19:51:27.390070


In [64]:
TrainLabelspath = "/mnt/disks/MLProject/data/random_TrainLabels.csv"
Testabelspath = "/mnt/disks/MLProject/data/random_TestLabels.csv"  ##rand add
datasetPath = "/mnt/disks/MLProject/data/r_train_dataset/"
TestSamplesPath = "/mnt/disks/MLProject/data/r_test_dataset/"

In [4]:
def cleanByteSequence(byteSequence):
    dataStr = str(byteSequence)
    dataStr = dataStr[2:]
    dataStr = dataStr.replace("\\r\\n", " ")
    dataStr2 = re.sub(r'\b\w{8}\b','',dataStr)
    dataStr3 = re.sub(r'\s+'," ", dataStr2)
    return dataStr3

def readFile(filePath):
    with open(filePath, "rb") as binary_file:
        data = binary_file.read()
        return data
    
def embedByte(hex_string):
    scale = 16
    num_of_bits = 8
    if(hex_string[0] =="?"):
        hex_string = "00"
    if(hex_string[1] == "?"):
        hex_string[1] = "00"
        
    binary_string = bin(int(hex_string, scale)) [2:].zfill(num_of_bits)
    vec = np.zeros(8)
    for i in range(8):
        if(binary_string[i]=="1"):
            vec[i]=float(1)/16
        else:
            vec[i]=-float(1)/16
            
    return vec

In [5]:
#Embedding - Converting the input to a numerical format.
#Embed each byte into a eight dimensional vector.
# 1 -> 1/16 and 0 -> -1/16
hex_string = "A6"
scale = 16
num_of_bits = 8
binary_string = bin(int(hex_string, scale)) [2:].zfill(num_of_bits)
print(binary_string)
print(embedByte(hex_string))

10100110
[ 0.0625 -0.0625  0.0625 -0.0625 -0.0625  0.0625  0.0625 -0.0625]


In [6]:
samples = [f for f in listdir(datasetPath) if isfile(join(datasetPath, f))]
data = pd.read_csv(TrainLabelspath)
targetDict = data.set_index('Id').T.to_dict('records')[0]

In [7]:
print(datetime.datetime.now())

2019-11-18 19:51:27.698469


In [8]:
#Reading Samples
max_size = 30000
num_samples = len(samples)
Train_X = np.zeros((num_samples, 8, max_size))
Train_Y = np.zeros(num_samples)
fileNum = 0
for file in samples:
    filePath = join(datasetPath, file)
    sampleByteSequence = readFile(filePath)
    #print(fileNum)
    cleanedByteSequence = cleanByteSequence(sampleByteSequence)
    splitByteSequence = cleanedByteSequence.strip().split(" ")
    Train_Y[fileNum] = targetDict[file.split(".")[0]]
    
    for i in range(min(max_size, len(splitByteSequence))):
        Train_X[fileNum,:,i] = embedByte(splitByteSequence[i])
    fileNum += 1

In [9]:
print(datetime.datetime.now())

2019-11-18 21:00:06.045729


In [10]:
#Convert lables to categorical form
Train_Y_one_hot = to_categorical(Train_Y-1)
print(Train_Y_one_hot)
print(Train_Y_one_hot.shape)

[[0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]]
(5434, 9)


In [11]:
num_classes = 9
from keras import optimizers
from keras import Input
from keras.layers import Conv1D
from keras.layers import Activation 
from keras.layers import multiply
from keras.layers import GlobalMaxPool1D
from keras.layers import Dense
from keras import Model

In [12]:
opt = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)

In [13]:
inputs = Input(shape=(8,max_size)) 

In [14]:
conv1 = Conv1D(kernel_size=(128), filters=32, strides=(128), padding='same')(inputs)

In [15]:
conv2 = Conv1D(kernel_size=(128), filters=32, strides=(128), padding='same')(inputs)

In [16]:
a = Activation('sigmoid', name='sigmoid')(conv2)

In [17]:
mul = multiply([conv1, a])

In [18]:
b = Activation('relu', name='relu')(mul)

In [19]:
p = GlobalMaxPool1D()(b)

In [20]:
d = Dense(16)(p)

In [21]:
predictions = Dense(num_classes, activation = 'sigmoid') (d)

In [22]:
model = Model(inputs=inputs, outputs=predictions)

In [23]:
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['acc'])

In [24]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 8, 30000)     0                                            
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 1, 32)        122880032   input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 1, 32)        122880032   input_1[0][0]                    
__________________________________________________________________________________________________
sigmoid (Activation)            (None, 1, 32)        0           conv1d_2[0][0]                   
____________________________________________________________________________________________

In [25]:
batch_size = 16
num_batches = int(num_samples/batch_size)

In [26]:
for batch_num in range(num_batches):
    batch = Train_X[batch_num * batch_size:(batch_num+1)*batch_size]
    model.train_on_batch(batch, Train_Y_one_hot[batch_num * batch_size:(batch_num+1)*batch_size])

In [27]:
pred = model.predict(Train_X)

In [28]:
pred

array([[0.0000000e+00, 0.0000000e+00, 2.4160719e-01, ..., 2.9802322e-08,
        2.0861626e-07, 4.5299530e-06],
       [1.2130737e-03, 4.9374700e-03, 2.2247434e-04, ..., 2.6598573e-04,
        2.8541088e-03, 3.4771776e-01],
       [1.2807310e-02, 5.3510231e-01, 9.7155571e-06, ..., 5.5775046e-04,
        9.3835592e-04, 2.6255161e-02],
       ...,
       [5.8269501e-04, 9.2851663e-01, 0.0000000e+00, ..., 4.0531158e-06,
        4.8875809e-06, 2.0162463e-03],
       [2.4293095e-02, 4.9462318e-03, 2.3447573e-03, ..., 5.8829784e-04,
        3.9465427e-03, 7.5986981e-04],
       [2.8613210e-04, 8.8857412e-03, 8.9785457e-04, ..., 1.3402700e-03,
        7.0932927e-04, 1.9432865e-01]], dtype=float32)

In [29]:
pred_label = []
for row in pred:
    pred_label.append(np.argmax(row))

In [30]:
Train_Y_np = np.asarray(Train_Y-1)
print(Train_Y_np)

[2. 8. 1. ... 1. 5. 8.]


In [31]:
pred_label

[2,
 8,
 1,
 2,
 2,
 0,
 0,
 8,
 8,
 8,
 1,
 1,
 2,
 5,
 5,
 0,
 1,
 2,
 2,
 8,
 5,
 7,
 2,
 7,
 0,
 7,
 2,
 5,
 1,
 8,
 2,
 8,
 1,
 7,
 0,
 2,
 5,
 2,
 7,
 0,
 2,
 8,
 7,
 0,
 2,
 1,
 2,
 0,
 8,
 1,
 2,
 1,
 2,
 0,
 0,
 3,
 8,
 5,
 2,
 7,
 7,
 1,
 2,
 2,
 2,
 1,
 2,
 0,
 0,
 1,
 0,
 1,
 1,
 2,
 1,
 1,
 3,
 8,
 0,
 2,
 1,
 2,
 2,
 1,
 7,
 2,
 2,
 2,
 1,
 2,
 5,
 7,
 7,
 2,
 0,
 6,
 1,
 8,
 5,
 1,
 1,
 2,
 1,
 2,
 2,
 2,
 1,
 0,
 8,
 8,
 6,
 8,
 2,
 2,
 2,
 0,
 0,
 2,
 0,
 2,
 1,
 2,
 8,
 1,
 2,
 0,
 1,
 0,
 1,
 0,
 6,
 8,
 1,
 1,
 5,
 0,
 1,
 2,
 0,
 2,
 1,
 0,
 5,
 2,
 1,
 7,
 7,
 1,
 8,
 8,
 1,
 1,
 2,
 2,
 2,
 1,
 1,
 2,
 7,
 5,
 5,
 8,
 0,
 2,
 2,
 0,
 8,
 5,
 0,
 2,
 2,
 7,
 0,
 0,
 7,
 2,
 2,
 6,
 8,
 5,
 3,
 1,
 1,
 8,
 1,
 8,
 7,
 1,
 5,
 0,
 0,
 2,
 2,
 1,
 1,
 5,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 8,
 7,
 8,
 2,
 2,
 2,
 2,
 1,
 6,
 1,
 5,
 2,
 2,
 0,
 7,
 2,
 7,
 1,
 2,
 0,
 7,
 2,
 2,
 8,
 2,
 5,
 6,
 8,
 7,
 2,
 1,
 2,
 0,
 6,
 7,
 1,
 1,
 0,
 8,
 8,
 0,
 0,
 0,
 7,
 0,
 5,
 5,


In [32]:
str(sum(pred_label == Train_Y_np)/len(Train_Y_np)*100)+"%"

'95.96981965403018%'

In [None]:
print(datetime.datetime.now())

### Testing the Model

In [68]:
print(datetime.datetime.now())

2019-11-18 21:50:30.663494


In [69]:
testSamples = [f for f in listdir(TestSamplesPath) if isfile(join(TestSamplesPath, f))]
#Rand add
data_test = pd.read_csv(Testabelspath)
targetDict_test = data_test.set_index('Id').T.to_dict('records')[0]

In [70]:
max_size= 30000
num_samples = len(testSamples)
Test_X = np.zeros((num_samples, 8, max_size))
Test_Y = np.zeros(num_samples)
fileNum = 0

for file in testSamples:
    filePath = join(TestSamplesPath, file)
    sampleByteSequence = readFile(filePath)
    #print(fileNum)
    cleanedByteSequence = cleanByteSequence(sampleByteSequence)
    splitByteSequence = cleanedByteSequence.strip().split(" ")
    Test_Y[fileNum] = targetDict_test[file.split(".")[0]]
    for i in range(min(max_size, len(splitByteSequence))):
        Test_X[fileNum,:,i] = embedByte(splitByteSequence[i])
    fileNum += 1


In [71]:
print(datetime.datetime.now())

2019-11-18 23:02:34.319569


In [72]:
Test_Y_np = np.asarray(Test_Y-1)
Test_Y_pred = model.predict(Test_X)
Test_Y_pred_label = []
for row in Test_Y_pred:
    Test_Y_pred_label.append(np.argmax(row))

In [73]:
str(sum(Test_Y_pred_label == Test_Y_np)/len(Test_Y_np)*100)+"%"

'86.49245491350754%'

In [74]:
print(datetime.datetime.now())

2019-11-18 23:02:52.642673
