This notebook demonstrates using machine learning for detecting malicious software based on the system calls the software uses. It is done as a project for CS6456-Fall 2019 at The University of Virginia. Further details can be found here: http://www.cs.virginia.edu/~bjc8c/class/cs6456-f19/hw4.html. 

The code below uses a simple three-layer neural network (two hidden layers and one output layer) implemented in PyTorch.
Author: Hyun Jae Cho

In [1]:
import torch
import torch.nn as nn
from torch.utils import data

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from tqdm import tqdm

In [2]:
# Pick the compute device: the GPU when CUDA is usable, the CPU otherwise.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(device)

cpu


In [3]:
# Load the API-call traces. Malformed CSV rows are skipped instead of raising.
# `on_bad_lines="skip"` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0.
df = pd.read_csv("api_calls.csv", on_bad_lines="skip")
# Shuffle the rows with a fixed seed so that the seeded train/validation/test
# split further down selects the same rows on every Restart & Run All.
df = df.sample(frac=1, random_state=1)
df.head()

Unnamed: 0,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,malware
22398,437ea215f277aa8f26cfc3d78293bc9c,240,117,240,117,240,117,240,117,240,...,15,240,117,240,117,240,117,172,60,1
26055,998c71564f5422d46cb4922e7d95e945,82,240,117,240,117,240,117,240,117,...,215,60,81,60,81,60,81,208,187,1
21552,096cddfffe60c02156959691e187b0f2,82,240,117,240,117,240,117,240,117,...,260,141,260,141,260,141,260,141,260,1
18102,216c9495d94be7ee47e09dad98f9aa68,82,86,82,37,70,37,240,117,260,...,215,178,215,260,65,260,141,65,20,1
20284,e9c2d6c78943ee4002c5fcab2c6df21c,82,240,117,240,117,240,117,240,117,...,224,82,208,159,224,82,240,117,159,1


## Split data into train, validation, and test sets

In [4]:
# Split the data into roughly 70% train, 10% validation and 20% test.

# Features are every column except the label; the label is the `malware` column.
X = df.drop(columns=["malware"])
y = df["malware"]

# First cut: hold out 30% of the rows, leaving 70% for training.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# Second cut: 66% of the held-out rows become the test set, the rest validation.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.66, random_state=1
)

# `hash` is dropped from the train and validation features only. X_test keeps
# it: TestDataset skips the first column itself and also returns the full row.
X_train = X_train.drop(columns=["hash"])
X_val = X_val.drop(columns=["hash"])

In [5]:
# Report the feature and label shapes of each split, one heading per split.
split_report = [
    ("Size of train data (70%)", X_train, y_train),
    ("\nSize of validation data (10%)", X_val, y_val),
    ("\nSize of test data (20%)", X_test, y_test),
]
for heading, split_features, split_labels in split_report:
    print(heading)
    print(split_features.shape, split_labels.shape)


Size of train data (70%)
(30713, 100) (30713,)

Size of validation data (10%)
(4475, 100) (4475,)

Size of test data (20%)
(8688, 101) (8688,)


In [6]:
class Dataset(data.Dataset):
    """Map-style dataset pairing each feature row with its label.

    Both `dataset` and `labels` are indexed positionally with `.iloc`, so they
    are expected to be pandas objects that share the same row order.
    """

    def __init__(self, dataset, labels):
        self.dataset = dataset
        self.labels = labels

    def __len__(self):
        """Return the number of rows in the feature table."""
        return len(self.dataset)

    def __getitem__(self, index):
        """Return `(features, label)` for the row at position `index`.

        The feature row is cast to int64 and wrapped in a tensor; the label is
        returned exactly as stored in `labels`.
        """
        row = self.dataset.iloc[index]
        features = torch.tensor(np.int64(row))
        label = self.labels.iloc[index]
        return features, label

class TestDataset(data.Dataset):
    """Dataset for the test split that also hands back the untouched row.

    The first column of every row is left out of the feature tensor (in this
    notebook that column is the sample `hash`), but the complete row is still
    returned so the caller can recover it.
    """

    def __init__(self, dataset, labels):
        self.dataset = dataset
        self.labels = labels

    def __len__(self):
        """Return the number of rows in the table."""
        return len(self.dataset)

    def __getitem__(self, index):
        """Return `(features, label, raw_row)` for the row at position `index`.

        `features` is an int64 tensor built from every column after the first;
        `raw_row` is the whole row, first column included, as a plain list.
        """
        row = self.dataset.iloc[index]
        syscall = list(row.values)
        features = torch.tensor(np.int64(row[1:]))
        label = self.labels.iloc[index]
        return features, label, syscall

    
# Wrap each split in a dataset and a DataLoader that yields mini-batches.
batch_size = 32

# Training batches are drawn in shuffled order.
training_set = Dataset(X_train, y_train)
train_loader = data.DataLoader(training_set, batch_size=batch_size, shuffle=True)

# Validation and test batches keep the row order of their splits.
validation_set = Dataset(X_val, y_val)
validation_loader = data.DataLoader(validation_set, batch_size=batch_size, shuffle=False)

test_set = TestDataset(X_test, y_test)
test_loader = data.DataLoader(test_set, batch_size=batch_size, shuffle=False)

## Neural Net

In [7]:
# Hyper-parameters
# Number of input features: every column of `df` except the two non-feature
# columns (`hash` and `malware`).
input_size = len(df.columns) - 2
hidden_size = 400        # units in each hidden layer
num_classes = 2          # size of the network's output layer
num_epochs = 5           # full passes over the training loader
learning_rate = 1e-3     # step size handed to the optimizer

In [8]:
# Fully connected network: two ReLU hidden layers followed by a linear output layer
class NeuralNet(nn.Module):
    """Feed-forward classifier with two hidden layers.

    Layout: Linear(input_size, hidden_size) -> ReLU ->
    Linear(hidden_size, hidden_size) -> ReLU -> Linear(hidden_size, num_classes).
    The output is the raw result of the last linear layer (no softmax applied).
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc3 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Cast `x` to float and run it through the three layers."""
        hidden = self.relu(self.fc1(x.float()))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)

# Build the network from the hyper-parameters, then move its weights to `device`.
model = NeuralNet(input_size, hidden_size, num_classes)
model = model.to(device)


In [9]:
# Loss and optimizer: cross-entropy on the network outputs, minimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

## Train, Validate, Test

In [10]:
# Train the model
total_step = len(train_loader)  # number of mini-batches per epoch
print("Training")
for epoch in range(num_epochs):
    for step, (batch_data, batch_labels) in enumerate(train_loader, start=1):
        batch_data = batch_data.to(device)
        batch_labels = batch_labels.to(device)

        # Forward pass
        outputs = model(batch_data)
        loss = criterion(outputs, batch_labels)

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report once per epoch, right after the final mini-batch; the loss
        # shown is that of this last mini-batch only.
        if step == total_step:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, step, total_step, loss.item()))


Training
Epoch [1/5], Step [960/960], Loss: 0.0368
Epoch [2/5], Step [960/960], Loss: 0.0211
Epoch [3/5], Step [960/960], Loss: 0.1191
Epoch [4/5], Step [960/960], Loss: 0.0033
Epoch [5/5], Step [960/960], Loss: 0.0048


In [11]:
print("Validation")
correct = 0
total = 0
# Gradients are not needed for evaluation, so autograd is switched off.
with torch.no_grad():
    for batch_data, batch_labels in validation_loader:
        batch_data = batch_data.to(device)
        batch_labels = batch_labels.to(device)

        outputs = model(batch_data)
        # Index of the largest output per row is the predicted class.
        predicted = torch.max(outputs.data, 1)[1]
        total += batch_labels.size(0)
        correct += (predicted == batch_labels).sum().item()

print('Accuracy of the network on the {} validation data: {} %'.format(X_val.shape[0],
                                                                       100 * correct / total))


Validation
Accuracy of the network on the 4475 validation data: 98.52513966480447 %


In [12]:
print("Testing")
# (hash, predicted label, true label) for EVERY test sample. Previously the
# append sat inside the "prediction is correct" branch, so misclassified
# samples were silently left out of the results.
tofile = []
# Predictions as plain Python ints, in test-loader order (the loader is not
# shuffled, so this lines up with y_test). Plain ints, rather than 0-dim
# tensors, stay usable by scikit-learn even when the model runs on CUDA.
y_pred = []
with torch.no_grad():
    correct = 0
    total = 0
    for batch_data, batch_labels, syscall in test_loader:

        batch_data, batch_labels = batch_data.to(device), batch_labels.to(device)

        outputs = model(batch_data)
        # Index of the largest output per row is the predicted class.
        _, predicted = torch.max(outputs, 1)

        batch_predictions = predicted.cpu().tolist()
        batch_actuals = batch_labels.cpu().tolist()
        y_pred.extend(batch_predictions)
        total += len(batch_actuals)

        # syscall[0] holds the first column (the hash) of every row in the batch.
        for hash_str, prediction, actual in zip(syscall[0], batch_predictions, batch_actuals):
            if prediction == actual:
                correct += 1
            tofile.append((hash_str, prediction, actual))

    print('Accuracy of the network on the {} testing data: {} %'.format(X_test.shape[0],
                                                                        100 * correct / total))


Testing
Accuracy of the network on the 8688 testing data: 98.51519337016575 %


## Evaluate using F1, precision, recall, accuracy

In [13]:
# Macro-averaged precision, recall and F1 of the test predictions; the fourth
# return value is not used.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_pred, average='macro'
)


In [14]:
# Precision from the precision_recall_fscore_support call (average='macro').
print("precision is: ", precision)

precision is:  0.9103057889822597


In [15]:
# Recall from the precision_recall_fscore_support call (average='macro').
print("recall is: ", recall)

recall is:  0.7380550653994021


In [16]:
# F1 score from the precision_recall_fscore_support call (average='macro').
print("f1 score is: ", f1)

f1 score is:  0.8001677108327183


## Write to file

In [17]:
# Write one tab-separated line per recorded test sample: hash, predicted label,
# true label. The `with` block closes the file even if a write raises, which
# the previous open()/close() pair did not guarantee.
with open("classification.txt", "w") as f:
    for hash_str, prediction, actual in tofile:
        f.write(f"{hash_str}\t{prediction}\t{actual}\n")