In [1]:
import os
import numpy as np
import pandas as pd
import networkx as nx
from tqdm import tqdm
from datetime import datetime
from joblib import Parallel, delayed

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torch_geometric as pyg
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader

from models import GCN
from utils import read_pickle, split_df, get_file_paths, DataProcessor

print('Versions')
print('-' * 20)
print(f'{"Pytorch":<20} {torch.__version__}')
print(f'{"Pytorch-Geometric":<20} {pyg.__version__}')
print(f'{"Pytorch-Lightning":<20} {pl.__version__}')
print(f'{"MLFlow":<20} {mlflow.__version__}')
print('-' * 20)
print(f'{"GPU Support":<20} {torch.cuda.is_available()}')

Versions
--------------------
Pytorch              1.13.1+cu117
Pytorch-Geometric    2.2.0
Pytorch-Lightning    1.8.6
MLFlow               2.3.1
--------------------
GPU Support          True


In [2]:
detector = True

data_info = 'dataset.csv'
fcg_dir = 'FCGs/'
w2v_path = 'model_saved/word2vec.wordvectors'
gnn_path = 'model_saved/gcn.pt'

fpaths = get_file_paths(fcg_dir)
fnames = [os.path.splitext(os.path.basename(path))[0] for path in fpaths]
exist_df = pd.DataFrame({'sha256': fnames, 'path': fpaths})

islab_names = {'sha256': str, 'family': str, 'threshold': str, 'arch': str}
islab_df = pd.read_csv(data_info, low_memory=False, names=islab_names, dtype=islab_names, skiprows=1)
islab_df = islab_df[~islab_df.family.isin(['Unknown'])]
ds_df = islab_df[islab_df.arch.isin(['x86el', 'x86_64el'])]
ds_df = exist_df.merge(ds_df, how='inner')

mal_dict = {fam: int(fam!='BenignWare') for fam in ds_df.family.unique()}
family_dict = {fam: i for i, fam in enumerate(ds_df.family.unique())}

ds_df = ds_df.assign(label=ds_df.family.map(family_dict))
ds_df = ds_df.assign(malicious=ds_df.family.map(mal_dict))
ds_df = ds_df.reset_index(drop=True)
ds_df.head()

Unnamed: 0,sha256,path,family,threshold,arch,label,malicious
0,1263d19ea264fd53e7d21755752b37101ba247ed6d0e24...,FCGs/mal_graphs/1263d19ea264fd53e7d21755752b37...,Mirai,True,x86el,0,1
1,48309f7ef98e9597eedacc02fba7ba3761e6f00712adbb...,FCGs/mal_graphs/48309f7ef98e9597eedacc02fba7ba...,Mirai,True,x86el,0,1
2,f399487df0dd96d02f6a4a95b1dd4f6e1934d92463d06f...,FCGs/mal_graphs/f399487df0dd96d02f6a4a95b1dd4f...,Mirai,True,x86el,0,1
3,a9451891dd42875fb275a14cf7b5970d3de488f6557d12...,FCGs/mal_graphs/a9451891dd42875fb275a14cf7b597...,Bashlite,True,x86el,1,1
4,80dca6a3359ca5becb1d1bf0cf405249b6200caa2c97bd...,FCGs/mal_graphs/80dca6a3359ca5becb1d1bf0cf4052...,Mirai,True,x86el,0,1


In [3]:
with open('names', 'r') as f:
    names = [name.strip() for name in f.readlines()]

ds_islab = ds_df[ds_df.sha256.isin(names)].reset_index(drop=True)
ds_islab

Unnamed: 0,sha256,path,family,threshold,arch,label,malicious
0,1263d19ea264fd53e7d21755752b37101ba247ed6d0e24...,FCGs/mal_graphs/1263d19ea264fd53e7d21755752b37...,Mirai,True,x86el,0,1
1,24d1f3cf1b59b3820c646e9321b01421886b85fa593858...,FCGs/mal_graphs/24d1f3cf1b59b3820c646e9321b014...,Mirai,True,x86el,0,1
2,07545ca8f66e704fd2ef2ae9e126a237d19c8816de6c83...,FCGs/mal_graphs/07545ca8f66e704fd2ef2ae9e126a2...,Bashlite,True,x86_64el,1,1
3,15b2d0442ce4c23ff41253a91fa4917645c1263c4d7e76...,FCGs/mal_graphs/15b2d0442ce4c23ff41253a91fa491...,Bashlite,True,x86el,1,1
4,13b374a94a2694302e4cc9efb396ff6b80cda757b4a424...,FCGs/mal_graphs/13b374a94a2694302e4cc9efb396ff...,Mirai,True,x86el,0,1
...,...,...,...,...,...,...,...
4960,18205bf131a3c053f5f3bbc7fa7acaeeaf00fc1874bf7a...,FCGs/mal_graphs/18205bf131a3c053f5f3bbc7fa7aca...,Bashlite,True,x86_64el,1,1
4961,1c868ea2c62e0b907c73daf7c7b294ad1dd92afe629532...,FCGs/mal_graphs/1c868ea2c62e0b907c73daf7c7b294...,Bashlite,False,x86_64el,1,1
4962,15a8a38894c8a88657ab2ee9eb68b13e07cc62d3c730a3...,FCGs/mal_graphs/15a8a38894c8a88657ab2ee9eb68b1...,Mirai,True,x86el,0,1
4963,25d667a13111b107bb2da8d428e47cfa5c265b88ad508f...,FCGs/mal_graphs/25d667a13111b107bb2da8d428e47c...,Bashlite,True,x86el,1,1


In [9]:
ds_df.family.value_counts()

BenignWare    14659
Bashlite      12541
Mirai         11552
Android        1993
Tsunami        1402
Xorddos         603
Dofloo          594
Pnscan           13
Name: family, dtype: int64

In [10]:
processor = DataProcessor()
processor.load(w2v_path)

def get_data(path, label):
    G = read_pickle(path)
    if len(G.edges) == 0:
        # will be filtered out later
        return None
    
    x, edge_index = processor.from_networkx(G)
    return Data(x=x, edge_index=edge_index, y=label)

In [11]:
num_train = 10000
num_valid = 2000
workers = 20
column = 'malicious' if detector else 'label'

train_df, valid_df = split_df(df=ds_df,    n_or_frac=num_train, column=column, shuffle=True, allow_lower_n=True)
valid_df, test_df  = split_df(df=valid_df, n_or_frac=num_valid, column=column, shuffle=True, allow_lower_n=True)
train_pairs = train_df[['path', column]].to_numpy()
valid_pairs = valid_df[['path', column]].to_numpy()

print('Processing Training ...')
train_ds = Parallel(n_jobs=workers)(delayed(get_data)(path, label) for path, label in tqdm(train_pairs))
train_ds = [data for data in train_ds if not data is None]
print('Processing Validation ...')
valid_ds = Parallel(n_jobs=workers)(delayed(get_data)(path, label) for path, label in tqdm(valid_pairs))
valid_ds = [data for data in valid_ds if not data is None]

print(f'Train in total: {len(train_ds)}')
print(f'Valid in total: {len(valid_ds)}')

Processing Training ...


100%|██████████| 20000/20000 [13:42<00:00, 24.31it/s]


Processing Validation ...


100%|██████████| 4000/4000 [02:26<00:00, 27.22it/s]


Train in total: 19576
Valid in total: 3906


In [12]:
def function_timer(some_function):
    from time import time

    def wrapper(*args, **kwargs):
        t1 = time()
        result = some_function(*args, **kwargs)
        end = time()-t1
        return result, end
    return wrapper

@function_timer
def train():
    model.train()

    for data in train_loader:                             # Iterate in batches over the training dataset.
        data.to(device)                                   # Train the data if gpu is available
        out = model(data.x, data.edge_index, data.batch)  # Perform a single forward pass.
        loss = criterion(out, data.y)                     # Compute the loss.
        # print(help(loss))
        loss.backward()                                   # Derive gradients.
        optimizer.step()                                  # Update parameters based on gradients.
        optimizer.zero_grad()                             # Clear gradients.

def test(loader):
    model.eval()

    loss = 0
    correct = 0
    for data in loader:                                   # Iterate in batches over the training/test dataset.
        data.to(device)                                   # Train the data if gpu is available
        out = model(data.x, data.edge_index, data.batch)  # Predict the outcome by trained model
        pred = out.argmax(dim=1)                          # Use the class with highest probability.
        
        correct += int((pred == data.y).sum())            # Check against ground-truth labels.
        loss += criterion(out, data.y).item()             # Get the loss accumulated of each data sample
        
    acc = correct / len(loader.dataset)                   # Get the accuracy
    avg_loss = loss / len(loader.dataset)                 # Get the average loss
    
    return (acc, avg_loss)                                # Return the accuracy and average loss

In [13]:
batch_size = 64

train_loader = DataLoader(train_ds, batch_size=batch_size, num_workers=0, drop_last=True, shuffle=True, pin_memory=True)
valid_loader = DataLoader(valid_ds, batch_size=128, num_workers=0, drop_last=True)

In [14]:
data_sample = train_loader.dataset[0]
# data_sample.num_node_features
# data_sample.num_classes

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN(num_features=data_sample.num_node_features, hidden_channels=64, num_classes=len(ds_df[column].unique())).to(device)
print(f'Device: {device}')
print()
print(f'Model:\n{model}')

Device: cuda

Model:
GCN(
  (conv1): GCNConv(128, 64)
  (conv2): GCNConv(64, 64)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)


In [15]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()
epochs = 20

In [17]:
### WARNING: If computer get overheated, the training process will become unstable

## Initialize the numpy array for accuracy and loss
train_acc = np.zeros(epochs)
train_loss = np.zeros(epochs)
val_acc = np.zeros(epochs)
val_loss = np.zeros(epochs)

best_val_acc = 0
for epoch in range(epochs):
    epoch_start = f'Epoch ({epoch + 1}/{epochs})'
    print(epoch_start, end=' ')
    
    # since train() returns nothing, so ignore the return with '_' and fetch the time taken
    _, _time = train()
    
    # evaluate the training accuracy and validation accuracy after epoch epoch
    train_acc[epoch], train_loss[epoch] = test(train_loader)
    val_acc[epoch], val_loss[epoch] = test(valid_loader)
    
    if val_acc[epoch] > best_val_acc:
        # save the best model according to validation accuracy
        best_val_acc = val_acc[epoch]
        torch.save(model, gnn_path)
    
    print(f'Train Acc: {train_acc[epoch]:.4f}, Train Loss: {train_loss[epoch]:>7.6f}', end=', ')
    print(f'Val Acc: {val_acc[epoch]:.4f}, Val Loss: {val_loss[epoch]:>7.6f}', end=' -- ')
    print(f'Training Time: {_time:.2f}s')

Epoch (1/10) Train Acc: 0.9841, Train Loss: 0.050839, Val Acc: 0.9711, Val Loss: 0.047710 -- Training Time: 6.52s
Epoch (2/10) Train Acc: 0.9864, Train Loss: 0.033898, Val Acc: 0.9741, Val Loss: 0.030060 -- Training Time: 6.52s
Epoch (3/10) Train Acc: 0.9868, Train Loss: 0.030590, Val Acc: 0.9721, Val Loss: 0.022408 -- Training Time: 6.52s
Epoch (4/10) Train Acc: 0.9900, Train Loss: 0.014174, Val Acc: 0.9759, Val Loss: 0.009709 -- Training Time: 6.52s
Epoch (5/10) Train Acc: 0.9844, Train Loss: 0.029161, Val Acc: 0.9713, Val Loss: 0.025258 -- Training Time: 6.58s
Epoch (6/10) Train Acc: 0.9900, Train Loss: 0.013536, Val Acc: 0.9752, Val Loss: 0.008985 -- Training Time: 6.52s
Epoch (7/10) Train Acc: 0.9921, Train Loss: 0.007104, Val Acc: 0.9767, Val Loss: 0.004820 -- Training Time: 6.52s
Epoch (8/10) Train Acc: 0.9936, Train Loss: 0.010532, Val Acc: 0.9777, Val Loss: 0.007781 -- Training Time: 6.52s
Epoch (9/10) Train Acc: 0.9863, Train Loss: 0.018780, Val Acc: 0.9716, Val Loss: 0.01344