In [1]:
import os
import numpy as np
import pandas as pd
import networkx as nx
from tqdm import tqdm
from datetime import datetime
from joblib import Parallel, delayed

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torch_geometric as pyg
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader

from models import GCN
from utils import read_pickle, split_df, get_file_paths, DataProcessor

print('Versions')
print('-' * 20)
print(f'{"Pytorch":<20} {torch.__version__}')
print(f'{"Pytorch-Geometric":<20} {pyg.__version__}')
print('-' * 20)
print(f'{"GPU Support":<20} {torch.cuda.is_available()}')

Versions
--------------------
Pytorch              1.13.1+cu117
Pytorch-Geometric    2.2.0
--------------------
GPU Support          True


In [2]:
detector = True

data_info = 'dataset.csv'
fcg_dir = 'FCGs/'
w2v_path = 'model_saved/word2vec.wordvectors'
gnn_path = 'model_saved/gcn.pt'

fpaths = get_file_paths(fcg_dir)
fnames = [os.path.splitext(os.path.basename(path))[0] for path in fpaths]
exist_df = pd.DataFrame({'sha256': fnames, 'path': fpaths})

islab_names = {'sha256': str, 'family': str, 'threshold': str, 'arch': str}
islab_df = pd.read_csv(data_info, low_memory=False, names=islab_names, dtype=islab_names, skiprows=1)
islab_df = islab_df[~islab_df.family.isin(['Unknown'])]
ds_df = islab_df[islab_df.arch.isin(['x86el', 'x86_64el'])]
ds_df = exist_df.merge(ds_df, how='inner')

mal_dict = {fam: int(fam!='BenignWare') for fam in ds_df.family.unique()}
family_dict = {fam: i for i, fam in enumerate(ds_df.family.unique())}

ds_df = ds_df.assign(label=ds_df.family.map(family_dict))
ds_df = ds_df.assign(malicious=ds_df.family.map(mal_dict))
ds_df = ds_df.reset_index(drop=True)
ds_df.head()

Unnamed: 0,sha256,path,family,threshold,arch,label,malicious
0,1263d19ea264fd53e7d21755752b37101ba247ed6d0e24...,FCGs/mal_graphs/1263d19ea264fd53e7d21755752b37...,Mirai,True,x86el,0,1
1,48309f7ef98e9597eedacc02fba7ba3761e6f00712adbb...,FCGs/mal_graphs/48309f7ef98e9597eedacc02fba7ba...,Mirai,True,x86el,0,1
2,f399487df0dd96d02f6a4a95b1dd4f6e1934d92463d06f...,FCGs/mal_graphs/f399487df0dd96d02f6a4a95b1dd4f...,Mirai,True,x86el,0,1
3,a9451891dd42875fb275a14cf7b5970d3de488f6557d12...,FCGs/mal_graphs/a9451891dd42875fb275a14cf7b597...,Bashlite,True,x86el,1,1
4,80dca6a3359ca5becb1d1bf0cf405249b6200caa2c97bd...,FCGs/mal_graphs/80dca6a3359ca5becb1d1bf0cf4052...,Mirai,True,x86el,0,1


In [3]:
ds_df.family.value_counts()

BenignWare    14659
Bashlite      12541
Mirai         11552
Android        1993
Tsunami        1402
Xorddos         603
Dofloo          594
Pnscan           13
Name: family, dtype: int64

In [4]:
processor = DataProcessor()
processor.load(w2v_path)

def get_data(path, label):
    G = read_pickle(path)
    if len(G.edges) == 0:
        # will be filtered out later
        return None
    
    x, edge_index = processor.from_networkx(G)
    return Data(x=x, edge_index=edge_index, y=label)

In [5]:
num_train = 10
num_valid = 200
workers = 20
column = 'malicious' if detector else 'label'

train_df, valid_df = split_df(df=ds_df,    n_or_frac=num_train, column=column, shuffle=True, allow_lower_n=True)
valid_df, test_df  = split_df(df=valid_df, n_or_frac=num_valid, column=column, shuffle=True, allow_lower_n=True)
train_pairs = train_df[['path', column]].to_numpy()
valid_pairs = valid_df[['path', column]].to_numpy()

print('Processing Training ...')
train_ds = Parallel(n_jobs=workers)(delayed(get_data)(path, label) for path, label in tqdm(train_pairs))
train_ds = [data for data in train_ds if not data is None]
print('Processing Validation ...')
valid_ds = Parallel(n_jobs=workers)(delayed(get_data)(path, label) for path, label in tqdm(valid_pairs))
valid_ds = [data for data in valid_ds if not data is None]

print(f'Train in total: {len(train_ds)}')
print(f'Valid in total: {len(valid_ds)}')

Processing Training ...


100%|██████████| 20/20 [00:00<00:00, 70.20it/s]


Processing Validation ...


100%|██████████| 400/400 [00:12<00:00, 31.19it/s]


Train in total: 20
Valid in total: 388


In [11]:
from time import time

class TorchTrainer:
    def __init__(self, model, optimizer=None, criterion=None, device=None):
        self.model     = model
        self.optimizer = optimizer
        self.criterion = criterion
        self.device    = torch.device('cuda' if torch.cuda.is_available() else 'cpu') if device is None else device
                
        self.arguments = locals()
        self.arguments['device'] = self.device
    
    def train(self, train_loader, valid_loader, epochs=20, save_path='model_saved/gcn_model.pt', verbose=True):
        self.arguments['epochs'] = epochs
        self.arguments['save_path'] = save_path
        
        train_acc  = np.zeros(epochs)
        train_loss = np.zeros(epochs)
        val_acc    = np.zeros(epochs)
        val_loss   = np.zeros(epochs)
        train_time = np.zeros(epochs)
        
        best_val_acc = 0
        for epoch in range(epochs):
            if verbose:
                epoch_start = f'Epoch ({epoch + 1}/{epochs})'
                print(epoch_start, end=' ')

            train_time[epoch] = self.train_epoch(train_loader)

            # evaluate the training accuracy and validation accuracy after each epoch
            train_acc[epoch], train_loss[epoch] = self.test(train_loader)
            val_acc[epoch], val_loss[epoch] = self.test(valid_loader)

            if val_acc[epoch] > best_val_acc:
                # save the best model according to validation accuracy
                best_val_acc = val_acc[epoch]
                torch.save(self.model, save_path)
            
            if verbose:
                print(f'Train Acc: {train_acc[epoch]:.4f}, Train Loss: {train_loss[epoch]:>7.6f}', end=', ')
                print(f'Val Acc: {val_acc[epoch]:.4f}, Val Loss: {val_loss[epoch]:>7.6f}', end=' -- ')
                print(f'Training Time: {train_time[epoch]:.2f}s')
        
        self.history = {'train_acc':  train_acc, 
                        'train_loss': train_loss, 
                        'val_acc':    val_acc, 
                        'val_loss':   val_loss, 
                        'time':       train_time}

    def train_epoch(self, train_loader):
        start = time()
        
        self.model.train()
        for data in train_loader:               # Iterate in batches over the training dataset.
            data.to(self.device)                # Train the data if gpu is available
            out = self.model(data)              # Perform a single forward pass.
            loss = self.criterion(out, data.y)  # Compute the loss.
            
            loss.backward()                     # Derive gradients.
            self.optimizer.step()               # Update parameters based on gradients.
            self.optimizer.zero_grad()          # Clear gradients.
        
        end = time()
        return end - start

    def test(self, loader):
        self.model.eval()

        loss = 0
        correct = 0
        for data in loader:                             # Iterate in batches over the training/test dataset.
            data.to(self.device)                        # Train the data if gpu is available
            out = self.model(data)                      # Predict the outcome by trained model
            pred = out.argmax(dim=1)                    # Use the class with highest probability.

            correct += int((pred == data.y).sum())      # Check against ground-truth labels.
            loss += self.criterion(out, data.y).item()  # Get the loss accumulated of each data sample

        acc = correct / len(loader.dataset)             # Get the accuracy
        avg_loss = loss / len(loader.dataset)           # Get the average loss
        return (acc, avg_loss)                          # Return the accuracy and average loss
    
    def load(self, path):
        self.model = torch.load(path)
        self.model.eval()

    def predict(self, loader):
        preds = []
        with torch.no_grad():
            for data in loader:
                data.to(self.device)
                pred = self.model.predict_prob(data).cpu().detach()
                preds.append(pred)
        preds = torch.vstack(preds)
        return preds

In [7]:
epochs     = 20
batch_size = 64
device     = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

num_features = train_ds[0].num_node_features
num_classes  = len(ds_df[column].unique())
train_loader = DataLoader(train_ds, batch_size=batch_size, num_workers=0, drop_last=True, shuffle=True, pin_memory=True)
valid_loader = DataLoader(valid_ds, batch_size=128, num_workers=0, drop_last=True)

In [8]:
model = GCN(num_features=num_features, hidden_channels=64, num_classes=num_classes).to(device)

# define device of model before sending to the optimizer model.parameters() 
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

print(f'Device: {device}\n')
print(f'Model: \n{model}\n')
print(f'Optimizer: \n{optimizer}\n')
print(f'Criterion: {criterion}\n')

Device: cuda

Model: 
GCN(
  (conv1): GCNConv(128, 64)
  (conv2): GCNConv(64, 64)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)

Optimizer: 
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: False
    lr: 0.01
    maximize: False
    weight_decay: 0
)

Criterion: CrossEntropyLoss()



In [9]:
trainer = TorchTrainer(model, optimizer=optimizer, criterion=criterion, device=device)
trainer.train(train_loader=train_loader, 
              valid_loader=valid_loader, 
              epochs=20, 
              save_path='model_saved/gcn_model.pt', 
              verbose=True)

Epoch (1/20) Train Acc: 0.9868, Train Loss: 0.001252, Val Acc: 0.9709, Val Loss: 0.000903 -- Training Time: 2.34s
Epoch (2/20) Train Acc: 0.9849, Train Loss: 0.000917, Val Acc: 0.9699, Val Loss: 0.000723 -- Training Time: 1.70s
Epoch (3/20) Train Acc: 0.9837, Train Loss: 0.001079, Val Acc: 0.9694, Val Loss: 0.000656 -- Training Time: 1.76s
Epoch (4/20) Train Acc: 0.9815, Train Loss: 0.001155, Val Acc: 0.9694, Val Loss: 0.000624 -- Training Time: 2.02s
Epoch (5/20) Train Acc: 0.9760, Train Loss: 0.173333, Val Acc: 0.9591, Val Loss: 0.114253 -- Training Time: 2.01s
Epoch (6/20) Train Acc: 0.9608, Train Loss: 0.042015, Val Acc: 0.9401, Val Loss: 0.030211 -- Training Time: 2.03s
Epoch (7/20) Train Acc: 0.9835, Train Loss: 0.056541, Val Acc: 0.9663, Val Loss: 0.022814 -- Training Time: 1.83s
Epoch (8/20) Train Acc: 0.9898, Train Loss: 0.015491, Val Acc: 0.9743, Val Loss: 0.010488 -- Training Time: 1.93s
Epoch (9/20) Train Acc: 0.9727, Train Loss: 0.296055, Val Acc: 0.9607, Val Loss: 0.12309

### prediction

In [12]:
new_model = GCN(num_features=num_features, hidden_channels=64, num_classes=num_classes).to(device)
new_trainer = TorchTrainer(new_model)
new_trainer.load('model_saved/gcn_model.pt')

In [21]:
predictions = new_trainer.predict(valid_loader)
predictions.argmax(dim=1)
# predictions.shape

tensor([0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
        1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0,
        1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
        0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1,
        0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1,
        0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1,
        0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
        1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1,
        1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1,
        1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
        0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0,

In [22]:
real_label = [data.y for data in valid_loader.dataset]
torch.tensor(real_label)

tensor([0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
        1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0,
        1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
        0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1,
        0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1,
        0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1,
        0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
        1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1,
        1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1,
        1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0,

In [21]:
def function_timer(some_function):
    from time import time

    def wrapper(*args, **kwargs):
        t1 = time()
        result = some_function(*args, **kwargs)
        end = time()-t1
        return result, end
    return wrapper

@function_timer
def train():
    model.train()

    for data in train_loader:                             # Iterate in batches over the training dataset.
        data.to(device)                                   # Train the data if gpu is available
        out = model(data.x, data.edge_index, data.batch)  # Perform a single forward pass.
        loss = criterion(out, data.y)                     # Compute the loss.
        # print(help(loss))
        loss.backward()                                   # Derive gradients.
        optimizer.step()                                  # Update parameters based on gradients.
        optimizer.zero_grad()                             # Clear gradients.

def test(loader):
    model.eval()

    loss = 0
    correct = 0
    for data in loader:                                   # Iterate in batches over the training/test dataset.
        data.to(device)                                   # Train the data if gpu is available
        out = model(data.x, data.edge_index, data.batch)  # Predict the outcome by trained model
        pred = out.argmax(dim=1)                          # Use the class with highest probability.
        
        correct += int((pred == data.y).sum())            # Check against ground-truth labels.
        loss += criterion(out, data.y).item()             # Get the loss accumulated of each data sample
        
    acc = correct / len(loader.dataset)                   # Get the accuracy
    avg_loss = loss / len(loader.dataset)                 # Get the average loss
    
    return (acc, avg_loss)                                # Return the accuracy and average loss

In [22]:
batch_size = 64

train_loader = DataLoader(train_ds, batch_size=batch_size, num_workers=0, drop_last=True, shuffle=True, pin_memory=True)
valid_loader = DataLoader(valid_ds, batch_size=128, num_workers=0, drop_last=True)

In [23]:
data_sample = train_loader.dataset[0]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN(num_features=data_sample.num_node_features, hidden_channels=64, num_classes=len(ds_df[column].unique())).to(device)
print(f'Device: {device}')
print()
print(f'Model:\n{model}')

Device: cuda

Model:
GCN(
  (conv1): GCNConv(128, 64)
  (conv2): GCNConv(64, 64)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)


In [24]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()
epochs = 20

In [25]:
### WARNING: If computer get overheated, the training process will become unstable

## Initialize the numpy array for accuracy and loss
train_acc = np.zeros(epochs)
train_loss = np.zeros(epochs)
val_acc = np.zeros(epochs)
val_loss = np.zeros(epochs)

best_val_acc = 0
for epoch in range(epochs):
    epoch_start = f'Epoch ({epoch + 1}/{epochs})'
    print(epoch_start, end=' ')
    
    # since train() returns nothing, so ignore the return with '_' and fetch the time taken
    _, _time = train()
    
    # evaluate the training accuracy and validation accuracy after epoch epoch
    train_acc[epoch], train_loss[epoch] = test(train_loader)
    val_acc[epoch], val_loss[epoch] = test(valid_loader)
    
    if val_acc[epoch] > best_val_acc:
        # save the best model according to validation accuracy
        best_val_acc = val_acc[epoch]
        torch.save(model, gnn_path)
    
    print(f'Train Acc: {train_acc[epoch]:.4f}, Train Loss: {train_loss[epoch]:>7.6f}', end=', ')
    print(f'Val Acc: {val_acc[epoch]:.4f}, Val Loss: {val_loss[epoch]:>7.6f}', end=' -- ')
    print(f'Training Time: {_time:.2f}s')

Epoch (1/20) Train Acc: 0.7865, Train Loss: 0.008178, Val Acc: 0.7808, Val Loss: 0.003963 -- Training Time: 1.67s
Epoch (2/20) Train Acc: 0.7868, Train Loss: 0.008174, Val Acc: 0.7808, Val Loss: 0.003963 -- Training Time: 1.37s
Epoch (3/20) Train Acc: 0.7866, Train Loss: 0.008176, Val Acc: 0.7808, Val Loss: 0.003963 -- Training Time: 1.38s
Epoch (4/20) 

KeyboardInterrupt: 