In [1]:
import os
import numpy as np
import pandas as pd
import networkx as nx
from tqdm import tqdm
from datetime import datetime
from joblib import Parallel, delayed

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torch_geometric as pyg
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader

from models import GCN, TorchTrainer
from utils import read_pickle, split_df, get_file_paths, DataProcessor

# Record the library versions and GPU availability this run was produced with.
separator = '-' * 20

print('Versions')
print(separator)
print(f'{"Pytorch":<20} {torch.__version__}')
print(f'{"Pytorch-Geometric":<20} {pyg.__version__}')
print(separator)
print(f'{"GPU Support":<20} {torch.cuda.is_available()}')

Versions
--------------------
Pytorch              1.13.1+cu117
Pytorch-Geometric    2.2.0
--------------------
GPU Support          True


In [2]:
# --- Configuration -----------------------------------------------------------
# True  -> train on the binary `malicious` column,
# False -> train on the per-family `label` column (see the `column` selection
# in the dataset-building cell).
detector = True

data_info = 'dataset.csv'
fcg_dir = 'FCGs/'
w2v_path = 'model_saved/word2vec.wordvectors'
# Kept identical to the checkpoint path that the training cell saves to and
# the evaluation cells load from, so the configuration reflects what is used.
gnn_path = 'model_saved/gcn_model.pt'

# --- Graph files present on disk ----------------------------------------------
# The file name without its extension is used as the sample's sha256 key.
fpaths = get_file_paths(fcg_dir)
fnames = [os.path.splitext(os.path.basename(path))[0] for path in fpaths]
exist_df = pd.DataFrame({'sha256': fnames, 'path': fpaths})

# --- Sample metadata -----------------------------------------------------------
# The first row of the CSV is skipped and the column names/dtypes below are
# imposed instead.
islab_names = {'sha256': str, 'family': str, 'threshold': str, 'arch': str}
islab_df = pd.read_csv(data_info, low_memory=False, names=list(islab_names), dtype=islab_names, skiprows=1)
islab_df = islab_df[~islab_df.family.isin(['Unknown'])]
ds_df = islab_df[islab_df.arch.isin(['x86el', 'x86_64el'])]
# Keep only samples that have both metadata and a graph file; the join key is
# spelled out so an accidentally shared column can never change the result.
ds_df = exist_df.merge(ds_df, how='inner', on='sha256')

# --- Targets -------------------------------------------------------------------
# malicious: 0 for 'BenignWare', 1 for every other family.
# label:     integer id per family, in order of first appearance.
mal_dict = {fam: int(fam!='BenignWare') for fam in ds_df.family.unique()}
family_dict = {fam: i for i, fam in enumerate(ds_df.family.unique())}

ds_df = ds_df.assign(label=ds_df.family.map(family_dict))
ds_df = ds_df.assign(malicious=ds_df.family.map(mal_dict))
ds_df = ds_df.reset_index(drop=True)
ds_df.head()

Unnamed: 0,sha256,path,family,threshold,arch,label,malicious
0,1263d19ea264fd53e7d21755752b37101ba247ed6d0e24...,FCGs/mal_graphs/1263d19ea264fd53e7d21755752b37...,Mirai,True,x86el,0,1
1,48309f7ef98e9597eedacc02fba7ba3761e6f00712adbb...,FCGs/mal_graphs/48309f7ef98e9597eedacc02fba7ba...,Mirai,True,x86el,0,1
2,f399487df0dd96d02f6a4a95b1dd4f6e1934d92463d06f...,FCGs/mal_graphs/f399487df0dd96d02f6a4a95b1dd4f...,Mirai,True,x86el,0,1
3,a9451891dd42875fb275a14cf7b5970d3de488f6557d12...,FCGs/mal_graphs/a9451891dd42875fb275a14cf7b597...,Bashlite,True,x86el,1,1
4,80dca6a3359ca5becb1d1bf0cf405249b6200caa2c97bd...,FCGs/mal_graphs/80dca6a3359ca5becb1d1bf0cf4052...,Mirai,True,x86el,0,1


In [3]:
# Class balance: number of samples per family.
ds_df['family'].value_counts()

BenignWare    16649
Bashlite      12541
Mirai         11552
Android        1993
Tsunami        1402
Xorddos         603
Dofloo          594
Pnscan           13
Name: family, dtype: int64

In [4]:
# Shared DataProcessor used by get_data below; it is initialised from the file
# at w2v_path (presumably the trained word2vec vectors — confirm in utils).
processor = DataProcessor()
processor.load(w2v_path)

def get_data(path, label=None):
    """Turn the graph stored at ``path`` into a PyG ``Data`` object.

    The graph is read with ``read_pickle`` and converted to node features and
    an edge index by the module-level ``processor``.

    Args:
        path: Location of the stored graph.
        label: Optional target; attached as ``y`` only when it is not ``None``.

    Returns:
        A ``Data`` instance, or ``None`` when the graph has no edges (callers
        filter these out afterwards).
    """
    graph = read_pickle(path)
    if not len(graph.edges):
        # Edge-less graphs are skipped; the caller drops the None entries.
        return None

    features, connectivity = processor.from_networkx(graph)
    fields = {'x': features, 'edge_index': connectivity}
    if label is not None:
        fields['y'] = label
    return Data(**fields)

In [5]:
num_train = 0.8  # passed to split_df as n_or_frac (a fraction here, not a count)
workers = 20     # joblib workers used to convert graphs in parallel
column = 'malicious' if detector else 'label'


def _build_dataset(pairs, n_jobs):
    """Convert ``(path, label)`` pairs into a list of ``Data`` objects.

    Each pair is handed to ``get_data`` in a joblib worker; entries for which
    it returned ``None`` (graphs without edges) are dropped.

    Args:
        pairs: Iterable of ``(path, label)`` pairs.
        n_jobs: Number of joblib workers.

    Returns:
        List of the non-``None`` results, in input order.
    """
    converted = Parallel(n_jobs=n_jobs)(
        delayed(get_data)(path, label) for path, label in tqdm(pairs)
    )
    return [data for data in converted if data is not None]


# NOTE(review): shuffle=True is requested and no seed is visible here, so the
# split may differ between runs — confirm whether split_df seeds internally.
# No separate test split is created; only train/validation are used.
train_df, valid_df = split_df(df=ds_df, n_or_frac=num_train, column=column, shuffle=True, allow_lower_n=True)
train_pairs = train_df[['path', column]].to_numpy()
valid_pairs = valid_df[['path', column]].to_numpy()

print('Processing Training ...')
train_ds = _build_dataset(train_pairs, workers)
print('Processing Validation ...')
valid_ds = _build_dataset(valid_pairs, workers)

print(f'Train in total: {len(train_ds)}')
print(f'Valid in total: {len(valid_ds)}')

Processing Training ...


100%|██████████| 36277/36277 [24:25<00:00, 24.76it/s]


Processing Validation ...


100%|██████████| 9070/9070 [05:25<00:00, 27.85it/s]


Train in total: 35281
Valid in total: 8847


In [8]:
# Training hyper-parameters.
epochs = 20
batch_size = 64

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Model dimensions are derived from the data rather than hard-coded.
num_classes = len(ds_df[column].unique())
num_features = train_ds[0].num_node_features

# Training batches are shuffled and the last incomplete batch is dropped;
# validation keeps every sample and uses a larger fixed batch size.
train_loader = DataLoader(
    train_ds,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
    num_workers=0,
    pin_memory=True,
)
valid_loader = DataLoader(
    valid_ds,
    batch_size=128,
    drop_last=False,
    num_workers=0,
)

In [16]:
# Move the model to its device first, so the optimizer is constructed over the
# parameters as they exist on that device.
model = GCN(num_features=num_features, hidden_channels=64, num_classes=num_classes)
model = model.to(device)

optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()

# Echo the full training setup so it is captured in the notebook output.
print(f'Device: {device}\n')
print(f'Model: \n{model}\n')
print(f'Optimizer: \n{optimizer}\n')
print(f'Criterion: {criterion}\n')

Device: cuda

Model: 
GCN(
  (conv1): GCNConv(128, 64)
  (conv2): GCNConv(64, 64)
  (lin1): Linear(in_features=64, out_features=128, bias=True)
  (lin2): Linear(in_features=128, out_features=128, bias=True)
  (lin): Linear(in_features=128, out_features=2, bias=True)
)

Optimizer: 
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: False
    lr: 0.0001
    maximize: False
    weight_decay: 0
)

Criterion: CrossEntropyLoss()



In [17]:
# Train the model; save_path is the same file the later evaluation cells load.
trainer = TorchTrainer(model, optimizer=optimizer, criterion=criterion, device=device)
trainer.train(train_loader=train_loader,
              valid_loader=valid_loader,
              # Use the configured epoch count rather than repeating the
              # literal, so the two can never silently diverge.
              epochs=epochs,
              save_path='model_saved/gcn_model.pt',
              verbose=True)

Epoch (1/20) Train Acc: 0.9040, Train Loss: 0.006430, Val Acc: 0.9055, Val Loss: 0.003211 -- Training Time: 3.97s
Epoch (2/20) Train Acc: 0.9112, Train Loss: 0.006277, Val Acc: 0.9135, Val Loss: 0.003130 -- Training Time: 3.61s
Epoch (3/20) Train Acc: 0.9155, Train Loss: 0.006201, Val Acc: 0.9167, Val Loss: 0.003101 -- Training Time: 3.67s
Epoch (4/20) Train Acc: 0.9162, Train Loss: 0.006188, Val Acc: 0.9172, Val Loss: 0.003096 -- Training Time: 3.85s
Epoch (5/20) Train Acc: 0.9168, Train Loss: 0.006179, Val Acc: 0.9179, Val Loss: 0.003092 -- Training Time: 2.95s
Epoch (6/20) Train Acc: 0.9162, Train Loss: 0.006186, Val Acc: 0.9160, Val Loss: 0.003101 -- Training Time: 2.59s
Epoch (7/20) Train Acc: 0.9150, Train Loss: 0.006206, Val Acc: 0.9151, Val Loss: 0.003114 -- Training Time: 2.53s
Epoch (8/20) Train Acc: 0.9775, Train Loss: 0.005226, Val Acc: 0.9811, Val Loss: 0.002596 -- Training Time: 2.88s
Epoch (9/20) Train Acc: 0.9840, Train Loss: 0.005123, Val Acc: 0.9861, Val Loss: 0.00255

In [18]:
# Score the trained model on the training loader. Note that train_loader was
# built with shuffle=True and drop_last=True, so the last partial batch is
# not included. The displayed tuple is presumably (accuracy, loss) — confirm
# against TorchTrainer.test.
trainer.test(train_loader)

(0.9885539437896645, 0.005053608490834697)

In [38]:
# Score the trained model on the validation loader (all samples are kept:
# drop_last=False). The displayed tuple is presumably (accuracy, loss) —
# confirm against TorchTrainer.test.
trainer.test(valid_loader)

(0.990036231884058, 0.0025243777680494213)

### Prediction: classification metrics on the training and validation sets

In [13]:
from utils import compute_metrices

def evaluate(loader, model, device='cuda'):
    """Run ``model`` over every batch of ``loader`` and collect predictions.

    The model is switched to evaluation mode for the duration of the call so
    that layers such as dropout or batch normalisation behave
    deterministically; its previous training/eval mode is restored afterwards.

    Args:
        loader: Iterable of batches. Each batch must provide ``.to(device)``
            and a label tensor ``.y``.
        model: Callable mapping a batch to a score tensor with the classes
            along dim 1. It is expected to already live on ``device``.
        device: Device each batch is moved to before the forward pass.

    Returns:
        Tuple ``(preds, label)`` of CPU tensors concatenated along dim 0 in
        loader order: the argmax class index per sample and the matching
        ground-truth labels. Both are empty ``long`` tensors when ``loader``
        yields no batches.
    """
    is_module = isinstance(model, torch.nn.Module)
    was_training = is_module and model.training
    if is_module:
        # Without this, dropout/batch-norm stay in training mode and distort
        # the reported metrics.
        model.eval()

    preds = []
    label = []
    try:
        with torch.no_grad():
            for data in loader:
                # Use the returned object: not every batch type moves in place.
                data = data.to(device)
                pred = model(data).detach().cpu().argmax(dim=1)

                label.append(data.y.detach().cpu())
                preds.append(pred)
    finally:
        # Leave the caller's model in the mode it was handed over in.
        if was_training:
            model.train()

    if not preds:
        # torch.concat rejects an empty list; return well-typed empty results.
        return torch.empty(0, dtype=torch.long), torch.empty(0, dtype=torch.long)

    preds = torch.concat(preds)
    label = torch.concat(label)
    return preds, label

In [14]:
# Build a fresh network with the same dimensions and load the checkpoint
# written by the training cell (same path as its save_path), so the metrics
# below are computed from what is stored on disk.
model = GCN(num_features=num_features, hidden_channels=64, num_classes=num_classes).to(device)
trainer = TorchTrainer(model)
trainer.load('model_saved/gcn_model.pt')

In [16]:
# Metrics on the training loader (built with drop_last=True, so the final
# partial batch is excluded). The device is passed explicitly because
# evaluate() defaults to 'cuda', which fails on a CPU-only machine.
preds, labels = evaluate(train_loader, trainer.model, device=device)
compute_metrices(labels, preds)

{'accuracy': 0.9898763611615246,
 'precision': 0.994335054136781,
 'recall': 0.9894081280116375,
 'f1': 0.9918654726912297}

In [17]:
# Metrics on the validation loader. The device is passed explicitly because
# evaluate() defaults to 'cuda', which fails on a CPU-only machine.
preds, labels = evaluate(valid_loader, trainer.model, device=device)
compute_metrices(labels, preds)

{'accuracy': 0.988357635356618,
 'precision': 0.9929142441860465,
 'recall': 0.9884246699222282,
 'f1': 0.9906643705247892}

### Adversarial examples (AEs): attack success rate against the trained detector

In [1]:
import os
import numpy as np
import pandas as pd
import networkx as nx
from tqdm import tqdm
from datetime import datetime
from joblib import Parallel, delayed

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torch_geometric as pyg
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader

from models import GCN, TorchTrainer
from utils import read_pickle, DataProcessor

In [3]:
# Locations of the adversarial-example graphs and the saved models.
hsin_ae_dir = 'FCGs/ae_graphs/Hsin'
w2v_path = 'model_saved/word2vec.wordvectors'
gnn_path = 'model_saved/gcn_model.pt'

# Processor shared by get_data, initialised from the file at w2v_path.
processor = DataProcessor()
processor.load(w2v_path)

# Every directory entry is treated as one adversarial-example graph file.
fpaths = [os.path.join(hsin_ae_dir, entry) for entry in os.listdir(hsin_ae_dir)]

def get_data(path, label=None):
    """Turn the graph stored at ``path`` into a PyG ``Data`` object.

    The graph is read with ``read_pickle`` and converted to node features and
    an edge index by the module-level ``processor``.

    Args:
        path: Location of the stored graph.
        label: Optional target; attached as ``y`` only when it is not ``None``.

    Returns:
        A ``Data`` instance, or ``None`` when the graph has no edges (callers
        filter these out afterwards).
    """
    graph = read_pickle(path)
    if not len(graph.edges):
        # Edge-less graphs are skipped; the caller drops the None entries.
        return None

    features, connectivity = processor.from_networkx(graph)
    fields = {'x': features, 'edge_index': connectivity}
    if label is not None:
        fields['y'] = label
    return Data(**fields)

In [5]:
# Number of adversarial-example files found, before edge-less graphs are
# filtered out during conversion.
len(fpaths)

147

In [4]:
workers = 20

print('Processing AEs ...')
# Convert every file in parallel (no labels are attached) and keep only the
# graphs that get_data did not reject.
ae_ds = [
    data
    for data in Parallel(n_jobs=workers)(delayed(get_data)(path) for path in tqdm(fpaths))
    if data is not None
]

Processing AEs ...


100%|██████████| 147/147 [00:07<00:00, 19.18it/s]


In [4]:
epochs     = 20
batch_size = 64
device     = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

num_features = ae_ds[0].num_node_features
# Fixed at 2 because the training DataFrame is not loaded in this section, so
# the class count cannot be derived from it here; it has to match the
# checkpoint being loaded.
num_classes  = 2
ae_loader = DataLoader(ae_ds, batch_size=batch_size, num_workers=0, drop_last=False)

model = GCN(num_features=num_features, hidden_channels=64, num_classes=num_classes).to(device)
trainer = TorchTrainer(model)
# Load from the configured checkpoint path instead of repeating the literal.
trainer.load(gnn_path)

In [16]:
# Attack success rate (ASR): the share of adversarial examples that are NOT
# predicted as class 1. This assumes class index 1 means "malicious", as in
# the detector's training labels (0 = BenignWare, 1 = any other family).
preds = trainer.predict(ae_loader).argmax(dim=1)
asr = 1 - preds.sum() / preds.size(0)
# .item() returns a plain Python float and, unlike .numpy(), also works when
# the tensor lives on the GPU.
asr.item()

tensor(0.8837)