<a href="https://colab.research.google.com/github/Zoro1092000/xxxx/blob/main/tmp_GIN_p2p_6k_feat_norm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# I. Pip & Import.

In [2]:
import os
import os.path as osp
import torch
import sys

os.environ['TORCH'] = torch.__version__

!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install torch-geometric==1.4.3
!pip install torch-cluster -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install deepdish==0.3.5
!pip install torch-optimizer

import torch.nn as nn
from torch.nn import Parameter
from torch_geometric.utils import scatter_
from torch_geometric.nn.inits import glorot, zeros
from torch_scatter import scatter_add
from torch.utils.data import DataLoader, Dataset
import torch_optimizer as optimization


from itertools import chain
import pickle
import h5py
import deepdish as dd
import numpy as np
from tqdm import tqdm
import inspect
import time
import math
import random

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.pyg.org/whl/torch-1.12.1+cu113.html
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
def time_since(start):
    """Format the wall-clock time elapsed since ``start`` as a short string.

    Args:
        start: Reference timestamp in seconds, as produced by ``time.time()``.

    Returns:
        ``'<s>s'`` when less than a minute has passed, ``'<m>m <s>s'`` when
        less than an hour has passed, otherwise ``'<h>h <m>m <s>s'``.
    """
    elapsed = time.time() - start

    # Peel off whole minutes first, then whole hours out of those minutes.
    total_minutes = math.floor(elapsed / 60)
    seconds = elapsed - total_minutes * 60
    hours = math.floor(total_minutes / 60)
    minutes = total_minutes - hours * 60

    if hours != 0:
        return '%dh %dm %ds' % (hours, minutes, seconds)
    if minutes != 0:
        return '%dm %ds' % (minutes, seconds)
    return '%ds' % seconds


# II. Data

In [4]:
# Data utils
def h5group_to_dict(h5group):
    """Materialise an HDF5-style group into a plain dictionary.

    Every entry yielded by ``h5group.items()`` followed by every entry of
    ``h5group.attrs.items()`` is indexed with ``[()]`` and stored under its
    key. Because attributes are visited last, an attribute overrides a member
    that has the same key.

    Args:
        h5group: Object exposing ``items()`` and ``attrs.items()``
            (presumably an ``h5py.Group`` -- confirm against callers).

    Returns:
        dict mapping each key to ``value[()]``.
    """
    group_dict = {}
    for key, value in chain(h5group.items(), h5group.attrs.items()):
        group_dict[key] = value[()]
    return group_dict

def sub_dict(full_dict, *keys, to_tensor):
    """Pick a subset of entries from ``full_dict``.

    Args:
        full_dict: Source mapping.
        *keys: Keys to copy; keys absent from ``full_dict`` are skipped.
        to_tensor: Keyword-only. When truthy each selected value is wrapped
            with ``torch.tensor``; otherwise values are passed through as-is.

    Returns:
        dict holding the requested keys that exist, in the order requested.
    """
    selected = {}
    for key in keys:
        if key not in full_dict:
            continue
        value = full_dict[key]
        selected[key] = torch.tensor(value) if to_tensor else value
    return selected

def build_graph_from_dict_pyg(graph_dict, to_tensor=True):
    """Build a PyTorch Geometric ``Data`` object from a graph dictionary.

    Only the keys ``'edge_index'``, ``'x'``, ``'y'``, ``'edge_attr'`` and
    ``'edge_y'`` are forwarded to ``Data``; any of them that are missing from
    ``graph_dict`` are simply omitted.

    Args:
        graph_dict: Mapping that holds the graph fields.
        to_tensor: When truthy, values are converted with ``torch.tensor``
            before being handed to ``Data``.

    Returns:
        ``torch_geometric.data.Data`` populated with the available fields.
    """
    from torch_geometric.data import Data

    graph_fields = ('edge_index', 'x', 'y', 'edge_attr', 'edge_y')
    fields = sub_dict(graph_dict, *graph_fields, to_tensor=to_tensor)
    return Data(**fields)

# Data loader
class GraphDataLoader(DataLoader):
    """``DataLoader`` whose batches are PyTorch Geometric ``Batch`` objects.

    Args:
        dataset: Dataset yielding graph objects accepted by
            ``Batch.from_data_list``.
        batch_size: Number of graphs merged into each batch.
        shuffle: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.
    """

    def __init__(self, dataset, batch_size=128, shuffle=False, num_workers=0):

        def _merge_graphs(graphs):
            # torch_geometric is imported only when a batch is actually collated.
            from torch_geometric.data import Batch
            return Batch.from_data_list(graphs)

        loader_kwargs = dict(
            batch_size=batch_size,
            shuffle=shuffle,
            collate_fn=_merge_graphs,
            num_workers=num_workers,
        )
        super().__init__(dataset, **loader_kwargs)

# BotnetDataset
class BotnetDataset(Dataset):
    """Botnet graphs read from a processed HDF5 file, one graph per string index.

    The file for a split lives at ``<root>/processed/<name>_<split>.hdf5`` and
    each graph is looked up under the key ``str(index)``.

    Args:
        name: Topology name; one of ``'chord'``, ``'debru'``, ``'kadem'``,
            ``'leet'``, ``'c2'``, ``'p2p'``.
        root: Dataset root directory (``~`` is expanded for string paths).
        split: ``'train'``, ``'val'`` or ``'test'``.
        graph_format: Stored and reported by ``repr``; items are always built
            as PyG ``Data`` objects regardless of this value.
        split_idx: Stored on the instance; not otherwise used by this class.
        add_nfeat_ones: Stored on the instance; not otherwise used by this class.
        in_memory: When True the whole file is loaded up front with
            ``deepdish``; when False the HDF5 file is opened lazily on the
            first ``__getitem__`` call so that each worker process opens its
            own handle.
    """

    # Order matters: it fixes the order of ``processed_file_names``.
    _SPLITS = ('train', 'val', 'test')
    # Hard-coded number of graphs reported for each split.
    _SPLIT_SIZES = {'train': 768, 'val': 96, 'test': 96}

    def __init__(self, name='chord', root='data/botnet', split='train', graph_format='pyg', split_idx=None, add_nfeat_ones=True,
                 in_memory=True):
        super().__init__()
        assert name in ['chord', 'debru', 'kadem', 'leet', 'c2', 'p2p']
        assert split in ['train', 'val', 'test']

        if isinstance(root, str):
            root = osp.expanduser(osp.normpath(root))

        self.name = name
        self.root = root
        self.split = split
        self.split_idx = split_idx
        self.add_nfeat_ones = add_nfeat_ones

        self._graph_format = graph_format
        self.path = self.processed_paths[self._SPLITS.index(split)]
        self.num_graphs = self._SPLIT_SIZES[split]

        if in_memory:
            self.data = dd.io.load(self.path)  # dictionary
            self.data_type = 'dict'
        else:
            # Defer opening the file to each process to make multiprocessing work.
            self.data = None
            self.data_type = 'file'

    @property
    def graph_format(self):
        """Graph format label given at construction time.

        ``__repr__`` reads this attribute; without the property ``repr()``
        raised ``AttributeError`` because only ``_graph_format`` was set.
        """
        return self._graph_format

    @property
    def processed_dir(self):
        """Directory that holds the processed HDF5 files."""
        return osp.join(self.root, 'processed')

    @property
    def processed_file_names(self):
        """File names for the train, val and test splits, in that order."""
        return [self.name + '_' + s + '.hdf5' for s in self._SPLITS]

    @property
    def processed_paths(self):
        """Full paths matching ``processed_file_names``."""
        return [osp.join(self.processed_dir, f) for f in self.processed_file_names]

    def __len__(self):
        return self.num_graphs

    def __getitem__(self, index):
        """Return graph ``index`` as a PyG ``Data`` object."""
        if self.data_type == 'dict':
            graph_dict = self.data[str(index)]
        elif self.data_type == 'file':
            if self.data is None:
                # Only open once in each process.
                self.data = h5py.File(self.path, 'r')
            graph_dict = h5group_to_dict(self.data[str(index)])
        else:
            raise ValueError

        # graph_format == 'pyg':
        return build_graph_from_dict_pyg(graph_dict)

    def __iter__(self):
        for i in range(self.num_graphs):
            yield self[i]

    def __repr__(self):
        return f'{self.__class__.__name__}(topology: {self.name} | split: {self.split} | ' \
               f'#graphs: {len(self)} | graph format: {self.graph_format})'


# III. Measure Performance

## 3.1. Metrics

In [5]:
def f1(target, pred, label):
    """F1 score of a single class, treating ``label`` as the positive class.

    F1 = 2 * (precision * recall) / (precision + recall)

    Args:
        target: Array of ground-truth labels.
        pred: Array of predicted labels, same length as ``target``.
        label: Class value counted as positive.

    Returns:
        The F1 score, or ``np.nan`` when precision or recall is undefined
        (no predicted or no actual positives) or when both are zero.
    """
    hits = np.sum((target == label) & (pred == label))
    false_alarms = np.sum((target != label) & (pred == label))
    misses = np.sum((pred != label) & (target == label))

    predicted_pos = hits + false_alarms
    actual_pos = hits + misses
    if predicted_pos == 0 or actual_pos == 0:
        return np.nan

    prec = hits / predicted_pos
    rec = hits / actual_pos

    if prec + rec == 0:
        return np.nan

    return 2 * (prec * rec) / (prec + rec)

def f1_macro(pred, target):
    """Unweighted mean of the per-class F1 scores for classes 0 and 1.

    Note the argument order is ``(pred, target)``, the reverse of ``f1``.
    If either per-class score is ``nan`` the mean is ``nan`` as well.
    """
    per_class_scores = []
    for label in (0, 1):
        per_class_scores.append(f1(target, pred, label))
    return np.mean(per_class_scores)


def accuracy(pred, target):
    """Fraction of positions where ``pred`` equals ``target``."""
    num_correct = (pred == target).sum().item()
    num_total = len(target)
    return num_correct / num_total


def true_positive(pred, target):
    """Count entries predicted as 1 whose target is 1."""
    predicted_pos = pred == 1
    targets_of_predicted_pos = target[predicted_pos]
    return (targets_of_predicted_pos == 1).sum().item()


def false_positive(pred, target):
    """Count entries predicted as 1 whose target is 0."""
    predicted_pos = pred == 1
    targets_of_predicted_pos = target[predicted_pos]
    return (targets_of_predicted_pos == 0).sum().item()


def true_negative(pred, target):
    """Count entries predicted as 0 whose target is 0."""
    predicted_neg = pred == 0
    targets_of_predicted_neg = target[predicted_neg]
    return (targets_of_predicted_neg == 0).sum().item()


def false_negative(pred, target):
    """Count entries predicted as 0 whose target is 1."""
    predicted_neg = pred == 0
    targets_of_predicted_neg = target[predicted_neg]
    return (targets_of_predicted_neg == 1).sum().item()


def recall(pred, target):
    """Recall of the positive class: TP / (number of targets equal to 1).

    Returns:
        The recall, or the sentinel ``-1`` when ``target`` contains no
        positive entries (the ratio is undefined).
    """
    try:
        return true_positive(pred, target) / (target == 1).sum().item()
    except ZeroDivisionError:
        # Only the undefined ratio is mapped to the sentinel; any other
        # error (bad shapes, wrong types, ...) is allowed to surface.
        return -1


def precision(pred, target):
    """Precision of the positive class: TP / (number of predictions equal to 1).

    Returns:
        The precision, or the sentinel ``-1`` when nothing was predicted
        positive (the ratio is undefined).
    """
    try:
        return true_positive(pred, target) / (pred == 1).sum().item()
    except ZeroDivisionError:
        # Only the undefined ratio is mapped to the sentinel; any other
        # error (bad shapes, wrong types, ...) is allowed to surface.
        return -1


def f1_score(pred, target):
    """F1 score of the positive class from ``precision`` and ``recall``.

    The ``-1`` sentinels that ``precision``/``recall`` return for undefined
    ratios are fed into the formula unchanged, so the result can be ``-1``
    when both are undefined.

    Returns:
        ``2 * prec * rec / (prec + rec)``, or ``0`` when ``prec + rec`` is zero.
    """
    prec = precision(pred, target)
    rec = recall(pred, target)
    try:
        return 2 * (prec * rec) / (prec + rec)
    except ZeroDivisionError:
        # prec + rec == 0: define F1 as 0 instead of failing. Other errors
        # are no longer hidden by a bare ``except``.
        return 0


def false_positive_rate(pred, target):
    """False positive rate: FP / (number of targets equal to 0).

    Returns:
        The rate, or the sentinel ``-1`` when ``target`` contains no negative
        entries (the ratio is undefined).
    """
    try:
        return false_positive(pred, target) / (target == 0).sum().item()
    except ZeroDivisionError:
        # Only the undefined ratio is mapped to the sentinel; any other
        # error is allowed to surface.
        return -1


def false_negative_rate(pred, target):
    """False negative rate: FN / (number of targets equal to 1).

    Returns:
        The rate, or the sentinel ``-1`` when ``target`` contains no positive
        entries (the ratio is undefined).
    """
    try:
        return false_negative(pred, target) / (target == 1).sum().item()
    except ZeroDivisionError:
        # Only the undefined ratio is mapped to the sentinel; any other
        # error is allowed to surface.
        return -1


## 3.2. Evaluation

In [6]:
def eval_metrics(target, pred_prob, threshold=0.5):
    """Compute binary classification metrics from positive-class scores.

    Args:
        target: Ground-truth labels; a ``torch.Tensor`` is moved to CPU and
            converted to NumPy, anything else is used as given.
        pred_prob: Scores for the positive class; converted like ``target``.
        threshold: Scores ``>= threshold`` are predicted as class 1.

    Returns:
        dict with keys ``'acc'``, ``'fpr'``, ``'fnr'``, ``'rec'``, ``'prc'``,
        ``'f1'`` and ``'f1_macro'``.
    """
    target, pred_prob = (
        arr.cpu().numpy() if isinstance(arr, torch.Tensor) else arr
        for arr in (target, pred_prob)
    )

    pred = (pred_prob >= threshold).astype(int)

    # (result key, metric function) pairs, evaluated in this order.
    metric_fns = (
        ('acc', accuracy),
        ('fpr', false_positive_rate),
        ('fnr', false_negative_rate),
        ('rec', recall),
        ('prc', precision),
        ('f1', f1_score),
        ('f1_macro', f1_macro),
    )
    return {key: metric(pred, target) for key, metric in metric_fns}


def dict_value_add(dict1, dict2):
    """Add two dictionaries key by key.

    A key missing from one of the dictionaries contributes ``0`` for that
    side, so the result covers the union of both key sets.
    """
    all_keys = set(dict1) | set(dict2)
    summed = {}
    for key in all_keys:
        summed[key] = dict1.get(key, 0) + dict2.get(key, 0)
    return summed


def dict_value_div(dict, n):
    """Return a new dictionary with every value of ``dict`` divided by ``n``."""
    # NOTE: the parameter name shadows the built-in ``dict`` inside this
    # function; it is kept because callers may pass it by keyword.
    scaled = {}
    for key in dict:
        scaled[key] = dict[key] / n
    return scaled


def eval_predictor(dataset, predictor):
    """Evaluate ``predictor`` on every graph and average the metrics.

    Args:
        dataset: Sized iterable of graph objects exposing a ``y`` attribute.
        predictor: Callable taking one graph and returning either the
            positive-class probabilities alone, or a 2-element tuple/list
            ``(pred_prob, loss)``.

    Returns:
        ``(result_dict_avg, loss_avg)``: the per-graph metrics from
        ``eval_metrics`` averaged over ``len(dataset)``, and the sum of the
        returned losses divided by ``len(dataset)`` (``0`` when the predictor
        never returns a loss).
    """
    result_dict_avg = {}
    loss_avg = 0

    for data in dataset:
        # Call the predictor exactly once and dispatch on what it returned.
        # The previous try/unpack/except-ValueError approach ran the model a
        # second time for probability-only predictors, hid genuine
        # ValueErrors raised inside the predictor, and silently mis-unpacked a
        # probability vector of length two as (pred_prob, loss).
        output = predictor(data)
        if isinstance(output, (tuple, list)) and len(output) == 2:
            pred_prob, loss = output
            loss_avg += loss
        else:
            pred_prob = output

        # get the ground truth target
        # graph_format == 'pyg':
        target = data.y

        # compute the evaluation metrics
        result_dict = eval_metrics(target, pred_prob)

        result_dict_avg = dict_value_add(result_dict_avg, result_dict)

    # average the metrics across all graphs in the dataset as final results
    result_dict_avg = dict_value_div(result_dict_avg, len(dataset))
    loss_avg = loss_avg / len(dataset)

    return result_dict_avg, loss_avg


# =================================================================================================================
# some examples of the 'predictor' model wrapper to be fed into the above evaluation function (for PyG Data format)
# =================================================================================================================
class PygRandomPredictor:
    """Baseline predictor that scores every entry of ``data.y`` at random."""

    def __init__(self):
        # No state to set up; the random generator is deliberately not seeded
        # here (``torch.manual_seed(0)`` was left disabled).
        pass

    def __call__(self, data):
        """Return ``len(data.y)`` values drawn uniformly from [0, 1)."""
        num_scores = len(data.y)
        return torch.rand(num_scores)


class PygModelPredictor:
    """Wrap a model so it can be passed to ``eval_predictor``.

    Args:
        model: Module called as ``model(x, edge_index)`` that returns
            per-node logits with at least two columns.
        loss_fcn: Loss applied to ``(logits, data.y.long())``.
    """

    def __init__(self, model, loss_fcn=torch.nn.CrossEntropyLoss()):
        self.model = model
        self.loss_fcn = loss_fcn
        # Inputs are moved to whichever device holds the first parameter.
        first_param = next(model.parameters())
        self.device = first_param.device

    def __call__(self, data):
        """Return ``(positive-class probabilities, loss)`` for one graph.

        The model is switched to eval mode and is left in that mode.
        """
        self.model.eval()
        data = data.to(self.device)
        with torch.no_grad():
            # Adapt the call below to the model's forward signature if needed.
            logits = self.model(data.x, data.edge_index)
            loss = self.loss_fcn(logits, data.y.long())
            class_probs = torch.softmax(logits, dim=1)
        return class_probs[:, 1], loss.float()


# IV. Model

## 4.1. Activation

In [7]:
def activation(act, negative_slope=0.2):
    """Look up an activation module by name.

    Args:
        act: One of ``'lrelu'``, ``'relu'``, ``'elu'`` or ``'none'``.
        negative_slope: Slope used by the ``'lrelu'`` (LeakyReLU) option.

    Returns:
        A freshly constructed ``nn.Module`` for the requested activation.

    Raises:
        KeyError: If ``act`` is not one of the supported names.
    """
    factories = {
        'lrelu': lambda: nn.LeakyReLU(negative_slope),
        'relu': nn.ReLU,
        'elu': nn.ELU,
        'none': nn.Identity,
    }
    return factories[act]()

## 4.2. GIN model

In [8]:
import torch.nn as nn

class GINModel(nn.Module):
    """Stack of ``GINConv`` layers with residual connections and a linear head.

    Each layer wraps the MLP ``Linear -> BatchNorm1d -> ReLU -> Linear -> ReLU``
    in a ``GINConv``. The layer output goes through dropout, is added to a
    residual branch (a ``Linear`` projection for the first layer, identity for
    the rest) and then through the chosen non-linearity. A final ``Linear``
    maps the hidden features to ``num_classes`` logits.

    Args:
        dim_input_feature: Size of the input node features.
        dim_hidden_feature: Size of the hidden node features.
        num_layers: Number of GIN layers.
        num_classes: Number of output classes.
        non_linear: Name understood by ``activation`` (``'relu'``, ``'lrelu'``,
            ``'elu'`` or ``'none'``), applied after each residual sum.
        dropout: Dropout probability applied to each GIN layer's output.
    """

    def __init__(self, dim_input_feature, dim_hidden_feature, num_layers, num_classes, non_linear='relu', dropout=0.2):
        super().__init__()

        from torch.nn import Linear, Sequential, BatchNorm1d, ReLU
        from torch_geometric.nn import GINConv

        self.dim_input_feature = dim_input_feature
        self.dim_hidden_feature = dim_hidden_feature

        self.num_layers = num_layers
        self.num_classes = num_classes

        # All GIN layers are created before the residual projection so that
        # parameter initialisation consumes the random generator in the same
        # order as before (keeps seeded runs reproducible).
        self.gin_net = nn.ModuleList()
        for i in range(self.num_layers):
            dim_in = self.dim_input_feature if i == 0 else self.dim_hidden_feature
            mlp = Sequential(Linear(dim_in, self.dim_hidden_feature),
                             BatchNorm1d(self.dim_hidden_feature), ReLU(),
                             Linear(self.dim_hidden_feature, self.dim_hidden_feature), ReLU())
            self.gin_net.append(GINConv(mlp))

        self.dropout = nn.Dropout(dropout)

        # Only the first layer changes the feature size, so only it needs a
        # learned projection on the residual branch.
        self.residuals = nn.ModuleList()
        for i in range(self.num_layers):
            if i == 0:
                self.residuals.append(nn.Linear(self.dim_input_feature, self.dim_hidden_feature, bias=True))
            else:
                self.residuals.append(nn.Identity())

        self.num_residuals = len(self.residuals)

        self.non_linear = activation(non_linear)

        # self.final_type == 'proj':
        self.final = nn.Linear(self.dim_hidden_feature, num_classes)

    def reset_parameters(self):
        """Re-initialise every sub-module that has learnable parameters."""
        for net in self.gin_net:
            net.reset_parameters()
        for net in self.residuals:
            # nn.Identity has no reset_parameters(); calling it unconditionally
            # raised AttributeError for every model with more than one layer.
            if hasattr(net, 'reset_parameters'):
                net.reset_parameters()
        self.final.reset_parameters()

    def forward(self, x, edge_index):
        """Compute per-node class logits.

        Args:
            x: Node feature matrix passed to the first GIN layer.
            edge_index: Edge index tensor passed to every GIN layer.

        Returns:
            Output of the final linear layer, with ``num_classes`` columns.
        """
        for conv, residual in zip(self.gin_net, self.residuals):
            # GIN layer followed by dropout ...
            xo = self.dropout(conv(x, edge_index))
            # ... then the residual connection (hop of 1) and the non-linearity.
            x = self.non_linear(xo + residual(x))

        # dim_hidden_feature -> num_classes
        return self.final(x)


# V. Load data

In [9]:
# Root folder of the processed dataset (looks like a mounted Google Drive
# path -- adjust when running elsewhere).
data_dir = '/content/drive/Shareddrives/botnetdata/p2p_6k_feat_norm'
# Topology to load: 'chord', 'debru', 'kadem', 'leet', 'c2' or 'p2p'.
data_name = 'p2p'
# Whether the training loader reshuffles the graphs.
shuffle = False

# ========== load the dataset
print('loading dataset...')

# The three splits are built in the order train, val, test; in_memory=False
# makes each dataset open its HDF5 file lazily on first access.
train_ds, val_ds, test_ds = [
    BotnetDataset(name=data_name, root=data_dir, split=split_name,
                  in_memory=False, graph_format='pyg')
    for split_name in ('train', 'val', 'test')
]


loading dataset...


In [10]:
# Number of graphs merged into each training batch.
batch_size = 2

# Single-process loader over the training split.
train_loader = GraphDataLoader(
    train_ds,
    batch_size=batch_size,
    shuffle=bool(shuffle),
    num_workers=0,
)


# VI. Train

In [None]:
# ============== some default parameters =============
# --- run setup ---
devid, seed = 0, 0      # CUDA device index (-1 selects the CPU) and random seed
logmode = 'w'
log_interval = 96       # print training metrics every `log_interval` graphs

# --- model ---
dim_input_feature, dim_hidden_feature = 1, 32
act = 'relu'            # 'none', 'lrelu', 'relu', 'elu'
num_layers, num_classes = 12, 2
dropout = 0.0
bias = True

# --- optimisation ---
lr = 0.005              # learning rate
weight_decay = 5e-4
epochs = 50

# --- checkpoints ---
save_dir = './saved_models'
save_name = "GIN_model.pt"
# ====================================================

def train(model, train_loader, val_dataset, test_dataset, optimizer, scheduler=None):
    """Train ``model``, checkpoint on validation F1, then report test metrics.

    Relies on the module-level names ``criterion``, ``epochs``,
    ``log_interval`` and ``save_dir`` being defined before it is called.

    Args:
        model: Network called as ``model(x, edge_index)``.
        train_loader: Iterable of batches exposing ``x``, ``edge_index``,
            ``y`` and ``num_graphs``.
        val_dataset: Graphs evaluated after every epoch via ``eval_predictor``.
        test_dataset: Graphs evaluated once at the end with the best model.
        optimizer: Optimizer over ``model``'s parameters.
        scheduler: Optional scheduler stepped with the validation loss after
            each epoch.
    """
    device = next(model.parameters()).device
    predictor = PygModelPredictor(model)

    best_epoch = 0
    max_f1_score = 0
    # Path of the best checkpoint written so far. Tracked in its own variable:
    # assigning to ``save_name`` here used to make it function-local, so the
    # final torch.load raised UnboundLocalError whenever no epoch improved.
    best_path = None
    start = time.time()
    for ep in range(epochs):
        loss_sum_train = 0
        num_train_graph = 0
        model.train()
        for n, batch in enumerate(train_loader):
            batch = batch.to(device)

            optimizer.zero_grad()

            x = model(batch.x, batch.edge_index)
            loss = criterion(x, batch.y.long())

            loss_sum_train += float(loss)
            num_train_graph += batch.num_graphs

            loss.backward()
            optimizer.step()

            if num_train_graph % log_interval == 0 or n == len(train_loader) - 1:
                with torch.no_grad():
                    pred_prob = torch.softmax(x, dim=1)[:, 1]
                    y = batch.y.long()
                    result_dict = eval_metrics(y, pred_prob)
                # Each ``loss`` is already a per-batch mean, so the running
                # average divides by the number of batches seen (n + 1), not
                # by the number of graphs.
                print(f'epoch: {ep + 1}, passed number of graphs: {num_train_graph}, '
                        f'train running loss: {loss_sum_train / (n + 1):.5f} (passed time: {time_since(start)})')
                print(' ' * 10 + ', '.join(['{}: {:.5f}'.format(k, v) for k, v in result_dict.items()]))

        result_dict_avg, loss_avg = eval_predictor(val_dataset, predictor)
        print(f'Validation --- epoch: {ep + 1}, loss: {loss_avg:.5f}')
        print(' ' * 10 + ', '.join(['{}: {:.5f}'.format(k, v) for k, v in result_dict_avg.items()]))

        if scheduler is not None:
            scheduler.step(loss_avg)

        if result_dict_avg['f1'] > max_f1_score:
            ckpt_name = f"GIN_{ep}: {result_dict_avg['f1']}.pt"
            best_path = os.path.join(save_dir, ckpt_name)
            torch.save(model, best_path)
            print(f'Better model saved at {best_path}.')
            best_epoch = ep
            max_f1_score = result_dict_avg['f1']

    if best_path is None:
        # No epoch beat the initial F1 of 0, so nothing was checkpointed;
        # fall back to the model as it stands after the last epoch.
        print('No checkpoint was saved (validation F1 never exceeded 0); '
              'testing the model from the final epoch instead.')
        best_model = model
    else:
        best_model = torch.load(best_path)
        print('*' * 12 + f' best model obtained after epoch {best_epoch + 1}, '
                           f'saved at {best_path} ' + '*' * 12)

    predictor = PygModelPredictor(best_model)

    result_dict_avg, loss_avg = eval_predictor(test_dataset, predictor)
    print(f'Testing --- loss: {loss_avg:.5f}')
    print(' ' * 10 + ', '.join(['{}: {:.5f}'.format(k, v) for k, v in result_dict_avg.items()]))


if __name__ == '__main__':
    os.makedirs(save_dir, exist_ok=True)

    # ========== random seeds and device
    random.seed(seed)
    torch.manual_seed(seed)

    # Use the requested GPU when CUDA is actually available, else the CPU.
    if devid > -1 and torch.cuda.is_available():
        device = torch.device(f'cuda:{devid}')
    else:
        device = torch.device('cpu')

    # ========== logging setup
    # File logging is not configured in this notebook; log_name is kept for
    # code that may still expect it.
    log_name = os.path.splitext(save_name)[0]

    print('python ' + ' '.join(sys.argv))
    print('-' * 30)
    print('-' * 30)
    print(time.ctime())
    print('-' * 30)

    # ========== define the model, optimizer, and loss

    model = GINModel(dim_input_feature,
                     dim_hidden_feature,
                     num_layers,
                     num_classes,
                     non_linear=act,
                     dropout=dropout)

    print('model ' + '-' * 10)
    print(repr(model))
    model.to(device)

    # Per-class loss weights for classes 0 and 1. They are created on the
    # same device as the model; the earlier hard-coded ``.cuda(device=0)``
    # ignored ``devid`` and failed on CPU-only machines.
    class_weight = torch.tensor([0.53, 12.0], device=device)
    criterion = torch.nn.CrossEntropyLoss(weight=class_weight)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.25, patience=1)

    # ========== train the model
    print(f"Batch size: {batch_size}")
    print(class_weight)
    train(model, train_loader, val_ds, test_ds, optimizer, scheduler)


python /usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py -f /root/.local/share/jupyter/runtime/kernel-c190880a-c5a1-4615-8daf-ea6564d8634a.json
------------------------------
------------------------------
Fri Nov  4 04:16:44 2022
------------------------------
model ----------
GINModel(
  (gin_net): ModuleList(
    (0): GINConv(nn=Sequential(
      (0): Linear(in_features=1, out_features=32, bias=True)
      (1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): Linear(in_features=32, out_features=32, bias=True)
      (4): ReLU()
    ))
    (1): GINConv(nn=Sequential(
      (0): Linear(in_features=32, out_features=32, bias=True)
      (1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): Linear(in_features=32, out_features=32, bias=True)
      (4): ReLU()
    ))
    (2): GINConv(nn=Sequential(
      (0): Linear(in_features=32, out_features=32, bias=True)
  

In [None]:
!zip -r /content/GIN_053_12_Adam_P2P_6k_feat_norm_12Layer.zip /content/saved_models

In [None]:
from google.colab import files
files.download("/content/GIN_053_12_Adam_P2P_6k_feat_norm_12Layer.zip")