In [None]:
# Install required packages.
# The installed torch version is exported as the TORCH environment variable so
# that the shell commands below can expand ${TORCH} when building the URL of
# the wheel index passed to pip via -f.
import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

# torch-scatter and torch-sparse are looked up in the wheel index for this
# torch version; torch_geometric itself is installed from the GitHub repository
# without a pinned ref, so the installed revision depends on when the cell runs.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

2.2.1+cu121
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m39.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for torch_geometric (pyproject.toml) ... [?25l[?25hdone


In [None]:
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive at /content/drive; the cells below read from and write to
# paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Download the MalNet-Tiny graph archive and save it in the current working
# directory as malnet-graphs-tiny.tar.gz (-O sets the output file name).
!wget -O malnet-graphs-tiny.tar.gz http://malnet.cc.gatech.edu/graph-data/malnet-graphs-tiny.tar.gz

--2024-03-17 20:45:48--  http://malnet.cc.gatech.edu/graph-data/malnet-graphs-tiny.tar.gz
Resolving malnet.cc.gatech.edu (malnet.cc.gatech.edu)... 130.207.126.102
Connecting to malnet.cc.gatech.edu (malnet.cc.gatech.edu)|130.207.126.102|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42204344 (40M) [application/octet-stream]
Saving to: ‘malnet-graphs-tiny.tar.gz’


2024-03-17 20:45:49 (117 MB/s) - ‘malnet-graphs-tiny.tar.gz’ saved [42204344/42204344]



In [None]:
import os
import shutil

# Location of the freshly downloaded archive and its target on Google Drive.
source_path = '/content/malnet-graphs-tiny.tar.gz'
destination_path = '/content/drive/My Drive/Explainability Methods/DATASETs/Datasets/malnet-graphs-tiny.tar.gz'

# shutil.move raises if the target folder is missing, so create it first.
# exist_ok=True keeps the cell re-runnable when the folder is already there.
os.makedirs(os.path.dirname(destination_path), exist_ok=True)

# Left as the last expression so the cell still displays the returned path.
shutil.move(source_path, destination_path)

'/content/drive/My Drive/Explainability Methods/DATASETs/Datasets/malnet-graphs-tiny.tar.gz'

In [None]:
import tarfile

# Folder on Google Drive that receives the archive contents.
extract_dir = '/content/drive/My Drive/Explainability Methods/DATASETs/Datasets/'

# The archive was downloaded over plain HTTP, so its member names are treated
# as untrusted. Interpreters that ship tarfile extraction filters expose
# `tarfile.data_filter`; the 'data' filter rejects members that would be
# written outside `extract_dir` (absolute paths, '..' components, unsafe
# links). Older interpreters fall back to the unfiltered extraction.
with tarfile.open(destination_path, 'r:gz') as tar:
    if hasattr(tarfile, 'data_filter'):
        tar.extractall(path=extract_dir, filter='data')
    else:
        tar.extractall(path=extract_dir)

In [None]:
import os
import os.path as osp
import torch
from torch_geometric.data import Dataset, Data
from typing import Callable, Dict, List, Optional
from torch_geometric.data import (
    Data,
    InMemoryDataset,
    download_url,
    extract_tar,
    extract_zip,
)

class MalNetTiny(InMemoryDataset):
    """In-memory graph dataset built from the MalNet-Tiny edge-list archive.

    Each graph is read from a ``.edgelist`` file and stored as a ``Data``
    object holding ``edge_index``, an integer label ``y`` and ``num_nodes``.
    The label is derived from the first path component of the file name listed
    in the split files; labels are numbered in order of first appearance.

    Args:
        root: Root directory under which the raw and processed files live.
        split: One of ``"train"``, ``"val"``, ``"trainval"``, ``"test"`` or
            ``None``. When given, the dataset is restricted to the index range
            of that split; ``None`` exposes every graph.
        transform: Forwarded unchanged to ``InMemoryDataset``.
        pre_transform: Applied to every kept graph once, during processing.
        pre_filter: Predicate evaluated on every graph during processing;
            graphs for which it returns a falsy value are dropped.
        force_reload: Forwarded unchanged to ``InMemoryDataset``.

    Raises:
        ValueError: If ``split`` is not one of the accepted values.

    Note:
        Split boundaries are computed from the graphs that survive
        ``pre_filter``. Processed files written by an earlier version of this
        class with a ``pre_filter`` may contain boundaries that count dropped
        graphs; pass ``force_reload=True`` to rebuild them.
    """

    data_url = ('http://malnet.cc.gatech.edu/'
                'graph-data/malnet-graphs-tiny.tar.gz')
    split_url = 'http://malnet.cc.gatech.edu/split-info/split_info_tiny.zip'
    # Order in which the split files are read; it fixes the graph ordering and
    # therefore the meaning of the saved split boundaries.
    splits = ['train', 'val', 'test']

    def __init__(self, root: str, split: Optional[str] = None,
                 transform: Optional[Callable] = None,
                 pre_transform: Optional[Callable] = None,
                 pre_filter: Optional[Callable] = None,
                 force_reload: bool = False) -> None:
        # Validate before calling the base constructor so an invalid split
        # name is rejected before any download/processing work can start.
        if split not in {'train', 'val', 'trainval', 'test', None}:
            raise ValueError(f'Split "{split}" found, but expected either '
                             f'"train", "val", "trainval", "test" or None')
        super().__init__(root, transform, pre_transform, pre_filter,
                         force_reload=force_reload)
        self.load(self.processed_paths[0])

        if split is not None:
            # split_slices holds four cumulative boundaries:
            # [0, end_of_train, end_of_val, end_of_test].
            split_slices = torch.load(self.processed_paths[1])
            bounds = {
                'train': (0, 1),
                'val': (1, 2),
                'trainval': (0, 2),
                'test': (2, 3),
            }
            start, stop = bounds[split]
            self._indices = range(split_slices[start], split_slices[stop])

    @property
    def raw_file_names(self) -> List[str]:
        """Raw entries: the graph folder and the folder holding split lists."""
        return ['malnet-graphs-tiny', osp.join('split_info_tiny', 'type')]

    @property
    def processed_file_names(self) -> List[str]:
        """Processed files: the saved graphs and the split boundaries."""
        return ['data.pt', 'split_slices.pt']

    def download(self) -> None:
        """Download and unpack the graph archive and the split information.

        Each archive is removed from the raw directory after extraction.
        """
        path = download_url(self.data_url, self.raw_dir)
        extract_tar(path, self.raw_dir)
        os.unlink(path)

        path = download_url(self.split_url, self.raw_dir)
        extract_zip(path, self.raw_dir)
        os.unlink(path)

    def process(self) -> None:
        """Parse all edge lists, then save the graphs and split boundaries."""
        y_map: Dict[str, int] = {}
        data_list = []
        split_slices = [0]

        for split in self.splits:
            with open(osp.join(self.raw_paths[1], f'{split}.txt'), 'r') as f:
                # One graph file name per line. Blank lines are skipped, so the
                # last entry is kept whether or not the file ends in a newline.
                filenames = [line for line in f.read().splitlines() if line]

            num_kept = 0
            for filename in filenames:
                path = osp.join(self.raw_paths[0], f'{filename}.edgelist')
                malware_type = filename.split('/')[0]
                # Labels are assigned before filtering, so the numbering does
                # not depend on which graphs pre_filter drops.
                y = y_map.setdefault(malware_type, len(y_map))

                with open(path, 'r') as f:
                    # The first five lines are skipped (presumably a header --
                    # confirm against the file format); the rest are edges.
                    edges = [e for e in f.read().splitlines()[5:] if e.strip()]

                edge_indices = [[int(s) for s in e.split()] for e in edges]
                edge_index = torch.tensor(edge_indices).t().contiguous()
                # Node ids are taken to be 0-based, so the node count is the
                # largest id seen in any edge plus one.
                num_nodes = int(edge_index.max()) + 1
                data = Data(edge_index=edge_index, y=y, num_nodes=num_nodes)

                # Filter here rather than after the loop so that the split
                # boundaries below count only graphs that are actually stored.
                if self.pre_filter is not None and not self.pre_filter(data):
                    continue
                data_list.append(data)
                num_kept += 1

            split_slices.append(split_slices[-1] + num_kept)

        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]

        self.save(data_list, self.processed_paths[0])
        torch.save(split_slices, self.processed_paths[1])

In [None]:
# Instantiate the dataset with its root on Google Drive. No split is requested,
# so no index restriction is applied and all graphs are exposed.
dataset = MalNetTiny(root='/content/drive/My Drive/Explainability Methods/DATASETs/Datasets/')
data = dataset[0]  # Retrieve the first data sample


Downloading http://malnet.cc.gatech.edu/graph-data/malnet-graphs-tiny.tar.gz
Extracting /content/drive/My Drive/Explainability Methods/DATASETs/Datasets/raw/malnet-graphs-tiny.tar.gz
Downloading http://malnet.cc.gatech.edu/split-info/split_info_tiny.zip
Extracting /content/drive/My Drive/Explainability Methods/DATASETs/Datasets/raw/split_info_tiny.zip
Processing...
Done!


In [None]:
# Show the number of graphs and the summary representation of the first graph.
print(len(dataset), dataset[0])

5000 Data(edge_index=[2, 3576], y=[1], num_nodes=1679)


In [None]:
# Print the node-feature attribute of the first graph.
# NOTE(review): MalNetTiny.process() builds each Data object from edge_index,
# y and num_nodes only, without `x`, so on a fresh run this presumably prints
# None rather than a tensor -- confirm.
print(dataset[0].x)

tensor([[0.6005],
        [0.7350],
        [0.5124],
        [0.7634],
        [0.5810],
        [0.7941],
        [0.7025],
        [0.6235],
        [0.7882],
        [0.5838],
        [0.5105],
        [0.2810],
        [0.5782],
        [0.7797],
        [0.4412],
        [0.6409],
        [0.2843],
        [0.7843],
        [0.6980],
        [0.5314],
        [0.8451],
        [0.7182],
        [0.2941],
        [0.6033],
        [0.8843],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0

In [None]:
# Dataset-level properties.
print("Number of Graphs: ", len(dataset))
print("Number of Node Features: ", dataset.num_node_features)
print("Number of Edge Features: ", dataset.num_edge_features)
print("Number of Classes: ", dataset.num_classes)

# Accumulate both totals in one pass so each graph is fetched from the dataset
# once instead of once per statistic.
num_edges = 0
num_nodes = 0
for i in range(len(dataset)):
    graph = dataset[i]
    # edge_index has one column per edge, so its second dimension is the
    # edge count of this graph.
    num_edges += graph.edge_index.size(1)
    num_nodes += graph.num_nodes

print("Number of Edges: ", num_edges)
print("Number of Nodes in the Dataset: ", num_nodes)


Number of Graphs:  5000
Number of Node Features:  0
Number of Edge Features:  0
Number of Classes:  5
Number of Edges:  14299744
Number of Nodes in the Dataset:  7610303


In [None]:
from tabulate import tabulate

In [None]:
# One-row summary table of the dataset statistics computed above.
col_names = ["Dataset", "#Graphs", "#NFeatures", "#Classes", "(sum. )#Nodes", "(sum. )#Edges", "(avg.)#nodes/graph", "(avg.)#edges/graph", "(avg.) #node-features/ graph", "#EFeatures"]

# The row is labelled with the dataset that is actually loaded (MalNet-Tiny).
# The node-feature average is taken from the dataset instead of a hard-coded 1,
# which contradicted the reported number of node features.
# NOTE: this rebinds `data`, which earlier held the first graph (dataset[0]).
data = [[
    "MalNet-Tiny",
    len(dataset),
    dataset.num_node_features,
    dataset.num_classes,
    num_nodes,
    num_edges,
    num_nodes/len(dataset),
    num_edges/len(dataset),
    dataset.num_node_features,
    dataset.num_edge_features,
]]
print(tabulate(data, headers=col_names))

Dataset      #Graphs    #NFeatures    #Classes    (sum. )#Nodes    (sum. )#Edges    (avg.)#nodes/graph    (avg.)#edges/graph    (avg.) #node-features/ graph    #EFeatures
---------  ---------  ------------  ----------  ---------------  ---------------  --------------------  --------------------  ------------------------------  ------------
MNIST 75        5000             0           5          7610303         14299744               1522.06               2859.95                               1             0


In [None]:
# Mean number of nodes per graph: total node count over the number of graphs.
print("Avg number of nodes: ", num_nodes/len(dataset))

Avg number of nodes:  1522.0606


In [None]:
# Mean number of edges per graph: total edge count over the number of graphs.
print("Avg number of edges: ", num_edges/len(dataset))

Avg number of edges:  2859.9488
