## Preprocess Synthetic Datasets

In [None]:
import os
import torch
from torch_geometric.data import InMemoryDataset

# taken from https://github.com/gravins/Anti-SymmetricDGN/blob/main/graph_prop_pred/utils/pna_dataset.py

# Task identifiers, grouped by the level named in each constant
# (node-level vs. graph-level); TASKS is the concatenation of both groups.
NODE_LVL_TASKS = ['dist', 'ecc', 'lap']
GRAPH_LVL_TASKS = ['conn', 'diam', 'rad']
TASKS = NODE_LVL_TASKS + GRAPH_LVL_TASKS

class GraphPropDataset(InMemoryDataset):
    """In-memory dataset backed by an already-processed, pre-split tensor file.

    The file ``<root>/<split>_<task>_<dim>_data.pt`` is loaded as-is; nothing
    is downloaded or processed by this class.

    Args:
        root: Folder that directly contains the ``*_data.pt`` files.
        split: One of ``'train'``, ``'val'``, ``'test'``.
        task: One of ``TASKS``; only ``'dist'``, ``'ecc'`` and ``'diam'`` are
            implemented.
        dim: Either ``'15-25'`` or ``'25-35'``; it is part of the file name.
        pre_transform: Stored on the instance after construction; it is not
            forwarded to the parent constructor and ``process`` is a no-op.

    Raises:
        AssertionError: If ``split``, ``task`` or ``dim`` is not an accepted value.
        NotImplementedError: If ``task`` is valid but not implemented.
    """

    def __init__(self, root, split, task, dim='25-35', pre_transform=None):
        assert split in ('train', 'val', 'test')
        assert task in TASKS
        if task not in ('dist', 'ecc', 'diam'):
            raise NotImplementedError('the only tasks implemented are: dist, ecc, diam')
        assert dim in ('15-25', '25-35')

        # These three feed `processed_file_names`, so they are assigned before
        # the parent constructor runs (presumably it consults the processed
        # paths -- confirm against the installed torch_geometric version).
        self.dim = dim
        self.split = split
        self.task = task
        super().__init__(root)
        self.pre_transform = pre_transform

        # NOTE(review): weights_only=False unpickles arbitrary objects; only
        # load files obtained from a trusted source.
        tensor_file = self.processed_paths[0]
        self.data, self.slices = torch.load(tensor_file, weights_only=False)
        print(f'Loaded {tensor_file}')

    @property
    def processed_paths(self):
        """Processed files resolved directly under ``root``."""
        return [os.path.join(self.root, name) for name in self.processed_file_names]

    @property
    def processed_file_names(self):
        """Single file name encoding split, task and graph-size range."""
        return [f'{self.split}_{self.task}_{self.dim}_data.pt']

    def process(self):
        """Do nothing: the data already split by the authors is reused."""
        pass

In [2]:
# Create the raw-data folder of each synthetic task.
# `exist_ok=True` replaces the separate exists-check: the cell stays safe to
# re-run, and a path that exists but is not a directory raises immediately
# instead of being silently skipped.
for raw_dir in ('DATA/Diameter/raw', 'DATA/Eccentricity/raw', 'DATA/SSSP/raw'):
    os.makedirs(raw_dir, exist_ok=True)

In [3]:
# Graph-size range shared by the loaded files and the output file name, so
# the two cannot drift apart.
DIM_RANGE = '25-35'

# Destination folder (relative to DATA) of each task; built once, outside the loop.
maps = {'diam': 'Diameter/raw',
        'ecc':  'Eccentricity/raw',
        'dist': 'SSSP/raw'}

for task in ['diam', 'dist', 'ecc']:
    # Merge the train/val/test splits of this task into one flat list.
    data_list = []
    for split in ['train', 'val', 'test']:
        dataset = GraphPropDataset('DATA', split, task, dim=DIM_RANGE)
        data_list.extend(dataset)

    # Make sure the target folder exists so this cell does not depend on
    # another cell having created it.
    out_dir = os.path.join('DATA', maps[task])
    os.makedirs(out_dir, exist_ok=True)

    torch.save(data_list, os.path.join(out_dir, f'{task}_{DIM_RANGE}_data_list.pt'))
    print(f'Task {task} preprocessed')

UnpicklingError: Weights only load failed. This file can still be loaded, to do so you have two options, do those steps only if you trust the source of the checkpoint.
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL torch_geometric.data.storage.GlobalStorage was not an allowed global by default. Please use `torch.serialization.add_safe_globals([GlobalStorage])` or the `torch.serialization.safe_globals([GlobalStorage])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about types accepted by default with weights_only https://pytorch.org/docs/stable/generated/torch.load.html.

## Preprocess LRGB Datasets for PyDGN usage

In [None]:
import torch
import torch_geometric
from torch_geometric.datasets import LRGBDataset

# Load the train / val / test splits of both peptide datasets, in that order.
peptides_func_tr, peptides_func_vl, peptides_func_te = (
    LRGBDataset('DATA', 'Peptides-func', split=split_name)
    for split_name in ('train', 'val', 'test')
)

peptides_struct_tr, peptides_struct_vl, peptides_struct_te = (
    LRGBDataset('DATA', 'Peptides-struct', split=split_name)
    for split_name in ('train', 'val', 'test')
)

In [None]:
# Flatten train + val + test (in that order) into one list per dataset.
peptides_func = [
    graph
    for part in (peptides_func_tr, peptides_func_vl, peptides_func_te)
    for graph in part
]
peptides_struct = [
    graph
    for part in (peptides_struct_tr, peptides_struct_vl, peptides_struct_te)
    for graph in part
]

In [None]:
# Number of graphs in each merged list.
tuple(len(graphs) for graphs in (peptides_func, peptides_struct))

(15535, 15535)

In [None]:
# Size of each Peptides-func split (train, val, test).
tuple(len(part) for part in (peptides_func_tr, peptides_func_vl, peptides_func_te))

(10873, 2331, 2331)

In [None]:
# Size of each Peptides-struct split (train, val, test).
tuple(len(part) for part in (peptides_struct_tr, peptides_struct_vl, peptides_struct_te))

(10873, 2331, 2331)

In [None]:
from torch_geometric.transforms import AddLaplacianEigenvectorPE, AddRandomWalkPE

In [None]:
rwse = AddRandomWalkPE(walk_length=20)

# Add the random-walk encoding to every graph of both datasets.
# The object returned by the transform is written back into the list:
# depending on the installed torch_geometric version, calling a transform runs
# it on a shallow copy of its input, in which case discarding the return value
# would leave the graphs in the list (and later the saved file) without the
# encoding. Storing the result is correct whether or not a copy is made.
for name, graphs in (('peptides-func', peptides_func),
                     ('peptides-struct', peptides_struct)):
    print(f'Processing RWSE for {name}...')
    for i, d in enumerate(graphs, start=1):
        graphs[i - 1] = rwse(d)

        # Progress report every 1000 graphs.
        if i % 1000 == 0:
            print(i)

Processing RWSE for peptides-func...


  C = torch.sparse.mm(A, B)


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
Processing RWSE for peptides-struct...
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000


In [None]:
import numpy as np
import torch.nn.functional as F
from torch_geometric.utils import (
    get_laplacian,
    to_scipy_sparse_matrix,
)

def add_node_attr(data, value, attr_name=None):
    """Attach ``value`` to ``data`` and return ``data``.

    Args:
        data: Object supporting ``'x' in data``, ``data.x`` and item assignment.
        value: Value to attach; on the concatenation path it must offer ``.to``.
        attr_name: If given, ``value`` is stored under ``data[attr_name]``.
            If ``None``, ``value`` is appended to ``data.x`` along the last
            dimension, or becomes ``data.x`` when ``data`` has no ``x``.

    Returns:
        The same ``data`` object, modified in place.
    """
    # TODO Move to `BaseTransform`.
    if attr_name is not None:
        data[attr_name] = value
        return data

    if 'x' not in data:
        data.x = value
        return data

    # A 1-D feature vector is viewed as a single column before concatenating.
    feats = data.x.view(-1, 1) if data.x.dim() == 1 else data.x
    # `value` is moved to the device and dtype of the existing features.
    value_like_feats = value.to(feats.device, feats.dtype)
    data.x = torch.cat([feats, value_like_feats], dim=-1)
    return data

# reproducing how LapPE are computed on GPS paper
class LapPE_GPS:
    """Laplacian positional encodings from a dense eigendecomposition.

    ``forward`` stores two attributes on the graph it receives:

    * ``laplacian_eigenvector_pe`` of shape ``(N, k)``: the L2-normalized
      eigenvectors of the ``k`` smallest eigenvalues.
    * ``laplacian_eigenvalues_pe`` of shape ``(N, k, 1)``: those eigenvalues
      (clamped at 0), repeated for every node.

    When the graph has fewer than ``k`` nodes, the missing columns are NaN.

    Args:
        k: Maximum number of frequencies to keep.
        is_undirected: Stored on the instance; not read by ``forward``.
        **kwargs: Stored on the instance; not read by ``forward``.
    """

    # Number of nodes from which to use sparse eigenvector computation:
    # (not read anywhere in this class; `forward` always goes dense)
    SPARSE_THRESHOLD: int = 100

    def __init__(self, k: int, is_undirected: bool = False, **kwargs):
        self.k = k  # max_frequencies
        self.is_undirected = is_undirected
        self.kwargs = kwargs

    def forward(self, data):
        """Compute the encodings for ``data``, attach them and return ``data``."""
        eps = 1e-12
        num_nodes = data.num_nodes
        max_freqs = self.k

        # Laplacian (get_laplacian's default normalization setting), densified.
        lap_index, lap_weight = get_laplacian(
            data.edge_index,
            data.edge_weight,
            num_nodes=num_nodes,
        )
        dense_lap = to_scipy_sparse_matrix(lap_index, lap_weight, num_nodes).toarray()

        evals, evects = np.linalg.eigh(dense_lap)

        N = len(evals)  # Number of nodes, including disconnected nodes.
        assert N == num_nodes

        # Keep up to the maximum desired number of frequencies (smallest first).
        keep = evals.argsort()[:max_freqs]
        evals = torch.from_numpy(np.real(evals[keep])).clamp_min(0)
        evects = torch.from_numpy(np.real(evects[:, keep])).float()

        # L2-normalize each eigenvector (column); eps guards against a zero norm.
        col_norms = evects.norm(p=2, dim=0, keepdim=True).clamp_min(eps)
        evects = evects / col_norms.expand_as(evects)

        # Pad both outputs with NaN up to `max_freqs` columns for small graphs.
        n_missing = max_freqs - N
        if n_missing > 0:
            evects = F.pad(evects, (0, n_missing), value=float('nan'))
            evals = F.pad(evals, (0, n_missing), value=float('nan'))

        # Broadcast the eigenvalues to every node: (k,) -> (N, k, 1).
        eigvals_per_node = evals.unsqueeze(0).repeat(N, 1).unsqueeze(2)

        data = add_node_attr(data, evects, attr_name='laplacian_eigenvector_pe')
        data = add_node_attr(data, eigvals_per_node, attr_name='laplacian_eigenvalues_pe')
        return data

# A single transform instance serves every graph: LapPE_GPS only stores its
# constructor arguments and `forward` does not modify the instance.
lappe = LapPE_GPS(k=10)

for name, graphs in (('peptides-func', peptides_func),
                     ('peptides-struct', peptides_struct)):
    print(f'Processing LapPE for {name}...')
    for i, d in enumerate(graphs, start=1):
        # `forward` writes the encodings onto `d` itself, so the list entry
        # is updated without reassignment.
        lappe.forward(d)

        # Progress report every 1000 graphs.
        if i % 1000 == 0:
            print(i)

Processing LapPE for peptides-func...
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
Processing LapPE for peptides-struct `...
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000


In [None]:
# Save processed data: one file per dataset holding the full list of graphs.
save_targets = (
    (peptides_func, 'DATA/peptides-func/processed/data.pt'),
    (peptides_struct, 'DATA/peptides-struct/processed/data.pt'),
)
for graphs, out_path in save_targets:
    torch.save(graphs, out_path)