# Load local dataset

In [4]:
import importlib
import random
import argparse
import configparser
import numpy as np
import networkx as nx
from collections import defaultdict

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_sparse
from torch import Tensor
from torch.nn import Linear
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
import torch.optim as optim

from torch_geometric.utils import negative_sampling, to_networkx
from typing import Union, Tuple
from torch_geometric.typing import OptPairTensor, Adj, OptTensor, Size
from torch_sparse import SparseTensor, matmul
from torch_geometric.nn.conv import MessagePassing

from ogb.linkproppred import PygLinkPropPredDataset, Evaluator


import networkx as nx
import seaborn as sns
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt

import scipy
import math


from dataset_utils import node_feature_utils
from dataset_utils.node_feature_utils import *
import my_utils as utils

importlib.reload(utils)



<module 'my_utils' from '/li_zhengdao/github/GenerativeGNN/my_utils.py'>

In [5]:
# Load specific dataset:

import sys,os
sys.path.append(os.getcwd())


from PrepareDatasets import DATASETS
import my_utils
import dataset_utils


print(DATASETS.keys())
"""
    'REDDIT-BINARY': RedditBinary,
    'REDDIT-MULTI-5K': Reddit5K,
    'COLLAB': Collab,
    'IMDB-BINARY': IMDBBinary,
    'IMDB-MULTI': IMDBMulti,
    'ENZYMES': Enzymes,
    'PROTEINS': Proteins,
    'NCI1': NCI1,
    'DD': DD,
    "MUTAG": Mutag,
    'CSL': CSL
"""

data_names = ['PROTEINS']
data_names = ['DD']
data_names = ['ENZYMES']
data_names = ['NCI1']
data_names = ['IMDB-MULTI']
data_names = ['REDDIT-BINARY']
data_names = ['CIFAR10']
data_names = ['ogbg_molhiv']


# NOTE:new kernel:
data_names = ['MUTAG']
data_names = ['DD', 'PROTEINS', 'ENZYMES']

data_names = ['ogbg_moltox21','ogbg-molbace']
data_names = []

datasets_obj = {}
for k, v in DATASETS.items():
    if k not in data_names:
        continue
    
    print('loaded dataset, name:', k)
    dat = v(use_node_attrs=True)
    datasets_obj[k] = dat
    # print(type(dat.dataset.get_data()))

dict_keys(['REDDIT-BINARY', 'REDDIT-MULTI-5K', 'COLLAB', 'IMDB-BINARY', 'IMDB-MULTI', 'NCI1', 'AIDS', 'ENZYMES', 'PROTEINS', 'DD', 'MUTAG', 'CSL', 'CIFAR10', 'MNIST', 'PPI', 'hiv', 'bace', 'bbpb', 'ogbg_molhiv', 'ogbg_ppa', 'PTC', 'QM9', 'ogbg_moltox21', 'ogbg-molbbbp', 'ogbg-molbace', 'syn_cc', 'syn_degree'])


In [6]:
# load syn cc datasets:


def get_new_config():
    return {'model': 'GIN', 'device': 'cuda:0', 'batch_size': 128, 'learning_rate': 0.001, 'classifier_epochs':
    200, 'hidden_units': [64, 300, 300, 64], 'layer_num': 5, 'optimizer': 'Adam', 
    'scheduler': {'class': 'StepLR', 'args': {'step_size': 50, 'gamma': 0.5}}, 
    'loss': 'MulticlassClassificationLoss', 'train_eps': False, 'l2': 0.0, 'aggregation': 'mean', 'gradient_clipping': None, 
    'dropout': 0.5, 'early_stopper': {'class': 'Patience', 'args': {'patience': 30, 'use_loss': False}},
    'shuffle': True, 'resume': False,
    'additional_features': 'degree', 'node_attribute': False,
    'shuffle_feature': False, 'roc_auc': True, 'use_10_fold': True, 
    'mol_split': False, 'dataset': 'syn_degree', 
    'config_file': 'gnn_comparison/config_GIN_degree.yml', 
    'experiment': 'endtoend', 
    'result_folder': 'results/result_0530_GIN_degree_syn_degree_0.1_class2', 
    'dataset_name': 'syn_degree', 'dataset_para': '0.1_class2', 'outer_folds': 10, 
    'outer_processes': 2, 'inner_folds': 5, 'inner_processes': 1, 'debug': True, 'ogb_evl': False, 
    'model_name': 'GIN', 'device': 'cuda:0', 'batch_size': 128,
    'learning_rate': 0.001, 'classifier_epochs': 200,
    'hidden_units': [64, 300, 300, 64], 'layer_num': 5,
    'train_eps': False, 'l2': 0.0,
    'aggregation': 'mean',
    'gradient_clipping': None, 'dropout': 0.5, 
    'shuffle': True, 'resume': False, 'additional_features': 'degree',
    'node_attribute': False, 
    'shuffle_feature': False, 'roc_auc': True, 'use_10_fold': True, 
    'mol_split': False, 'dataset_name': 'syn_degree', 
    'experiment': 'endtoend', 'result_folder': 'results/result_0530_GIN_degree_syn_degree_0.1_class2',
    'dataset_para': '0.1_class2', 'outer_folds': 10, 
    'outer_processes': 2, 'inner_folds': 5, 'inner_processes': 1, 'debug': True, 'ogb_evl': False}


cc_datasets = []
for i in range(1, 10):
    configs = get_new_config()
    corrs = round(i/10.0, 1)
    configs['dataset_para'] = f'{corrs}_class5'
    cc_datasets.append(DATASETS['syn_cc'](config=configs))



#

processed_dir:  DATA/syn_cc/processed
load dataset !
SynDataset load data_path: DATA/syn_cc_0.1_class5.pkl
!!!! _dim_target:  5
dataset len:  4096
load splits: DATA/syn_cc/processed/syn_cc_0.1_class5_splits.json
split counts: 10
processed_dir:  DATA/syn_cc/processed
load dataset !
SynDataset load data_path: DATA/syn_cc_0.2_class5.pkl
!!!! _dim_target:  5
dataset len:  4096
load splits: DATA/syn_cc/processed/syn_cc_0.2_class5_splits.json
split counts: 10
processed_dir:  DATA/syn_cc/processed
load dataset !
SynDataset load data_path: DATA/syn_cc_0.3_class5.pkl
!!!! _dim_target:  5
dataset len:  4096
load splits: DATA/syn_cc/processed/syn_cc_0.3_class5_splits.json
split counts: 10
processed_dir:  DATA/syn_cc/processed
load dataset !
SynDataset load data_path: DATA/syn_cc_0.4_class5.pkl
!!!! _dim_target:  5
dataset len:  4096
load splits: DATA/syn_cc/processed/syn_cc_0.4_class5_splits.json
split counts: 10
processed_dir:  DATA/syn_cc/processed
load dataset !
SynDataset load data_path: DATA

In [7]:
# use mlp to train:
import pickle as pk
ccpath = '/li_zhengdao/github/GenerativeGNN/DATA/syn_cc/processed/graphwise_syn_cc_0.9_class5_add_avg_cc.pkl'

with open(ccpath, 'rb') as f:
    cc = pk.load(f)
    print(len(cc))
    print(cc[4000])

4096
[0.29738563]


In [11]:
# check the balance of the dataset:

d = cc_datasets[-1]

labels = []
for i in d.dataset:
    labels.append(i.y.item())
    
from collections import Counter

print(Counter(labels))

print((1477+1182+1125)/4096.0)


Counter({2: 1477, 1: 1182, 3: 1125, 0: 164, 4: 148})
0.923828125


In [11]:
def get_each_folder(dataset, fold_id, batch_size=1):
    
    fold_test = dataset.get_test_fold(fold_id, batch_size=batch_size, shuffle=True).dataset
    fold_train, fold_val = dataset.get_model_selection_fold(fold_id, inner_idx=None,
                                                                          batch_size=batch_size, shuffle=True)
    fold_train = fold_train.dataset
    fold_val = fold_val.dataset
    
    # train_G = [pyg_utils.to_networkx(d, node_attrs=['x']) for d in fold_train.get_subset()]
    # test_G = [pyg_utils.to_networkx(d, node_attrs=['x']) for d in fold_test.get_subset()]
    # print('x: ',train_G[0].nodes[0]['x'])
    
    train_adjs, test_adjs = [], []
    train_y, test_y = [], []
    
    def node_fea_to_dict(node_fea):
        res = {}
        for i in range(node_fea.shape[0]):
            res[i] = node_fea[i]
        return res
    print('train len:', len(fold_train))
    print('test len:', len(fold_test))
    print('total len: ', len(dataset.dataset))
    
    
    if hasattr(fold_train, "get_subset"):
        for d in fold_train.get_subset():

            train_y.append(d.y.item())
            train_adjs.append([d.to_numpy_array()])

        for d in fold_test.get_subset():
            test_y.append(d.y.item())
            test_adjs.append([d.to_numpy_array()])
            
    else:
        train_adjs = dataset.get_dense_adjs(fold_train)
        test_adjs = dataset.get_dense_adjs(fold_test)
        
        for d in fold_train:
            train_y.append(d.y)
        # is_labeled = data.y == data.y
        for d in fold_test:
            test_y.append(d.y)
            
        train_y = torch.cat(train_y, dim=0)
        test_y = torch.cat(test_y, dim=0)
        
        print('train y shape:', train_y.shape)
        print('test y shape:', test_y.shape)
    return train_adjs, test_adjs, train_y, test_y
    # do not use val for kernel methods.
#     for d in fold.dataset.get_subset():

In [12]:
# Transform from networkx
from grakel.utils import graph_from_networkx

# Multilabel Classification Example:

In [13]:
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score

from grakel.datasets import fetch_dataset
from grakel.kernels import ShortestPath
import numpy as np
from grakel.kernels import WeisfeilerLehman,SubgraphMatching
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from grakel import Graph
from grakel import utils as g_utils

import networkx as nx
# Loads the MUTAG dataset



# Define the Weisfeiler-Lehman kernel

def train_with_wl_kernel(wl_kernel, train_adj_matrices, test_adj_matrices, train_labels, test_labels):
    y_train = train_labels
    y_test = test_labels
    
    
    def transform_to_gr_graphs(adjs):
        nx_gs = []
        all_node_labels = []
        for m in adjs:
            if isinstance(m, list) or len(m.shape) > 2:
                nx_g = nx.from_numpy_array(m[0])
                N = m[0].shape[0]
            else:
                nx_g = nx.from_numpy_array(m)
                N = m.shape[0]
                
            node_labels = {i:0 for i in range(N)}
            nx_gs.append(nx_g)
            all_node_labels.append(node_labels)
        
        gr_graphs =  [g for g in g_utils.graph_from_networkx(nx_gs, as_Graph=True)]
        
        for i, g in enumerate(gr_graphs):
            g.node_labels = all_node_labels[i]
            
        return gr_graphs
    
    
    train_graphs = transform_to_gr_graphs(train_adj_matrices)
    test_graphs = transform_to_gr_graphs(test_adj_matrices)
    
    wl_kernel.fit(train_graphs)

    # Transform the graphs using the Weisfeiler-Lehman kernel
    X_train = wl_kernel.transform([graph for graph in train_graphs])
    X_test = wl_kernel.transform([graph for graph in test_graphs])

    # Train an SVM classifier on the transformed training data
    svm = SVC()
    
    if y_train.dim() > 1 and y_train.shape[-1] > 1:
        multilabel_classifier = MultiOutputClassifier(svm, n_jobs=-1)

        # Fit the data to the Multilabel classifier
        not_label = y_train == y_train
        
        multilabel_classifier = multilabel_classifier.fit(X_train[not_label], y_train[not_label])

        # Get predictions for test data
        y_test_pred = multilabel_classifier.predict(X_test)
    else:
        svm.fit(X_train, y_train)
        # Predict labels on the validation and test data using the trained SVM classifier
        y_test_pred = svm.predict(X_test)

    # Calculate the accuracy of the SVM classifier on the validation and test data
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    return test_accuracy



# MUTAG = fetch_dataset("MUTAG", verbose=False)
# G, y = MUTAG.data, MUTAG.target
# print('G10:', G[0])

def train_with_kernel(gk, dataset_name):
    res=[]
    for i in range(10):
        G_train, G_test, y_train, y_test = get_each_folder(dataset_name, i)
        
        # G_train = [g for g in graph_from_networkx(G_train,node_labels_tag='x')]
        # G_test = [g for g in graph_from_networkx(G_test,node_labels_tag='x')]
        # print('G_train 10:',G_train[:10])
        
        # G_train, G_test, y_train, y_test = train_test_split(G_train, y_train, test_size=0.1)
        # Uses the shortest path kernel to generate the kernel matrices
        if isinstance(gk, WeisfeilerLehman) or isinstance(gk, SubgraphMatching):
            res.append(train_with_wl_kernel(gk,  G_train, G_test, y_train, y_test))
        else:
            K_train = gk.fit_transform(G_train)
            K_test = gk.transform(G_test)

            # Uses the SVM classifier to perform classification
            clf = SVC(kernel="precomputed")
            clf.fit(K_train, y_train)
            y_pred = clf.predict(K_test)

            # Computes and prints the classification accuracy
            acc = accuracy_score(y_test, y_pred)
            res.append(acc)
            # print("Accuracy:", str(round(acc*100, 2)) + "%")
        
    res = np.array(res)
    print(f'Acc, mean: {round(np.mean(res)*100, 4)}, std: {round(100*np.std(res),4)}')

# Train syn

In [14]:
# MUTAG

from grakel.kernels import ShortestPath, WeisfeilerLehman, SubgraphMatching

for d in cc_datasets:
    train_with_kernel(WeisfeilerLehman(n_iter=25), d)
    break
    # train_with_kernel(SubgraphMatching(), data_names[0])
    # train_with_kernel(ShortestPath(normalize=True, with_labels=False), data_names[0])
        

<class 'dict'>
2
<class 'list'>
3317
221
train len: 3317
test len: 410
total len:  4096
train y shape: torch.Size([3317])
test y shape: torch.Size([410])
<class 'dict'>
2
<class 'list'>
3317
266
train len: 3317
test len: 410
total len:  4096
train y shape: torch.Size([3317])
test y shape: torch.Size([410])
<class 'dict'>
2
<class 'list'>
3317
3804
train len: 3317
test len: 410
total len:  4096
train y shape: torch.Size([3317])
test y shape: torch.Size([410])
<class 'dict'>
2
<class 'list'>
3317
1822
train len: 3317
test len: 410
total len:  4096
train y shape: torch.Size([3317])
test y shape: torch.Size([410])
<class 'dict'>
2
<class 'list'>
3317
1455
train len: 3317
test len: 410
total len:  4096
train y shape: torch.Size([3317])
test y shape: torch.Size([410])
<class 'dict'>
2
<class 'list'>
3317
2136
train len: 3317
test len: 410
total len:  4096
train y shape: torch.Size([3317])
test y shape: torch.Size([410])
<class 'dict'>
2
<class 'list'>
3318
3158
train len: 3318
test len: 409


In [59]:
# MUTAG

from grakel.kernels import ShortestPath, WeisfeilerLehman, SubgraphMatching



print('kernel on:', data_names[0])

gks = [ShortestPath(normalize=True, with_labels=False),
      WeisfeilerLehman(n_iter=5),
      SubgraphMatching(normalize=True)]

train_with_kernel(WeisfeilerLehman(n_iter=5), data_names[0])
train_with_kernel(SubgraphMatching(), data_names[0])
train_with_kernel(ShortestPath(normalize=True, with_labels=False), data_names[0])
    

kernel on: MUTAG
Acc, mean: 85.117, std: 8.0719
Acc, mean: 85.117, std: 8.0719
Acc, mean: 78.7719, std: 6.575


In [6]:
# run:

from grakel.kernels import ShortestPath, WeisfeilerLehman, SubgraphMatching


for name in data_names:
    print('kernel on:', name)
    gks = [ShortestPath(normalize=True, with_labels=False),
        WeisfeilerLehman(n_iter=5),
        SubgraphMatching(normalize=True)]

    train_with_kernel(WeisfeilerLehman(n_iter=5), data_names[0])
    train_with_kernel(SubgraphMatching(), data_names[0])
    train_with_kernel(ShortestPath(normalize=True, with_labels=False), data_names[0])
    

kernel on: DD


# SVM for $|V|+\alpha|E|$

In [7]:
from dataset_utils.node_feature_utils import graph_invariant

def train_simple_svm(kernel_name, dataset_name, folds_num=10):
    res = []
    auc = []
    for i in range(folds_num):
        train_adjs, test_adjs, train_y, test_y= get_each_folder(dataset_name, i)
        # NOTE: adj -> graph_features
        
        train_x = [graph_invariant(adj=adj[0]) for adj in train_adjs]
        test_x = [graph_invariant(adj=adj[0]) for adj in test_adjs]
        Classifier = SVC(kernel=kernel_name)
        Classifier.fit(train_x, train_y)
        y_pred = Classifier.predict(test_x)
        # Computes and prints the classification accuracy
        acc = accuracy_score(test_y, y_pred)
        rocauc = roc_auc_score(test_y, y_pred)
        res.append(acc)
        auc.append(rocauc)
        # print("Accuracy:", str(round(acc*100, 2)) + "%")
         
    res = np.array(res)
    auc = np.array(auc)
    
    print(f'Acc, mean: {round(np.mean(res)*100, 4)}, std: {round(100*np.std(res),4)}')
    print(f'ROCAUC, mean: {round(np.mean(auc)*100, 4)}, std: {round(100*np.std(auc),4)}')





In [28]:
# MUTAG:
print('kernel on:', data_names[0])
for kr in ['linear', 'poly', 'rbf', 'sigmoid']:
    train_simple_svm(kr)

Acc, mean: 85.117, std: 8.0719
Acc, mean: 82.9532, std: 7.7978
Acc, mean: 86.2281, std: 8.5031
Acc, mean: 66.4912, std: 2.2807


In [7]:
# Proteins:

print('kernel on:', data_names[0])
for kr in ['linear', 'poly', 'rbf', 'sigmoid']:
    print('kernel used:', kr)
    train_simple_svm(kr)

kernel on: PROTEINS
Acc, mean: 69.1739, std: 4.5649
Acc, mean: 59.5681, std: 0.1659
Acc, mean: 72.5, std: 2.5759
Acc, mean: 59.1144, std: 5.4115


In [7]:
# DD:

print('kernel on:', data_names[0])
for kr in ['linear', 'poly', 'rbf', 'sigmoid']:
    print('kernel used:', kr)
    train_simple_svm(kr)

kernel on: DD
kernel used: linear
Acc, mean: 75.5556, std: 2.3243
kernel used: poly
Acc, mean: 65.8743, std: 2.5075
kernel used: rbf
Acc, mean: 76.0648, std: 3.2092
kernel used: sigmoid
Acc, mean: 62.3801, std: 12.4925


In [14]:
# ENZYMES:

print('kernel on:', data_names[0])
for kr in ['linear', 'poly', 'rbf', 'sigmoid']:
    train_simple_svm(kr)

kernel on: ENZYMES
Acc, mean: 22.3333, std: 4.6068
Acc, mean: 20.5, std: 3.5785
Acc, mean: 22.3333, std: 3.8152
Acc, mean: 13.3333, std: 4.5947


In [11]:
# NCI1:

print('kernel on:', data_names[0])
for kr in ['linear', 'poly', 'rbf', 'sigmoid']:
    train_simple_svm(kr)

kernel on: NCI1
Acc, mean: 62.5061, std: 1.8026
Acc, mean: 59.927, std: 1.4106
Acc, mean: 62.5061, std: 1.7927
Acc, mean: 37.7859, std: 2.1462


: 

In [8]:
# COLLAB:

print('kernel on:', data_names[0])
for kr in ['linear', 'poly', 'rbf']:
    train_simple_svm(kr)

kernel on: COLLAB
Acc, mean: 53.88, std: 1.1737
Acc, mean: 60.82, std: 0.6161
Acc, mean: 61.66, std: 1.1351


In [9]:
# IMDB-M:

print('kernel on:', data_names[0])
for kr in ['poly', 'rbf']:
    train_simple_svm(kr, data_names[0])

kernel on: IMDB-MULTI
Acc, mean: 33.4, std: 0.8138
Acc, mean: 40.3333, std: 3.7977


In [6]:
# CIFAR10

print('kernel on:', data_names[0])
for kr in ['poly', 'rbf']:
    train_simple_svm(kr, data_names[0], folds_num=1)

kernel on: CIFAR10
idxs keys: dict_keys(['train', 'validation'])




Acc, mean: 12.31, std: 0.0
idxs keys: dict_keys(['train', 'validation'])
Acc, mean: 14.62, std: 0.0


In [7]:
# HIV
print('kernel on:', data_names[0])
for kr in ['poly', 'rbf']:
    train_simple_svm(kr, data_names[0], folds_num=1)

kernel on: ogbg_molhiv
idxs keys: dict_keys(['train', 'validation'])
Acc, mean: 96.8393, std: 0.0
ROCAUC, mean: 50.0, std: 0.0
idxs keys: dict_keys(['train', 'validation'])
Acc, mean: 96.8393, std: 0.0
ROCAUC, mean: 50.0, std: 0.0


In [8]:
# REDDIT-B
print('kernel on:', data_names[0])
for kr in ['rbf']:
    train_simple_svm(kr, data_names[0], folds_num=1)

kernel on: REDDIT-BINARY
idxs keys: dict_keys(['train', 'validation'])
Acc, mean: 79.0, std: 0.0
ROCAUC, mean: 79.0, std: 0.0


In [9]:
# REDDIT-B
print('kernel on:', data_names[0])
for kr in ['poly']:
    train_simple_svm(kr, data_names[0], folds_num=1)

kernel on: REDDIT-BINARY
idxs keys: dict_keys(['train', 'validation'])
Acc, mean: 61.0, std: 0.0
ROCAUC, mean: 61.0, std: 0.0
