## Deep Graph Library Dataset

In [77]:
import os
import sys
# Derive the environment name from the interpreter path: strip the trailing
# "/bin/python" and keep the last path component.
_env_root = sys.executable.replace("/bin/python", "")
kernel_name = os.path.basename(_env_root)

# Only the 'py38cu11' environment gets the CUDA 11.2 libraries loaded by
# absolute path, cuSPARSE first and then cuBLAS.
if kernel_name == 'py38cu11':
    import ctypes
    for _cuda_lib in (
        "/apps/gilbreth/cuda-toolkit/cuda-11.2.0/lib64/libcusparse.so.11",
        "/apps/gilbreth/cuda-toolkit/cuda-11.2.0/lib64/libcublas.so.11",
    ):
        ctypes.cdll.LoadLibrary(_cuda_lib)

In [78]:
import os

# Select the PyTorch backend for DGL. The variable is set before `import dgl`
# on the next line, presumably so that the import picks it up; keep this order.
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import dgl.data
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [79]:
from sklearn.model_selection import train_test_split

def train_val_test_mask(graph, train=0.6, val=0.2, test=0.2, random_state=False):
    """Attach boolean train/val/test node masks to ``graph.ndata``.

    Node ids ``0..N-1`` are split twice with ``train_test_split``: first a
    ``val + test`` fraction is held out, then that held-out part is divided so
    that ``test / (val + test)`` of it becomes the test set.

    Args:
        graph: Object exposing ``num_nodes()`` and a writable ``ndata`` mapping.
        train: Accepted but never read; the training share is whatever remains
            after ``val + test`` has been held out.
        val: Fraction of all nodes used for validation.
        test: Fraction of all nodes used for testing.
        random_state: Treated as a flag, not as a seed. A truthy value gives
            an unseeded split; a falsy value (the default) pins the seed to 1
            for both splits.

    Returns:
        None. ``graph.ndata`` gains ``'train_mask'``, ``'val_mask'`` and
        ``'test_mask'``, each a bool tensor of length ``N``.
    """
    node_count = graph.num_nodes()
    held_out = val + test
    # Truthy flag -> leave the seed unset; otherwise use the fixed seed 1.
    seed = None if random_state else 1

    train_ids, remainder = train_test_split(
        list(range(node_count)), test_size=held_out, random_state=seed
    )
    val_ids, test_ids = train_test_split(
        remainder, test_size=test / held_out, random_state=seed
    )

    # Build all three masks first, then publish them on the graph.
    masks = {}
    for key, ids in (('train_mask', train_ids),
                     ('val_mask', val_ids),
                     ('test_mask', test_ids)):
        mask = torch.zeros(node_count, dtype=bool)
        mask[ids] = True
        masks[key] = mask

    for key, mask in masks.items():
        graph.ndata[key] = mask

    return None


In [80]:
def get_dataset(DIR, DATASET_NAME, log = True):
    """Load a node-classification dataset by name and return ``(dataset, graph)``.

    Names are matched exactly (case-sensitive): ``'Cora'`` selects
    ``CoraGraphDataset`` while ``'cora'`` selects ``CoraFullDataset``.

    For every name except ``'ogbn-arxiv'``/``'arxiv'`` the graph is
    ``dataset[0]`` and two fallbacks are applied:

    * if the graph has no ``'train_mask'``, a 60/20/20 split is attached with
      ``train_val_test_mask``;
    * if the graph has no ``'feat'``, an identity matrix of size
      ``num_nodes`` is used as features.

    The ``'ogbn-arxiv'`` branch returns early: reverse edges are added, the
    first label column is stored as ``ndata['label']``, and neither the
    fallbacks nor the logging below are applied.

    Args:
        DIR: Directory handed to the dataset constructors as ``raw_dir``
            (``root`` for OGB; ``DIR + 'yelp/'`` for ``'yelp'``). Ignored for
            ``'karate'``.
        DATASET_NAME: One of the names handled in the chain below.
        log: When true, print class count and feature/mask/label shapes.

    Returns:
        Tuple ``(dataset, g)``.

    Raises:
        ValueError: If ``DATASET_NAME`` is not one of the supported names.
    """

    if DATASET_NAME == 'karate':
        dataset = dgl.data.KarateClubDataset()
        #custom train mask

    elif DATASET_NAME == 'Cora':
        dataset = dgl.data.CoraGraphDataset(raw_dir = DIR)

    elif DATASET_NAME in ['CiteSeer']:
        dataset = dgl.data.CiteseerGraphDataset(raw_dir = DIR)

    elif DATASET_NAME == 'PubMed':
        dataset = dgl.data.PubmedGraphDataset(raw_dir = DIR)

    elif DATASET_NAME == 'cora':
        dataset = dgl.data.CoraFullDataset(raw_dir = DIR)
        #custom train mask

    elif DATASET_NAME in ['Computer', 'Computers']:
        dataset = dgl.data.AmazonCoBuyComputerDataset(raw_dir = DIR)
        #custom train mask

    elif DATASET_NAME in ['Photos', 'Photo']:
        dataset = dgl.data.AmazonCoBuyPhotoDataset(raw_dir = DIR)
        #custom train mask

    elif DATASET_NAME in ['CS', 'CoauthorCS']:
        dataset = dgl.data.CoauthorCSDataset(raw_dir = DIR)
        #custom train mask

    elif DATASET_NAME in ['Physics', 'CoauthorPhysics']:
        dataset = dgl.data.CoauthorPhysicsDataset(raw_dir = DIR)
        #custom train mask

    elif DATASET_NAME == 'Reddit':
        dataset = dgl.data.RedditDataset(raw_dir = DIR)

    elif DATASET_NAME == 'fraudyelp':
        dataset = dgl.data.FraudDataset(raw_dir = DIR, name = 'yelp')
        # Expose the features under the key the rest of this function expects.
        # NOTE(review): this relies on dataset[0] returning the same graph
        # object when it is fetched again below -- confirm.
        g = dataset[0]
        g.ndata['feat'] = g.ndata['feature']

    elif DATASET_NAME in ['flickr','Flickr']:
        dataset = dgl.data.FlickrDataset(raw_dir = DIR)

    elif DATASET_NAME  == 'yelp':
        dataset = dgl.data.YelpDataset(raw_dir = DIR+'yelp/')

    elif DATASET_NAME  in ['Chameleon','chameleon']:
        from ipynb.fs.full.HeterophilicDataset import ChameleonDataset
        dataset = ChameleonDataset(raw_dir = DIR)

    elif DATASET_NAME  in ['Squirrel','squirrel']:
        from ipynb.fs.full.HeterophilicDataset import SquirrelDataset
        dataset = SquirrelDataset(raw_dir = DIR)

    elif DATASET_NAME  in ['Cornell','cornell']:
        from ipynb.fs.full.HeterophilicDataset import CornellDataset
        dataset = CornellDataset(raw_dir = DIR)

    elif DATASET_NAME  in ['Actor','actor','Film','film']:
        from ipynb.fs.full.Actor import ActorDataset
        dataset = ActorDataset(raw_dir = DIR)

    elif DATASET_NAME  in ['Texas', 'texas']:
        from ipynb.fs.full.HeterophilicDataset import TexasDataset
        dataset = TexasDataset(raw_dir = DIR)

    # The misspelled 'Winconsin' variants are kept so existing callers still work.
    elif DATASET_NAME  in ['Wisconsin', 'wisconsin', 'Winconsin', 'winconsin']:
        from ipynb.fs.full.HeterophilicDataset import WisconsinDataset
        dataset = WisconsinDataset(raw_dir = DIR)

    elif DATASET_NAME in ['ogbn-arxiv', 'arxiv']:
        from ogb.nodeproppred import DglNodePropPredDataset
        dataset = DglNodePropPredDataset(root = DIR, name="ogbn-arxiv")
        g, node_labels = dataset[0]
        g = dgl.add_reverse_edges(g)
        g.ndata["label"] = node_labels[:, 0]

        return dataset, g

    else:
        # Previously an unknown name fell through to `dataset[0]` with
        # `dataset` unbound and surfaced as an opaque UnboundLocalError.
        raise ValueError(f"Unknown DATASET_NAME: {DATASET_NAME!r}")

    g = dataset[0]

    if 'train_mask' not in g.ndata:
        # random_state=1 is truthy, which train_val_test_mask treats as
        # "do not seed", so this split differs between runs.
        train_val_test_mask(g, train=0.6, val=0.2, test=0.2, random_state=1)
    if 'feat' not in g.ndata:
        # Featureless graph: one-hot node identity as features.
        g.ndata['feat'] = torch.eye(g.num_nodes())

    if log:

        print("Class: ", dataset.num_classes)
        print("Feature: ",g.ndata['feat'].shape)

        print("Train Mask: ",g.ndata['train_mask'].shape)
        print("Val Mask: ",g.ndata['val_mask'].shape)
        print("Test Mask: ",g.ndata['test_mask'].shape)
        print("Label Mask: ",g.ndata['label'].shape)

    return dataset, g

In [81]:
# import DeviceDir
    
# DIR, RESULTS_DIR = DeviceDir.get_directory()
# device, NUM_PROCESSORS = DeviceDir.get_device()

In [82]:
# dataset, graph = get_dataset(DIR, 'Chameleon')

# graph = dataset[0]
# print(graph.num_nodes())
# print(graph.num_edges())
# print(dataset.num_classes)
# print(graph.edges())

In [88]:
def get_pyg_as_dgl(DATASET_NAME='karate', DIR=None, params=None, train=None, random_state=False, log=True, h_score=False):
    """Load a dataset through the PyG loader and hand it back as a DGL graph.

    All arguments are forwarded unchanged, by keyword, to
    ``PygDataset.get_data``. The returned data object is converted with
    ``PygLib.to_dgl``, after which the node fields ``'x'`` and ``'y'`` are
    also made available under the names ``'feat'`` and ``'label'``.

    Returns:
        Tuple ``(dataset, g)``: the dataset object returned by ``get_data``
        and the converted graph.
    """
    from ipynb.fs.full.PygDataset import get_data
    import torch_geometric
    from ipynb.fs.full.PygLib import to_dgl

    loader_args = dict(
        DATASET_NAME=DATASET_NAME,
        DIR=DIR,
        params=params,
        train=train,
        random_state=random_state,
        log=log,
        h_score=h_score,
    )
    pyg_data, dataset = get_data(**loader_args)

    ## This is only available in the latest pytorch geometric, 2.1.x
    dgl_graph = to_dgl(pyg_data)

    # Mirror the PyG field names under the names used by the DGL code paths.
    for alias, source in (('feat', 'x'), ('label', 'y')):
        dgl_graph.ndata[alias] = dgl_graph.ndata[source]

    return dataset, dgl_graph

def get_dgl_as_pyg(DATASET_NAME='karate', DIR=None,log=True, split_no=0):
    """Load a dataset through ``get_dataset`` and return it in PyG form.

    The DGL graph is converted with ``PygLib.from_dgl``. If the train mask
    has more than one dimension, column ``split_no`` is selected from the
    train, val and test masks. The ``feat``/``label`` fields are then moved
    to ``x``/``y``.

    Args:
        DATASET_NAME: Name understood by ``get_dataset``.
        DIR: Data directory forwarded to ``get_dataset``.
        log: Forwarded to ``get_dataset``; also controls whether the converted
            object is printed here (previously it was printed unconditionally).
        split_no: Column to pick when the masks are two-dimensional.

    Returns:
        Tuple ``(dataset, g)`` with ``g`` the converted data object.
    """
    import torch_geometric
    from ipynb.fs.full.PygLib import from_dgl

    dataset, graph = get_dataset(DIR, DATASET_NAME, log = log)

    ## This is only available in the latest pytorch geometric, 2.1.x
    g = from_dgl(graph)

    # Only the train mask's rank is inspected; all three masks are then sliced.
    if g.train_mask.dim()>1:
        for mask_name in ('train_mask', 'val_mask', 'test_mask'):
            setattr(g, mask_name, getattr(g, mask_name)[:,split_no])

    # Rename feat/label to x/y; the old fields are set to None afterwards.
    g.x = g.feat
    g.y = g.label
    g.feat = None
    g.label = None

    if log:
        print(g)

    return dataset, g

In [84]:
def generate_synthetic(graph, dataset, d=5, h=0.8, train=0.6, random_state=None, log=True):
    """Replace the edges of ``graph`` in place with randomly drawn ones.

    A total budget of about ``n * d`` edges is split into an intra-class part
    (fraction ``h``) and an inter-class part (fraction ``1 - h``). Each part
    is spread over the nodes with a uniform multinomial draw. Every node ``u``
    then samples, without replacement, that many targets from its own class
    and from all other classes, capped by the size of the respective pool.
    A node may draw itself as an intra-class target, so self-loops can occur.

    Args:
        graph: Graph with ``ndata['label']``; its existing edges are removed.
        dataset: Object exposing ``num_classes``.
        d: Target average number of outgoing edges per node.
        h: Fraction of the edge budget spent on same-class edges.
        train: If not None, new masks are attached via ``train_val_test_mask``
            with the remainder split evenly between validation and test.
        random_state: Forwarded to ``train_val_test_mask`` only; edge sampling
            uses NumPy's global random state.
        log: Unused.

    Returns:
        None.
    """
    num_class = dataset.num_classes
    labels = graph.ndata['label']
    n = graph.num_nodes()

    # Node ids per class, as CPU NumPy arrays so they can feed np.random.choice.
    cluster_vertices = {
        c: torch.where(labels == c)[0].cpu().numpy() for c in range(num_class)
    }

    # multinomial needs an integer number of trials; n*d*h is a float and
    # would be rejected or silently truncated depending on the NumPy build.
    uniform = np.ones(n) / n
    intra_d = np.random.multinomial(int(round(n * d * h)), uniform, size=1)[0]
    inter_d = np.random.multinomial(int(round(n * d * (1 - h))), uniform, size=1)[0]

    no_vertices = np.empty(0, dtype=np.int64)
    sources, targets = [], []

    for c in range(num_class):
        intra_vertices = cluster_vertices[c]
        other_classes = [members for key, members in cluster_vertices.items() if key != c]
        # With a single class there is nothing to concatenate.
        inter_vertices = np.concatenate(other_classes) if other_classes else no_vertices

        for u in intra_vertices:
            intra_v = np.random.choice(
                intra_vertices, min(len(intra_vertices), intra_d[u]), replace=False
            )
            if len(inter_vertices):
                # Cap by the inter-class pool size. The original capped by the
                # intra-class size, which over-asked small pools (ValueError)
                # and under-sampled large ones.
                inter_v = np.random.choice(
                    inter_vertices, min(len(inter_vertices), inter_d[u]), replace=False
                )
            else:
                inter_v = no_vertices
            Vs = np.append(intra_v, inter_v)
            sources.append(np.repeat(u, len(Vs)))
            targets.append(Vs)

    if sources:
        src = torch.from_numpy(np.concatenate(sources).astype(np.int64))
        dst = torch.from_numpy(np.concatenate(targets).astype(np.int64))
    else:
        src = torch.empty(0, dtype=torch.long)
        dst = torch.empty(0, dtype=torch.long)

    # Drop every existing edge, then install the sampled ones.
    graph.remove_edges(torch.arange(graph.number_of_edges()))
    graph.add_edges(src, dst)

    if train is not None:
        val = (1-train)/2.0
        train_val_test_mask(graph, train=train, val=val, test=1-(train+val), random_state=random_state)

    return None

# generate_synthetic(graph, dataset, d=10, h=0.3, train=0.6, random_state=None, log=True)
# graph

In [85]:
if __name__ == '__main__':
    # Smoke test: load 'squirrel' through the PyG -> DGL path and print a
    # summary of the resulting graph.
    import DeviceDir

    DIR, RESULTS_DIR = DeviceDir.get_directory()
    device, NUM_PROCESSORS = DeviceDir.get_device()

    # Alternative, native DGL path:
    #     dataset, g = get_dataset(DIR, 'squirrel')
    dataset, g = get_pyg_as_dgl(DATASET_NAME='squirrel', DIR=DIR)
    print(g)

    print("Class: ", dataset.num_classes)
    for _caption, _field in (
        ("Feature: ", 'feat'),
        ("Train Mask: ", 'train_mask'),
        ("Val Mask: ", 'val_mask'),
        ("Test Mask: ", 'test_mask'),
        ("Label Mask: ", 'label'),
    ):
        print(_caption, g.ndata[_field].shape)
    

There are 1 GPU(s) available.
We will use the GPU: NVIDIA A10
cuda
Cpu count:  32
Looking at:  /scratch/gilbreth/das90/Dataset/
Data directory:  /scratch/gilbreth/das90/Dataset/
Result directory: /scratch/gilbreth/das90/Dataset/RESULTS/

Dataset: HeteroDataset():
Number of graphs: 1
Number of features: 2089
Number of classes: 5

Data(x=[5201, 2089], edge_index=[2, 217073], y=[5201], node_id=[5201], train_mask=[5201], val_mask=[5201], test_mask=[5201])
Number of nodes: 5201
Number of edges: 217073
Average node degree: 41.74
Number of training nodes: 3120
Training node label rate: 0.60
Has isolated nodes: False
Has self-loops: True
Is undirected: False
Graph(num_nodes=5201, num_edges=217073,
      ndata_schemes={'x': Scheme(shape=(2089,), dtype=torch.float32), 'node_id': Scheme(shape=(), dtype=torch.int64), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool), 'y': Scheme(shape=(), dtype=torch.int64), 'train_mask': Scheme(shape=(), dtype=torch.b

In [89]:
# import DeviceDir
    
# DIR, RESULTS_DIR = DeviceDir.get_directory()
# device, NUM_PROCESSORS = DeviceDir.get_device()

# dataset, g = get_dgl_as_pyg(DATASET_NAME='Squirrel', DIR=DIR, split_no=0)

# print(g)


There are 1 GPU(s) available.
We will use the GPU: NVIDIA A10
cuda
Cpu count:  32
Done loading data from cached files.
Class:  5
Feature:  torch.Size([5201, 2089])
Train Mask:  torch.Size([5201, 10])
Val Mask:  torch.Size([5201, 10])
Test Mask:  torch.Size([5201, 10])
Label Mask:  torch.Size([5201])
Data(edge_index=[2, 217073], train_mask=[5201], val_mask=[5201], test_mask=[5201], x=[5201, 2089], y=[5201])
Data(edge_index=[2, 217073], train_mask=[5201], val_mask=[5201], test_mask=[5201], x=[5201, 2089], y=[5201])


In [87]:
# datasets = [        
#     'cora', 
#     'Cora',
#     'citeseer',
#     'CiteSeer',
#     'PubMed',
#     'dblp',
#     'Reddit','Reddit2',
#     'Computers','Photo','CS',
#     'Physics','Flickr',
#     'film',
#     'cornell',
#     'texas',
#     'wisconsin',
#     'squirrel',
#     'chameleon',
#     'Fake',
#     'Moon',
#     'AmazonProducts',
#     'Yelp',
#     'karate',
#     'ogbn-arxiv'
#     ]