In [1]:
import os
import sys
# When the notebook is not started from the Submodular project root, add the
# root (relative to this notebook's folder) to the import path so the
# project-local modules imported in later cells can be found.
# The membership test keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
if not os.getcwd().endswith("Submodular") and '../../Submodular' not in sys.path:
    sys.path.append('../../Submodular')

In [2]:
import DeviceDir

# Fetch machine-specific settings from the project-local DeviceDir module.
# NOTE(review): DIR is later concatenated directly with 'LINKXdataset/...',
# so it presumably ends with a path separator -- confirm in DeviceDir.
DIR, RESULTS_DIR = DeviceDir.get_directory()
device, NUM_PROCESSORS = DeviceDir.get_device()

In [19]:
from ipynb.fs.full.Dataset import get_data
from ipynb.fs.full.Dataset import datasets as available_datasets
from ipynb.fs.full.Utils import save_plot

In [20]:
import argparse
import sys
import os
from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.utils import to_undirected, sort_edge_index
from torch_geometric.data import NeighborSampler, ClusterData, ClusterLoader, Data, GraphSAINTNodeSampler, GraphSAINTEdgeSampler, GraphSAINTRandomWalkSampler, RandomNodeSampler
from torch_scatter import scatter

from logger import Logger, SimpleLogger
from dataset import load_nc_dataset, NCDataset
from data_utils import normalize, gen_normalized_adjs, evaluate, eval_acc, eval_rocauc, to_sparse_tensor
from parse import parse_method, parser_add_main_args
from batch_utils import nc_dataset_to_torch_geo, torch_geo_to_nc_dataset, AdjRowLoader, make_loader

In [21]:
import argparse
from argparse import ArgumentParser

#set default arguments here
def get_configuration(argv=None):
    """Build the training-pipeline argument parser and parse the arguments.

    Args:
        argv: Optional list of argument strings to parse. When None (the
            default) argparse falls back to sys.argv[1:], which is what this
            function always did before the parameter existed.

    Returns:
        A tuple ``(args, dict_args)``: the parsed ``argparse.Namespace`` and
        ``vars(args)``, i.e. the namespace's own attribute dict (a live view,
        not a copy).
    """

    def _str2bool(text):
        """Turn a command-line token such as 'true' or '0' into a bool."""
        if isinstance(text, bool):
            return text
        lowered = text.strip().lower()
        if lowered in ('true', 't', 'yes', 'y', '1'):
            return True
        # The empty string is kept as False because bool('') was the only way
        # to obtain False from the previous ``type=bool`` declaration.
        if lowered in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError(
            "expected a boolean value, got {!r}".format(text))

    ### Parse args ###
    parser = argparse.ArgumentParser(description='General Training Pipeline')
    # Project helper that registers the shared arguments on the parser.
    parser_add_main_args(parser)
    parser.add_argument('--train_batch', type=str, default='cluster', help='type of mini batch loading scheme for training GNN')
    parser.add_argument('--no_mini_batch_test', action='store_true', help='whether to test on mini batches as well')
    parser.add_argument('--batch_size', type=int, default=10000)
    parser.add_argument('--num_parts', type=int, default=100, help='number of partitions for partition batching')
    parser.add_argument('--cluster_batch_size', type=int, default=1, help='number of clusters to use per cluster-gcn step')
    parser.add_argument('--saint_num_steps', type=int, default=5, help='number of steps for graphsaint')
    parser.add_argument('--test_num_parts', type=int, default=10, help='number of partitions for testing')

    #parser.add_argument('--epochs', type=int, default=1)
    # type=bool would treat every non-empty string (including "False") as
    # True, so boolean options go through an explicit converter instead.
    parser.add_argument('--log_info', type=_str2bool, default=True)
    parser.add_argument('--pbar', type=_str2bool, default=False)
    #parser.add_argument('--batch_size', type=int, default=2048)
    parser.add_argument('--learning_rate', type=float, default=0.01)
    parser.add_argument('--num_gpus', type=int, default=-1)
    parser.add_argument('--parallel_mode', type=str, default="dp", choices=['dp', 'ddp', 'ddp2'])
    #parser.add_argument('--dataset', type=str, default="Cora", choices=available_datasets)
    #parser.add_argument('--use_normalization', action='store_false', default=True)
    parser.add_argument('--use_normalization', action='store_true')
    parser.add_argument('-f') ##dummy for jupyternotebook

    args = parser.parse_args(argv)

    dict_args = vars(args)

    return args, dict_args

# Parse the configuration once for the whole notebook. dict_args is vars(args),
# the namespace's own attribute dict, so later assignments to fields of `args`
# are visible through dict_args as well.
args, dict_args = get_configuration()

In [22]:
import os.path as osp
import torch
import torch.nn as nn
import torch.nn.functional as F
# from torch_geometric.datasets import LINKXDataset
# from torch_geometric.nn import LINKX
import numpy as np
from tqdm import tqdm
from torch_geometric.loader import NeighborSampler, NeighborLoader
from torch_sparse import SparseTensor, matmul
from torch_geometric.nn import GCNConv, SGConv, GATConv, JumpingKnowledge, APPNP, GCN2Conv, MessagePassing
from torch_geometric.nn.conv.gcn_conv import gcn_norm
import scipy.sparse
import time

# Dataset

In [23]:
# Set log_info explicitly (the parser default is already True).
args.log_info = True

# Dataset and model selection.
# NOTE(review): args.dataset is reassigned to 'wiki' in the dataset-loading
# cell further down, so the 'Cora' value set here is not the one that reaches
# load_nc_dataset.
DATASET_NAME = 'Cora'
args.dataset = DATASET_NAME
gnn_name = 'linkx'

args.method = gnn_name
# train_batch replaces the parser default 'cluster'; num_parts restates the
# parser default of 100.
args.train_batch = 'random'
args.num_parts = 100

# data, dataset = get_data(DATASET_NAME, DIR=None, log=False, h_score=True, split_no=0); print("")
# print(data)

In [30]:
# args.dataset = 'fb100'
# args.sub_dataset = 'Penn94'

#pokec, arxiv-year

# Choose the dataset to load; this replaces any args.dataset value assigned
# in an earlier cell.
args.dataset = 'wiki'
args.sub_dataset = ''

# load_nc_dataset is imported from the project-local `dataset` module. The
# recorded output of this cell shows it downloading the wiki feature, edge
# and view files and printing their shapes.
dataset = load_nc_dataset(args.dataset, args.sub_dataset)

Downloading...
From (uriginal): https://drive.google.com/uc?id=1ySNspxbK-snNoAZM7oxiWGvOnTRdSyEK
From (redirected): https://drive.google.com/uc?id=1ySNspxbK-snNoAZM7oxiWGvOnTRdSyEK&confirm=t&uuid=122487d5-be2a-44da-b97c-f978dcaeed8e
To: /scratch/gilbreth/das90/Dataset/LINKX/data/wiki_features2M.pt
100%|██████████| 4.62G/4.62G [00:39<00:00, 118MB/s] 
Downloading...
From (uriginal): https://drive.google.com/uc?id=14X7FlkjrlUgmnsYtPwdh-gGuFla4yb5u
From (redirected): https://drive.google.com/uc?id=14X7FlkjrlUgmnsYtPwdh-gGuFla4yb5u&confirm=t&uuid=265be899-a60c-4443-9f74-36520d73f3f3
To: /scratch/gilbreth/das90/Dataset/LINKX/data/wiki_edges2M.pt
100%|██████████| 4.85G/4.85G [00:34<00:00, 140MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1p5DlVHrnFgYm3VsNIzahSsvCD424AyvP
To: /scratch/gilbreth/das90/Dataset/LINKX/data/wiki_views2M.pt
100%|██████████| 15.4M/15.4M [00:00<00:00, 83.9MB/s]


edges shape: torch.Size([2, 303434860])
features shape: 1925342
Label shape: 1925342


In [31]:
#dataset
# Show the first item of the loaded dataset; in the recorded output it is a
# pair of (graph dict with edge_index / edge_feat / node_feat / num_nodes,
# label tensor).
dataset[0]

({'edge_index': tensor([[      0,       0,       0,  ..., 1924550, 1924550, 1924550],
          [      1,       2,       3,  ...,  557978,   61041, 1920560]]),
  'edge_feat': None,
  'node_feat': tensor([[ 1.2721e-01, -1.1162e-01,  6.0405e-03,  ..., -1.0686e-01,
           -5.9631e-02, -7.6529e-03],
          [-1.7307e-01, -4.5984e-02,  2.5106e-01,  ..., -8.6675e-02,
           -7.4201e-02,  4.2817e-02],
          [-1.6128e-01,  2.4741e-01, -2.0739e-02,  ..., -1.2163e-01,
           -2.8721e-02, -1.8481e-02],
          ...,
          [ 2.2221e-01,  1.4399e-01,  8.5297e-02,  ..., -1.5650e-01,
           -1.5056e-01,  1.0138e-02],
          [-8.9892e-02,  9.6628e-02,  1.5188e-02,  ..., -1.2053e-01,
           -8.2116e-02, -4.7099e-02],
          [-1.6750e-04,  8.1035e-02,  1.0128e-01,  ..., -1.0855e-01,
           -1.6818e-01,  5.5043e-03]]),
  'num_nodes': 1925342},
 tensor([ 4,  4,  4,  ..., -1, -1, -1]))

In [32]:
# Convert the loaded dataset with the project helper from batch_utils.
# NOTE(review): the meaning of the second argument (0) is not visible here --
# presumably a split or graph index; confirm against batch_utils.
geo_data = nc_dataset_to_torch_geo(dataset, 0)

In [33]:
# Display the converted object's summary repr (field names and shapes).
geo_data

Data(x=[1925342, 600], edge_index=[2, 303434860], y=[1925342], node_ids=[1925342], mask=[1925342])

In [28]:
# FileName = DIR + 'LINKXdataset/'+ args.dataset+'.pt'

# # print(FileName)
# # if not os.path.exists(FileName):
# #         os.makedirs(FileName)

# torch.save(geo_data,FileName)

In [34]:
# Destination folder for the exported tensors. Plain string concatenation is
# kept so the path is exactly the one earlier runs produced; it relies on DIR
# already ending with a path separator (as it does in the recorded output).
FolderName = DIR + 'LINKXdataset/'+ args.dataset+'/'

print(FolderName)
# exist_ok=True creates the folder only when it is missing, without a separate
# os.path.exists() check that another process could invalidate in between.
os.makedirs(FolderName, exist_ok=True)

# Store node features, edge index and labels each in their own file.
torch.save(geo_data.x, FolderName+'x.pt')
torch.save(geo_data.edge_index, FolderName+'edge_index.pt')
torch.save(geo_data.y, FolderName+'y.pt')

/scratch/gilbreth/das90/Dataset/LINKXdataset/wiki/


In [4]:
from torch_geometric.datasets import AttributedGraphDataset

In [None]:
# datasets = ["BlogCatalog", "PPI", "Facebook", "Twitter", "MAG"]

# #DATASET_NAME in ["BlogCatalog", "PPI", "Facebook", "Twitter", "TWeibo", "MAG"]:

# for DATASET_NAME in datasets:

# #DATASET_NAME = "BlogCatalog"        
#     dataset = AttributedGraphDataset(root=DIR+'/AttributedGraphDatasetPYG2', name=DATASET_NAME)
        
#     print(dataset)
#     print(dataset[0])        
    

Blogcatalog()
Data(x=[5196, 8189], edge_index=[2, 343486], y=[5196])
Ppi()
Data(x=[56944, 50], edge_index=[2, 1612348], y=[56944, 121])
Facebook()
Data(x=[4039, 1283], edge_index=[2, 88234], y=[4039, 193])
Twitter()
Data(x=[81306, 216839, nnz=94616433], edge_index=[2, 2420766], y=[81306, 4065])


Downloading https://docs.google.com/uc?export=download&id=1ggraUMrQgdUyA3DjSRzzqMv0jFkU65V5&confirm=t
Extracting /scratch/gilbreth/das90/Dataset/AttributedGraphDatasetPYG2/mag/raw/uc
Processing...
