In [1]:
import os
import sys
# When the current working directory is not the "Submodular" folder, add the
# relative path '../../Submodular' to the import search path.
# NOTE(review): presumably this is what makes the project-local imports in the
# following cells resolvable — confirm against the repository layout.
if not os.getcwd().endswith("Submodular"):
    sys.path.append('../../Submodular')

In [2]:
import DeviceDir

# Directory locations and compute device come from the project-local DeviceDir module.
# `device` is read as a global by the evaluation/training functions further down.
# NOTE(review): the exact meaning of NUM_PROCESSORS is defined in DeviceDir — confirm there.
DIR, RESULTS_DIR = DeviceDir.get_directory()
device, NUM_PROCESSORS = DeviceDir.get_device()

In [3]:
from ipynb.fs.full.Dataset import get_data
from ipynb.fs.full.Dataset import datasets as available_datasets
from ipynb.fs.full.Utils import save_plot

In [4]:
import argparse
import sys
import os
from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.utils import to_undirected, sort_edge_index
from torch_geometric.data import NeighborSampler, ClusterData, ClusterLoader, Data, GraphSAINTNodeSampler, GraphSAINTEdgeSampler, GraphSAINTRandomWalkSampler, RandomNodeSampler
from torch_scatter import scatter

from logger import Logger, SimpleLogger
from dataset import load_nc_dataset, NCDataset
from data_utils import normalize, gen_normalized_adjs, evaluate, eval_acc, eval_rocauc, to_sparse_tensor
from parse import parse_method, parser_add_main_args
from batch_utils import nc_dataset_to_torch_geo, torch_geo_to_nc_dataset, AdjRowLoader, make_loader

In [5]:
import argparse
from argparse import ArgumentParser

#set default arguments here
def get_configuration():
    """Build and parse the command-line configuration for the training pipeline.

    The shared arguments are registered by ``parser_add_main_args``; the
    mini-batching, logging and optimisation options specific to this notebook
    are added on top. A dummy ``-f`` option is accepted so that the kernel
    connection-file argument passed by Jupyter does not abort parsing.

    Returns:
        tuple: ``(args, dict_args)`` where ``args`` is the parsed
        ``argparse.Namespace`` and ``dict_args`` is ``vars(args)``.

    Raises:
        SystemExit: raised by ``argparse`` when an argument cannot be parsed.
    """

    def _str2bool(value):
        # ``type=bool`` turns every non-empty string (including "False") into
        # True, so boolean options are parsed explicitly instead.
        if isinstance(value, bool):
            return value
        lowered = str(value).strip().lower()
        if lowered in ('true', 't', 'yes', 'y', '1'):
            return True
        if lowered in ('false', 'f', 'no', 'n', '0'):
            return False
        raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')

    ### Parse args ###
    parser = argparse.ArgumentParser(description='General Training Pipeline')
    parser_add_main_args(parser)
    parser.add_argument('--train_batch', type=str, default='cluster', help='type of mini batch loading scheme for training GNN')
    parser.add_argument('--no_mini_batch_test', action='store_true', help='whether to test on mini batches as well')
    parser.add_argument('--batch_size', type=int, default=10000)
    parser.add_argument('--num_parts', type=int, default=100, help='number of partitions for partition batching')
    parser.add_argument('--cluster_batch_size', type=int, default=1, help='number of clusters to use per cluster-gcn step')
    parser.add_argument('--saint_num_steps', type=int, default=5, help='number of steps for graphsaint')
    parser.add_argument('--test_num_parts', type=int, default=10, help='number of partitions for testing')

    #parser.add_argument('--epochs', type=int, default=1)
    parser.add_argument('--log_info', type=_str2bool, default=True)
    parser.add_argument('--pbar', type=_str2bool, default=False)
    #parser.add_argument('--batch_size', type=int, default=2048)
    parser.add_argument('--learning_rate', type=float, default=0.01)
    parser.add_argument('--num_gpus', type=int, default=-1)
    parser.add_argument('--parallel_mode', type=str, default="dp", choices=['dp', 'ddp', 'ddp2'])
    #parser.add_argument('--dataset', type=str, default="Cora", choices=available_datasets)
    #parser.add_argument('--use_normalization', action='store_false', default=True)
    parser.add_argument('--use_normalization', action='store_true')
    parser.add_argument('-f') ##dummy for jupyternotebook

    args = parser.parse_args()

    # Dictionary form of the same namespace, returned for convenience.
    dict_args = vars(args)

    return args, dict_args

# Parse the configuration once for the whole notebook; `args` is read as a
# global (e.g. `args.log_info`) by the evaluation and training functions below.
args, dict_args = get_configuration()

In [6]:
import os.path as osp
import torch
import torch.nn.functional as F
# from torch_geometric.datasets import LINKXDataset
# from torch_geometric.nn import LINKX
import numpy as np
from tqdm import tqdm
from torch_geometric.loader import NeighborSampler, NeighborLoader

# LINKX model

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_sparse import SparseTensor, matmul
from torch_geometric.nn import GCNConv, SGConv, GATConv, JumpingKnowledge, APPNP, GCN2Conv, MessagePassing
from torch_geometric.nn.conv.gcn_conv import gcn_norm
import numpy as np
import scipy.sparse
from tqdm import tqdm

In [8]:
class MLP(nn.Module):
    """Stack of linear layers with ReLU, batch norm and dropout in between.

    Adapted from https://github.com/CUAI/CorrectAndSmooth/blob/master/gen_models.py

    With ``num_layers == 1`` the network is a single linear map
    (``in_channels -> out_channels``, i.e. logistic regression). Otherwise it
    is ``in_channels -> hidden_channels -> ... -> out_channels``, with one
    ``BatchNorm1d`` after every linear layer except the last.

    Args:
        in_channels: size of each input row.
        hidden_channels: width of the intermediate layers.
        out_channels: size of each output row.
        num_layers: number of linear layers.
        dropout: dropout probability applied after each hidden block.
    """
    def __init__(self, in_channels, hidden_channels, out_channels, num_layers,
                 dropout=.5):
        super().__init__()
        self.lins = nn.ModuleList()
        self.bns = nn.ModuleList()

        if num_layers == 1:
            # just linear layer i.e. logistic regression
            widths = [in_channels, out_channels]
        else:
            # at least one hidden block is always built on this path
            n_hidden = 1 + max(num_layers - 2, 0)
            widths = [in_channels] + [hidden_channels] * n_hidden + [out_channels]

        final_idx = len(widths) - 2
        for idx, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
            self.lins.append(nn.Linear(fan_in, fan_out))
            if idx < final_idx:
                # every linear layer but the output one is followed by batch norm
                self.bns.append(nn.BatchNorm1d(fan_out))

        self.dropout = dropout

    def reset_parameters(self):
        """Re-initialise all linear layers, then all batch-norm layers."""
        for layer in (*self.lins, *self.bns):
            layer.reset_parameters()

    def forward(self, batch_data, input_tensor=False):
        """Run the network.

        Args:
            batch_data: the input tensor itself when ``input_tensor`` is true,
                otherwise an object whose ``x`` attribute holds the input.
            input_tensor: selects how ``batch_data`` is interpreted.

        Returns:
            Output of the last linear layer (no activation applied).
        """
        x = batch_data if input_tensor else batch_data.x
        for lin, bn in zip(self.lins[:-1], self.bns):
            hidden = F.relu(lin(x), inplace=True)
            x = F.dropout(bn(hidden), p=self.dropout, training=self.training)
        return self.lins[-1](x)

In [9]:
class LINKXcustom(nn.Module):
    """ our LINKX method with skip connections
        a = MLP_1(A), x = MLP_2(X), MLP_3(sigma(W_1[a, x] + a + x))

    Args:
        in_channels: number of input feature columns (input width of MLP_2).
        hidden_channels: width of both embeddings and of the mixing layer.
        out_channels: number of output columns produced by MLP_3.
        num_layers: number of linear layers in MLP_3.
        num_nodes: number of adjacency columns (input width of MLP_1).
        dropout: dropout probability used inside MLP_3.
        cache: accepted for interface compatibility; not used by this class.
        inner_activation: apply an extra ReLU to W_1[a, x] before the skip sum.
        inner_dropout: apply dropout to W_1[a, x] before the skip sum.
        init_layers_A: number of linear layers in MLP_1.
        init_layers_X: number of linear layers in MLP_2.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, num_layers, num_nodes, dropout=.5, cache=False, inner_activation=False, inner_dropout=False, init_layers_A=1, init_layers_X=1):
        super(LINKXcustom, self).__init__()
        self.mlpA = MLP(num_nodes, hidden_channels, hidden_channels, init_layers_A, dropout=0)
        self.mlpX = MLP(in_channels, hidden_channels, hidden_channels, init_layers_X, dropout=0)
        self.W = nn.Linear(2*hidden_channels, hidden_channels)
        self.mlp_final = MLP(hidden_channels, hidden_channels, out_channels, num_layers, dropout=dropout)
        self.in_channels = in_channels
        self.num_nodes = num_nodes
        self.A = None  # never assigned elsewhere in this class; kept for compatibility
        self.inner_activation = inner_activation
        self.inner_dropout = inner_dropout

    def reset_parameters(self):
        """Re-initialise the three MLPs and the mixing layer ``W``."""
        self.mlpA.reset_parameters()
        self.mlpX.reset_parameters()
        self.W.reset_parameters()
        self.mlp_final.reset_parameters()

    def forward(self, batch_data):
        """Compute the output of MLP_3 for every node in ``batch_data``.

        Args:
            batch_data: object exposing ``num_nodes``, ``x`` and ``edge_index``.

        Returns:
            Tensor with one row per node of the batch and ``out_channels`` columns.
        """
        m = batch_data.num_nodes
        row, col = batch_data.edge_index
        # Shift source indices so the smallest one maps to adjacency row 0.
        # An empty edge set has no minimum, so the shift is skipped there.
        if row.numel() > 0:
            row = row - row.min()
        # NOTE(review): columns are interpreted in a space of `self.num_nodes`
        # entries; if the loader relabels node ids per batch (e.g. exposes
        # `n_id`), `col` is batch-local here — confirm this is intended.
        A = SparseTensor(row=row, col=col,
                         sparse_sizes=(m, self.num_nodes)
                         ).to_torch_sparse_coo_tensor()

        xA = self.mlpA(A, input_tensor=True)
        xX = self.mlpX(batch_data.x, input_tensor=True)
        x = torch.cat((xA, xX), axis=-1)
        x = self.W(x)
        if self.inner_dropout:
            # F.dropout defaults to training=True; tie it to the module mode so
            # that evaluation is deterministic.
            x = F.dropout(x, training=self.training)
        if self.inner_activation:
            x = F.relu(x)
        # skip connections from both embeddings
        x = F.relu(x + xA + xX)
        x = self.mlp_final(x, input_tensor=True)

        return x

# Train

In [10]:
def test(model, loader, mask, name='Train'):
    """Evaluate classification accuracy of ``model`` over ``loader``.

    Only the first ``batch_size`` rows of every batch are scored, matching the
    first ``batch_size`` entries of the batch labels.

    Args:
        model: module called as ``model(batch_data)`` and returning one row of
            class scores per node.
        loader: iterable of batches exposing ``to``, ``y`` and ``batch_size``.
        mask: boolean node mask; only used to size the progress bar.
        name: label shown in the progress-bar description.

    Returns:
        float: fraction of correctly classified nodes, or ``0.0`` when the
        loader yields no examples.
    """
    pbar = None
    if args.log_info:
        # tensor-level reduction instead of a Python loop over every mask entry
        pbar = tqdm(total=int(mask.sum()))
        pbar.set_description(f'Evaluating {name}')

    model.eval()

    total_correct = 0
    total_examples = 0

    try:
        with torch.no_grad():
            for batch_data in loader:
                batch_data = batch_data.to(device)
                n_seed = batch_data.batch_size
                out = model(batch_data)[:n_seed, :]
                pred = out.argmax(dim=-1)
                correct = pred.eq(batch_data.y[:n_seed].to(device))

                total_correct += correct.sum()
                total_examples += n_seed

                if pbar is not None:
                    pbar.update(n_seed)
    finally:
        # close the bar even when a batch raises
        if pbar is not None:
            pbar.close()

    if total_examples == 0:
        # an empty loader leaves total_correct as a plain int and would divide by zero
        return 0.0

    return int(total_correct) / total_examples

In [11]:
def train(model, data, epochs, train_neighbors=[8,4], test_neighbors=[8,4], batch_size=1024):
    """Train ``model`` with neighbour-sampled mini-batches and track accuracy.

    Training stops early once, from epoch 5 on, the standard deviation of the
    last five epoch losses is at most ``1e-4``.

    Args:
        model: module called as ``model(batch_data)``.
        data: graph object with ``train_mask``, ``val_mask`` and ``test_mask``.
        epochs: maximum number of epochs.
        train_neighbors: ``num_neighbors`` for the training loader (not mutated).
        test_neighbors: ``num_neighbors`` for the validation/test loaders (not mutated).
        batch_size: number of seed nodes per mini-batch for all three loaders.

    Returns:
        tuple: ``(best_acc, num_iteration)`` — the highest test accuracy seen
        in any epoch and the number of epochs actually run.
    """
    if args.log_info:
        print("Train Neighbors: ", train_neighbors)
        print("Test Neighbors: ", test_neighbors)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-3)
    loader = NeighborLoader(data, input_nodes=data.train_mask, num_neighbors=train_neighbors,
                            batch_size=batch_size, shuffle=True, num_workers=0)
    val_loader = NeighborLoader(data, input_nodes=data.val_mask, num_neighbors=test_neighbors,
                                batch_size=batch_size, shuffle=False, num_workers=0)
    test_loader = NeighborLoader(data, input_nodes=data.test_mask, num_neighbors=test_neighbors,
                                 batch_size=batch_size, shuffle=False, num_workers=0)

    train_losses = []
    best_acc = 0
    num_iteration = epochs
    num_train_nodes = int(data.train_mask.sum())

    for epoch in range(1, epochs+1):

        pbar = None
        if args.log_info:
            pbar = tqdm(total=num_train_nodes)
            pbar.set_description(f'Epoch {epoch:02d}')

        model.train()
        total_loss = total_examples = 0

        for batch_data in loader:
            batch_data = batch_data.to(device)

            # batch summary is diagnostic output, so it follows the logging switch
            if args.log_info:
                print(batch_data)

            optimizer.zero_grad()
            out = model(batch_data)

            # the loss covers every node of the sampled batch flagged as training node
            batch_train_mask = batch_data.train_mask
            loss = F.cross_entropy(out[batch_train_mask], batch_data.y[batch_train_mask])

            loss.backward()
            optimizer.step()

            n_train = int(batch_train_mask.sum())
            total_loss += loss.item() * n_train
            total_examples += n_train

            if pbar is not None:
                # advance by the real number of seed nodes; the last batch may be
                # smaller than the configured batch size
                pbar.update(batch_data.batch_size)

        if pbar is not None:
            pbar.close()

        loss = total_loss / total_examples
        train_losses.append(loss)

        # training accuracy is not evaluated; 0 is a placeholder for the log line
        train_acc = 0
        val_acc = test(model, val_loader, data.val_mask, 'Validation')
        test_acc = test(model, test_loader, data.test_mask, 'Test')

        # NOTE(review): the reported best accuracy is selected on the test split
        # itself rather than at the best validation epoch — confirm intended.
        if test_acc > best_acc:
            best_acc = test_acc

        std_dev = np.std(train_losses[-5:])

        if args.log_info:
            print(f'Epoch: {epoch:03d}, Train Loss: {loss:.4f}, Train: {train_acc:.4f}, Val: {val_acc:.4f}, Test: {test_acc:.4f}, Std dev: {std_dev:.4f}')

        if epoch >= 5 and std_dev <= 1e-4:
            num_iteration = epoch

            if args.log_info:
                print("Iteration for convergence: ", epoch)
            break

    return best_acc, num_iteration

In [12]:
def LINKXperformanceSampler(data, dataset, num_classes, epochs=1, train_neighbors=[8,4], test_neighbors=[8,4]):
    """Build a ``LINKXcustom`` model for ``data`` and train it.

    Args:
        data: graph object providing ``x`` and ``num_nodes``.
        dataset: dataset object; its ``num_classes`` is used only when
            ``num_classes`` is ``None``.
        num_classes: number of output classes of the model.
        epochs: maximum number of training epochs.
        train_neighbors: neighbour sampling sizes forwarded to ``train``.
        test_neighbors: neighbour sampling sizes forwarded to ``train``.

    Returns:
        tuple: ``(best_acc, num_iteration, model)`` as returned by ``train``,
        plus the trained model.
    """
    # Honour the class count supplied by the caller (it may have been
    # recomputed from the labels) instead of always reading dataset.num_classes.
    if num_classes is None:
        num_classes = dataset.num_classes

    model = LINKXcustom(in_channels=data.x.shape[1], hidden_channels=64, out_channels=num_classes,
                        num_layers=1, num_nodes=data.num_nodes).to(device)
    if args.log_info:
        print(model)

    best_acc, num_iteration = train(model, data, epochs, train_neighbors=train_neighbors, test_neighbors=test_neighbors)

    return best_acc, num_iteration, model

In [19]:
# Single verbose run on the 'karate' dataset.
args.log_info = True
DATASET_NAME = 'karate'

data, dataset = get_data(DATASET_NAME, DIR=None, log=False, h_score=True, split_no=0)
print("")
print(data)

best_acc, num_iteration, _ = LINKXperformanceSampler(
    data,
    dataset,
    dataset.num_classes,
    epochs=10,
    train_neighbors=[8,4],
    test_neighbors=[8,4],
)
print(best_acc, num_iteration)

N  34  E  156  d  4.588235294117647 0.8020520210266113 0.7564102411270142 0.6170591711997986 -0.4756128787994385 
Data(x=[34, 34], edge_index=[2, 156], y=[34], train_mask=[34], val_mask=[34], test_mask=[34])
LINKXcustom(
  (mlpA): MLP(
    (lins): ModuleList(
      (0): Linear(in_features=34, out_features=64, bias=True)
    )
    (bns): ModuleList()
  )
  (mlpX): MLP(
    (lins): ModuleList(
      (0): Linear(in_features=34, out_features=64, bias=True)
    )
    (bns): ModuleList()
  )
  (W): Linear(in_features=128, out_features=64, bias=True)
  (mlp_final): MLP(
    (lins): ModuleList(
      (0): Linear(in_features=64, out_features=4, bias=True)
    )
    (bns): ModuleList()
  )
)
Train Neighbors:  [8, 4]
Test Neighbors:  [8, 4]


Epoch 01: : 1024it [00:00, 168522.61it/s]      


Data(x=[24, 34], edge_index=[2, 74], y=[24], train_mask=[24], val_mask=[24], test_mask=[24], n_id=[24], e_id=[74], num_sampled_nodes=[3], num_sampled_edges=[2], input_id=[4], batch_size=4)


Evaluating Validation: 100%|██████████| 30/30 [00:00<00:00, 13786.47it/s]
Evaluating Test: 100%|██████████| 30/30 [00:00<00:00, 14543.36it/s]


Epoch: 001, Train Loss: 1.4029, Train: 0.0000, Val: 0.2667, Test: 0.3333, Std dev: 0.0000


Epoch 02: : 1024it [00:00, 258608.34it/s]      


Data(x=[26, 34], edge_index=[2, 66], y=[26], train_mask=[26], val_mask=[26], test_mask=[26], n_id=[26], e_id=[66], num_sampled_nodes=[3], num_sampled_edges=[2], input_id=[4], batch_size=4)


Evaluating Validation: 100%|██████████| 30/30 [00:00<00:00, 15147.36it/s]
Evaluating Test: 100%|██████████| 30/30 [00:00<00:00, 15194.92it/s]


Epoch: 002, Train Loss: 1.2962, Train: 0.0000, Val: 0.3333, Test: 0.3000, Std dev: 0.0534


Epoch 03: : 1024it [00:00, 249910.82it/s]      


Data(x=[28, 34], edge_index=[2, 65], y=[28], train_mask=[28], val_mask=[28], test_mask=[28], n_id=[28], e_id=[65], num_sampled_nodes=[3], num_sampled_edges=[2], input_id=[4], batch_size=4)


Evaluating Validation: 100%|██████████| 30/30 [00:00<00:00, 14836.59it/s]
Evaluating Test: 100%|██████████| 30/30 [00:00<00:00, 15218.81it/s]


Epoch: 003, Train Loss: 1.2574, Train: 0.0000, Val: 0.2333, Test: 0.2667, Std dev: 0.0616


Epoch 04: : 1024it [00:00, 264338.21it/s]      


Data(x=[29, 34], edge_index=[2, 64], y=[29], train_mask=[29], val_mask=[29], test_mask=[29], n_id=[29], e_id=[64], num_sampled_nodes=[3], num_sampled_edges=[2], input_id=[4], batch_size=4)


Evaluating Validation: 100%|██████████| 30/30 [00:00<00:00, 14632.99it/s]
Evaluating Test: 100%|██████████| 30/30 [00:00<00:00, 15167.44it/s]


Epoch: 004, Train Loss: 1.0965, Train: 0.0000, Val: 0.2667, Test: 0.2667, Std dev: 0.1100


Epoch 05: : 1024it [00:00, 260190.66it/s]      


Data(x=[24, 34], edge_index=[2, 69], y=[24], train_mask=[24], val_mask=[24], test_mask=[24], n_id=[24], e_id=[69], num_sampled_nodes=[3], num_sampled_edges=[2], input_id=[4], batch_size=4)


Evaluating Validation: 100%|██████████| 30/30 [00:00<00:00, 14689.37it/s]
Evaluating Test: 100%|██████████| 30/30 [00:00<00:00, 15261.26it/s]


Epoch: 005, Train Loss: 0.9062, Train: 0.0000, Val: 0.2000, Test: 0.1667, Std dev: 0.1734


Epoch 06: : 1024it [00:00, 259201.41it/s]      


Data(x=[25, 34], edge_index=[2, 74], y=[25], train_mask=[25], val_mask=[25], test_mask=[25], n_id=[25], e_id=[74], num_sampled_nodes=[3], num_sampled_edges=[2], input_id=[4], batch_size=4)


Evaluating Validation: 100%|██████████| 30/30 [00:00<00:00, 14597.35it/s]
Evaluating Test: 100%|██████████| 30/30 [00:00<00:00, 15239.08it/s]


Epoch: 006, Train Loss: 0.8546, Train: 0.0000, Val: 0.1333, Test: 0.1333, Std dev: 0.1786


Epoch 07: : 1024it [00:00, 258219.64it/s]      


Data(x=[27, 34], edge_index=[2, 65], y=[27], train_mask=[27], val_mask=[27], test_mask=[27], n_id=[27], e_id=[65], num_sampled_nodes=[3], num_sampled_edges=[2], input_id=[4], batch_size=4)


Evaluating Validation: 100%|██████████| 30/30 [00:00<00:00, 14789.51it/s]
Evaluating Test: 100%|██████████| 30/30 [00:00<00:00, 15324.46it/s]


Epoch: 007, Train Loss: 0.6584, Train: 0.0000, Val: 0.1333, Test: 0.1333, Std dev: 0.2058


Epoch 08: : 1024it [00:00, 257122.08it/s]      


Data(x=[28, 34], edge_index=[2, 73], y=[28], train_mask=[28], val_mask=[28], test_mask=[28], n_id=[28], e_id=[73], num_sampled_nodes=[3], num_sampled_edges=[2], input_id=[4], batch_size=4)


Evaluating Validation: 100%|██████████| 30/30 [00:00<00:00, 14540.00it/s]
Evaluating Test: 100%|██████████| 30/30 [00:00<00:00, 15094.66it/s]


Epoch: 008, Train Loss: 0.5849, Train: 0.0000, Val: 0.2000, Test: 0.1667, Std dev: 0.1825


Epoch 09: : 1024it [00:00, 258235.17it/s]      


Data(x=[28, 34], edge_index=[2, 71], y=[28], train_mask=[28], val_mask=[28], test_mask=[28], n_id=[28], e_id=[71], num_sampled_nodes=[3], num_sampled_edges=[2], input_id=[4], batch_size=4)


Evaluating Validation: 100%|██████████| 30/30 [00:00<00:00, 14672.24it/s]
Evaluating Test: 100%|██████████| 30/30 [00:00<00:00, 15298.37it/s]


Epoch: 009, Train Loss: 0.6046, Train: 0.0000, Val: 0.1667, Test: 0.1667, Std dev: 0.1328


Epoch 10: : 1024it [00:00, 260759.35it/s]      


Data(x=[28, 34], edge_index=[2, 68], y=[28], train_mask=[28], val_mask=[28], test_mask=[28], n_id=[28], e_id=[68], num_sampled_nodes=[3], num_sampled_edges=[2], input_id=[4], batch_size=4)


Evaluating Validation: 100%|██████████| 30/30 [00:00<00:00, 14815.63it/s]
Evaluating Test: 100%|██████████| 30/30 [00:00<00:00, 14643.21it/s]

Epoch: 010, Train Loss: 0.3749, Train: 0.0000, Val: 0.1667, Test: 0.2000, Std dev: 0.1536
0.3333333333333333 10





In [14]:
# import time
# from torch_geometric.loader import ClusterData, ClusterLoader, NeighborSampler

In [15]:
# sampler_dir = DIR+'ClusterGCNtest/'+DATASET_NAME
# if not os.path.exists(sampler_dir):
#     os.makedirs(sampler_dir)

# num_parts=2

# start_time = time.time()
# cluster_data = ClusterData(data, num_parts=num_parts, recursive=False,save_dir=sampler_dir)
# train_loader = ClusterLoader(cluster_data, batch_size=1, shuffle=True,num_workers=0)
# subgraph_loader = NeighborSampler(data.edge_index, sizes=[-1], batch_size=1024,shuffle=False, num_workers=0)
# end_time = time.time()

# print(cluster_data)

# Batch Experiments

In [16]:
def batch_experiments(num_run=1):
    """Run the LINKX experiment on a fixed list of datasets and log the results.

    For every dataset, ``num_run`` splits are loaded and trained; the mean and
    standard deviation of accuracy and of the number of epochs are printed and
    appended as one line to ``Results/LINKX.txt``.

    Side effects:
        Sets the global ``args.log_info`` to ``False`` (it is not restored).

    Args:
        num_run: number of splits/runs per dataset; must be at least 1.

    Raises:
        ValueError: if ``num_run`` is smaller than 1.
    """
    if num_run < 1:
        # averaging zero runs yields NaN, which cannot be formatted as an int
        raise ValueError("num_run must be at least 1")

    ALL_DATASETs= [
#         "Roman-empire","Texas","Squirrel","Chameleon",
#         "Cornell","Actor","Wisconsin","Flickr","Amazon-ratings","reed98","amherst41","genius",
        "AmazonProducts",
#         "cornell5","penn94","johnshopkins55",
        "Yelp",
#         "cora","Tolokers","Minesweeper",
#         "CiteSeer","Computers","PubMed","pubmed",
        "Reddit",
#         "cora_ml","dblp",
        "Reddit2",
#         "Cora","CS","Photo","Questions","Physics","citeseer",
    ]

#     ALL_DATASETs= ["karate"]

    args.log_info = False

    # mode 'a+' creates the file but not its parent directory
    os.makedirs("Results", exist_ok=True)

    for DATASET_NAME in ALL_DATASETs:
        print(DATASET_NAME, end=' ')

        # context manager closes the file even if a run raises
        with open("Results/LINKX.txt", 'a+') as result_file:
            result_file.write(f'{DATASET_NAME} ')

            accs = []
            itrs = []

            for i in range(num_run):
                data, dataset = get_data(DATASET_NAME, DIR=None, log=False, h_score=False, split_no=i)

                if len(data.y.shape) > 1:
                    # 2-D labels are collapsed to one class index per node
                    data.y = data.y.argmax(dim=1)

                # the class count always follows the largest label present
                num_classes = int(data.y.max()) + 1

                max_epochs = 150 if data.num_nodes < 100000 else 50

                accuracy, itr, _ = LINKXperformanceSampler(data, dataset, num_classes, epochs=max_epochs, train_neighbors=[8,4], test_neighbors=[8,4])

                accs.append(accuracy)
                itrs.append(itr)

            summary = f'acc {np.mean(accs):0.4f} sd {np.std(accs):0.4f} itr {int(np.mean(itrs)):d} sd {int(np.std(itrs)):d}'
            print(summary)
            result_file.write(summary + '\n')
                
# batch_experiments(num_run=5)