In [18]:
import os
import sys
# Make the sibling `Submodular` directory importable when the notebook is started from
# another working directory. The membership check keeps the cell idempotent: re-running
# it no longer appends a duplicate entry to sys.path each time.
if not os.getcwd().endswith("Submodular") and '../Submodular' not in sys.path:
    sys.path.append('../Submodular')

In [19]:
import DeviceDir

# `DeviceDir` is a project-local helper. `DIR` is used further down as the prefix under
# which cached synthetic datasets are stored (see `ablation`); `RESULTS_DIR` is not
# referenced in the visible cells — presumably an output directory, TODO confirm.
DIR, RESULTS_DIR = DeviceDir.get_directory()
# `device` is the target handed to `.to(device)` for models and data in the functions
# below; `NUM_PROCESSORS` is not referenced in the visible cells.
device, NUM_PROCESSORS = DeviceDir.get_device()

In [20]:
from ipynb.fs.full.Dataset import get_data
from ipynb.fs.full.Dataset import datasets as available_datasets
from ipynb.fs.full.Utils import save_plot

In [21]:
import argparse
from argparse import ArgumentParser

#set default arguments here
def get_configuration():
    """Parse the command-line options of the experiment.

    Returns:
        tuple: ``(args, dict_args)`` where ``args`` is the ``argparse.Namespace`` and
        ``dict_args`` is the same namespace exposed as a dict via ``vars(args)``.

    Raises:
        SystemExit: raised by ``argparse`` when an option is unknown or a value is
        invalid (including a boolean value that cannot be interpreted).
    """

    def _str2bool(value):
        # `type=bool` would call bool("False"), which is True for every non-empty
        # string, so `--log_info False` could never disable logging. Parse the text.
        if isinstance(value, bool):
            return value
        text = value.strip().lower()
        if text in ('true', 't', 'yes', 'y', '1'):
            return True
        if text in ('false', 'f', 'no', 'n', '0'):
            return False
        raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')

    parser = ArgumentParser()
    parser.add_argument('--epochs', type=int, default=1)
    parser.add_argument('--log_info', type=_str2bool, default=True)
    parser.add_argument('--pbar', type=_str2bool, default=False)
    parser.add_argument('--batch_size', type=int, default=2048)
    parser.add_argument('--learning_rate', type=float, default=0.01)
    parser.add_argument('--num_gpus', type=int, default=-1)
    parser.add_argument('--parallel_mode', type=str, default="dp", choices=['dp', 'ddp', 'ddp2'])
    parser.add_argument('--dataset', type=str, default="Cora", choices=available_datasets)
    #parser.add_argument('--use_normalization', action='store_false', default=True)
    parser.add_argument('--use_normalization', action='store_true')
    # Jupyter launches the kernel with `-f <connection file>`; accept and ignore it.
    parser.add_argument('-f') ##dummy for jupyternotebook

    args = parser.parse_args()

    dict_args = vars(args)

    return args, dict_args

# Parse the configuration once. `args` is then read (and, for `log_info`, overwritten)
# as a module-level global by the functions below; `dict_args` is `vars(args)`.
args, dict_args = get_configuration()

In [22]:
import os.path as osp
import torch
import torch.nn.functional as F
from torch_geometric.datasets import LINKXDataset
from torch_geometric.nn import LINKX
import numpy as np
from tqdm import tqdm
from torch_geometric.loader import NeighborSampler, NeighborLoader

In [23]:
def train(model, data, epochs):
    """Train `model` full-batch on `data` and track accuracy on every split.

    Each epoch performs one Adam step (lr=0.01, weight_decay=1e-3) on the cross-entropy
    of the nodes selected by `data.train_mask`, then re-evaluates the model on the
    train/val/test masks. Reads the module-level `device` and `args.log_info`.

    Args:
        model: module called as `model(data.x, data.edge_index)`.
        data: graph object providing `x`, `edge_index`, `y` and the three split masks.
        epochs: number of epochs to run.

    Returns:
        tuple: `(best_acc, num_iteration)` — the highest *test* accuracy observed in any
        epoch, and the number of epochs run. Early stopping on the standard deviation of
        the recent losses is currently disabled, so this is always `epochs`.
    """
    data = data.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-3)

    loss_history = []
    best_acc = 0

    for epoch in range(1, epochs + 1):
        # --- one optimisation step on the training nodes ---
        model.train()
        optimizer.zero_grad()
        logits = model(data.x, data.edge_index)
        train_nodes = data.train_mask
        loss = F.cross_entropy(logits[train_nodes], data.y[train_nodes])
        loss.backward()
        optimizer.step()
        loss_history.append(loss.item())

        # --- accuracy on train / val / test, in that order ---
        model.eval()
        with torch.no_grad():
            pred = model(data.x, data.edge_index).argmax(dim=-1)
            accs = [
                int((pred[split] == data.y[split]).sum()) / int(split.sum())
                for _, split in data('train_mask', 'val_mask', 'test_mask')
            ]
        train_acc, val_acc, test_acc = accs[0], accs[1], accs[2]

        # NOTE(review): the reported best is selected on the test split itself.
        best_acc = max(best_acc, test_acc)

        if args.log_info:
            # Spread of the last five losses; only used for the log line.
            std_dev = np.std(loss_history[-5:])
            print(f'Epoch: {epoch:03d}, Train Loss: {loss:.4f}, Train: {train_acc:.4f}, Val: {val_acc:.4f}, Test: {test_acc:.4f}, Std dev: {std_dev:.4f}')

    return best_acc, epochs

In [24]:
def test(model, loader, mask, name='Train'):
    """Compute the accuracy of `model` over the seed nodes of a mini-batch loader.

    For every batch only the first `batch.batch_size` rows of the output and of `batch.y`
    are scored. Reads the module-level `device` and `args.log_info`.

    Args:
        model: module called as `model(x, edge_index, edge_weight)`.
        loader: iterable of batches exposing `x`, `edge_index`, `y` and `batch_size`.
        mask: boolean tensor whose number of set entries is the progress-bar total.
        name: label shown in the progress-bar description.

    Returns:
        float: correct predictions divided by scored nodes, or ``0.0`` when the loader
        yields no batches (previously this case raised ``AttributeError``).
    """
    pbar = None
    if args.log_info:
        # mask.sum() reduces on the tensor instead of iterating it element by element.
        pbar = tqdm(total=int(mask.sum()))
        pbar.set_description(f'Evaluating {name}')

    model.eval()

    total_correct = 0
    total_examples = 0

    try:
        with torch.no_grad():
            for batch_data in loader:
                batch_data = batch_data.to(device)
                # Falls back to None when the batch carries no `edge_weight` attribute.
                edge_weight = getattr(batch_data, 'edge_weight', None)
                out = model(batch_data.x, batch_data.edge_index, edge_weight)
                out = out[:batch_data.batch_size, :]
                pred = out.argmax(dim=-1)
                correct = pred.eq(batch_data.y[:batch_data.batch_size].to(device))

                total_correct += correct.sum()
                total_examples += batch_data.batch_size

                if pbar is not None:
                    pbar.update(batch_data.batch_size)
    finally:
        # Close the bar even if a batch raises, so no stale bar is left behind.
        if pbar is not None:
            pbar.close()

    if total_examples == 0:
        return 0.0

    # int() accepts both the initial Python 0 and the accumulated 0-dim tensor.
    return int(total_correct) / total_examples

In [25]:
def LINKXperformanceSampler(data, dataset, num_classes, epochs=1, train_neighbors=[8,4], test_neighbors=[8,4]):
    """Build a single-layer LINKX model for `data`, train it and report the result.

    Args:
        data: graph object; `num_nodes` and `num_features` size the model.
        dataset: not used by this function.
        num_classes: number of output channels.
        epochs: number of epochs forwarded to `train`.
        train_neighbors: not used by this function (training is full-batch).
        test_neighbors: not used by this function.

    Returns:
        tuple: `(best_acc, num_iteration, model)` — the two values returned by `train`
        followed by the trained model.
    """
    linkx_options = dict(
        hidden_channels=128,
        out_channels=num_classes,
        num_layers=1,
        num_edge_layers=1,
        num_node_layers=1,
        dropout=0.5,
    )
    model = LINKX(data.num_nodes, data.num_features, **linkx_options)
    model = model.to(device)

    if args.log_info:
        print(model)

    best_acc, num_iteration = train(model, data, epochs)
    return best_acc, num_iteration, model

In [26]:
# Method identifiers; the list is not referenced by the other visible cells.
methods = [
    'link', 'gcn', 'mlp', 'cs',
    'sgc', 'gprgnn', 'appnp', 'gat',
    'lp', 'mixhop', 'gcnjk', 'gatjk',
    'h2gcn', 'link_concat', 'linkx', 'gcn2',
]

#others = ['gsage','gsaint','acmgcn','clustergcn','gcn','gin','gat','linkx']

In [27]:
# args.log_info = True
# DATASET_NAME = 'karate'
# data, dataset = get_data(DATASET_NAME, DIR=None, log=False, h_score=True, split_no=0); print("")
# print(data)
# best_acc, num_iteration, _ = LINKXperformanceSampler(data, dataset, dataset.num_classes, epochs=10, train_neighbors=[8,4], test_neighbors=[8,4])
# print(best_acc, num_iteration)

# Batch Experiments

In [28]:
def batch_experiments(num_run=1):
    """Train LINKX on every benchmark dataset and append one summary line per dataset.

    For each dataset the model is trained `num_run` times (run `i` requests
    `split_no=i` from `get_data`); the mean/std of the best accuracies and of the epoch
    counts are printed and appended to ``Results/LINKX.txt``.

    Side effects: sets the module-level ``args.log_info`` to ``False`` (it is not
    restored), creates the ``Results`` directory if missing, and appends to the results
    file.

    Args:
        num_run: number of repetitions per dataset.

    Returns:
        None.
    """
    ALL_DATASETs= [
        "Cornell",
        "Texas",
        "Wisconsin",
        "reed98",
        "amherst41",
        "penn94",
        "Roman-empire",
        "cornell5",
        "Squirrel",
        "johnshopkins55",
        "AmazonProducts",
        "Actor",
        "Minesweeper",
        "Questions",
        "Chameleon",
        "Tolokers",
        "Flickr",
        "Yelp",
        "Amazon-ratings",
        "genius",
        "cora",
        "CiteSeer",
        "dblp",
        "Computers",
        "PubMed",
        "pubmed",
        "Reddit",
        "cora_ml",
        "Cora",
        "Reddit2",
        "CS",
        "Photo",
        "Physics",
        "citeseer"
    ]

#     ALL_DATASETs= ["karate"]

    results_path = "Results/LINKX.txt"
    # Opening in append mode fails if the folder is missing; create it up front.
    os.makedirs(os.path.dirname(results_path), exist_ok=True)

    args.log_info = False

    for DATASET_NAME in ALL_DATASETs:
        print(DATASET_NAME, end=' ')

        accs = []
        itrs = []

        for i in range(num_run):
            data, dataset = get_data(DATASET_NAME, DIR=None, log=False, h_score=False, split_no=i)

            # Graphs above 100k nodes are skipped: record the -1 sentinel once and stop.
            if data.num_nodes>100000:
                accs.append(-1)
                itrs.append(-1)
                break

            # Two-dimensional labels are collapsed to class indices via argmax.
            if len(data.y.shape) > 1:
                data.y = data.y.argmax(dim=1)
                num_classes = torch.max(data.y).item()+1
            else:
                num_classes = dataset.num_classes

            # Trust the labels over the dataset metadata when the two disagree.
            if num_classes!= torch.max(data.y)+1:
                num_classes = torch.max(data.y).item()+1

            # Because of the skip above, the else branch only applies to a graph with
            # exactly 100000 nodes.
            if data.num_nodes<100000:
                max_epochs = 150
            else:
                max_epochs = 50

            accuracy, itr, _ = LINKXperformanceSampler(data, dataset, num_classes, epochs=max_epochs, train_neighbors=[8,4], test_neighbors=[8,4])

            accs.append(accuracy)
            itrs.append(itr)

        summary = f'acc {np.mean(accs):0.4f} sd {np.std(accs):0.4f} itr {int(np.mean(itrs)):d} sd {int(np.std(itrs)):d}'
        print(summary)

        # Write the complete line in one go inside a context manager: the handle is
        # always closed, and a failed run no longer leaves a dangling dataset name.
        with open(results_path, 'a+') as result_file:
            result_file.write(f'{DATASET_NAME} {summary}\n')
                
# batch_experiments(num_run=5)

In [29]:
import time
from ipynb.fs.full.Dataset import generate_synthetic2homophily
import torch_geometric.utils.homophily as homophily
import torch_geometric

In [30]:
def ablation(num_run = 1):
    """Run the LINKX ablation on a synthetic two-homophily graph and log the accuracy.

    For every dataset in ``ALL_DATASETs`` the function loads the cached synthetic graph
    (or generates and caches it with ``generate_synthetic2homophily``), makes the edge
    list symmetric, prints homophily statistics, trains LINKX and finally appends one
    summary line to ``Results/LINKX-GCNablation.txt``.

    Side effects: sets the module-level ``args.log_info`` to ``False`` (not restored),
    rebinds the module-level ``data_filename_extension``, reads/writes the dataset cache
    below ``DIR`` and appends to the results file.

    Args:
        num_run: number of repetitions; run ``i`` requests ``split_no=i`` from
            ``get_data``.

    Returns:
        None.
    """
    global data_filename_extension

    # Other candidates used previously: "Tolokers", "Computers", "Photo".
    ALL_DATASETs= ["Cora"]

    args.log_info = False

    filename = "Results/LINKX-GCNablation.txt"
    # Opening in append mode fails if the folder is missing; create it up front.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    # Parameters of the synthetic graph. `train_ratio` was a local called `train`,
    # which shadowed the module-level `train` function inside this body.
    random_state = 10
    d = 100
    h = 0.05
    train_ratio = 0.3
    balance = True
    h2 = 0.25
    ratio = 0.50

    # Cache key built by concatenating the parameters in this fixed order.
    cache_key = str(d)+str(h)+str(h2)+str(ratio)+str(train_ratio)+str(random_state)+str(balance)

    for DATASET_NAME in ALL_DATASETs:

        print(DATASET_NAME,"-",random_state, end=' ')

        data_filename_extension = cache_key+'.weight'
        data_filename = DIR+'AGSGNNstruc/'+DATASET_NAME+cache_key

        accs = []
        itrs = []

        for i in range(num_run):
            # `dataset` supplies num_classes below; `data` is replaced by the cached or
            # freshly generated synthetic graph, so every run sees the same graph.
            data, dataset = get_data(DATASET_NAME, DIR=None, log=False, h_score=False, split_no=i)

            if os.path.exists(data_filename):
                # NOTE(review): torch.load unpickles arbitrary objects; only load cache
                # files that this notebook wrote itself.
                data = torch.load(data_filename)
                print("loaded "+data_filename)
            else:
                data = generate_synthetic2homophily(data, d=d, h1=h, h2=h2, ratio=ratio, train=train_ratio, random_state=random_state, log=False, balance=balance)
                os.makedirs(os.path.dirname(data_filename), exist_ok=True)
                torch.save(data,data_filename)
                print("saved "+data_filename)

            # Make the graph undirected: add every reversed edge, then merge duplicates.
            (row, col) = data.edge_index
            data.edge_index = torch.stack((torch.cat((row, col),dim=0),torch.cat((col, row),dim=0)),dim=0)
            data.edge_index = torch_geometric.utils.coalesce(data.edge_index)

            print("Node Homophily:", homophily(data.edge_index, data.y, method='node'))
            print("Edge Homophily:", homophily(data.edge_index, data.y, method='edge'))
            print("Edge_insensitive Homophily:", homophily(data.edge_index, data.y, method='edge_insensitive'))
            print("Degree: ", data.num_edges / data.num_nodes)

            # Two-dimensional labels are collapsed to class indices via argmax.
            if len(data.y.shape) > 1:
                data.y = data.y.argmax(dim=1)
                num_classes = torch.max(data.y).item()+1
            else:
                num_classes = dataset.num_classes

            # Trust the labels over the dataset metadata when the two disagree.
            if num_classes!= torch.max(data.y)+1:
                num_classes = torch.max(data.y).item()+1

            if data.num_nodes<100000:
                max_epochs = 250
            else:
                max_epochs = 20

            if DATASET_NAME in ['Squirrel', 'Chameleon','cornell5','penn94','johnshopkins55','amherst41']:
                # NOTE(review): `adj_feature` is not defined in the visible cells; this
                # branch is not reached with the current ALL_DATASETs.
                data.x = torch.cat((data.x, adj_feature(data)), dim=1)
                if args.log_info:
                    print(data.x.shape)

            accuracy, itr, _ = LINKXperformanceSampler(data, dataset, num_classes, epochs=max_epochs, train_neighbors=[8,4], test_neighbors=[8,4])

            accs.append(accuracy)
            itrs.append(itr)

        print(accs, itrs)

        # One summary string for both console and file. The file line used to scale the
        # std-dev by 10 while the console used 100; both now report percent. `\\pm`
        # emits the same literal `\pm` without relying on an invalid escape sequence.
        summary = f'acc {np.mean(accs)*100:0.4f} \\pm {np.std(accs)*100:0.4f} itr {int(np.mean(itrs)):d} sd {int(np.std(itrs)):d}'
        print(summary)

        # Write the complete line in one go inside a context manager: the handle is
        # always closed, and a failed run no longer leaves a dangling dataset name.
        with open(filename,'a+') as result_file:
            result_file.write(f'{DATASET_NAME} {summary}\n')

    return

# Run the ablation with five repetitions and measure it with wall-clock timestamps.
st_time = time.time()
ablation(num_run=5)
en_time = time.time()

# Elapsed time in seconds (difference of two time.time() readings).
print("Runtime: ", en_time-st_time)

Cora - 10 loaded /scratch/gilbreth/das90/Dataset/AGSGNNstruc/Cora1000.050.250.50.310True
Node Homophily: 0.14723201096057892
Edge Homophily: 0.14734117686748505
Edge_insensitive Homophily: 0.005595000926405191
Degree:  191.96349206349205
loaded /scratch/gilbreth/das90/Dataset/AGSGNNstruc/Cora1000.050.250.50.310True
Node Homophily: 0.14723201096057892
Edge Homophily: 0.14734117686748505
Edge_insensitive Homophily: 0.005595000926405191
Degree:  191.96349206349205
loaded /scratch/gilbreth/das90/Dataset/AGSGNNstruc/Cora1000.050.250.50.310True
Node Homophily: 0.14723201096057892
Edge Homophily: 0.14734117686748505
Edge_insensitive Homophily: 0.005595000926405191
Degree:  191.96349206349205
loaded /scratch/gilbreth/das90/Dataset/AGSGNNstruc/Cora1000.050.250.50.310True
Node Homophily: 0.14723201096057892
Edge Homophily: 0.14734117686748505
Edge_insensitive Homophily: 0.005595000926405191
Degree:  191.96349206349205
loaded /scratch/gilbreth/das90/Dataset/AGSGNNstruc/Cora1000.050.250.50.310True