In [53]:
import os
import sys
# When the current working directory's path does not end in "Submodular",
# add the relative path '../Submodular' to the module search path so the
# imports in the following cells can be resolved from there.
if not os.getcwd().endswith("Submodular"):
    sys.path.append('../Submodular')

In [54]:
import DeviceDir

# Machine-specific settings supplied by the project's DeviceDir module.
# DIR is later used as the prefix of the cached synthetic-graph path and
# `device` is the target passed to `.to(device)` for the model and tensors.
# NOTE(review): RESULTS_DIR and NUM_PROCESSORS are not used in the visible cells.
DIR, RESULTS_DIR = DeviceDir.get_directory()
device, NUM_PROCESSORS = DeviceDir.get_device()

In [55]:
from ipynb.fs.full.Dataset import get_data
from ipynb.fs.full.Dataset import datasets as available_datasets
from ipynb.fs.full.Utils import save_plot

In [56]:
import argparse
from argparse import ArgumentParser

#set default arguments here
def get_configuration():
    """Parse the notebook's command-line options (defaults are set here).

    Boolean options (``--log_info``, ``--pbar``) accept explicit values such
    as ``true``/``false``, ``yes``/``no`` or ``1``/``0``.  They previously used
    ``type=bool``, which turns every non-empty string (including ``"False"``)
    into ``True``.

    ``--use_normalization`` is a ``store_false`` flag: it defaults to ``True``
    and passing the flag switches it off.

    Returns:
        tuple: ``(args, dict_args)`` where ``args`` is the parsed
        ``argparse.Namespace`` and ``dict_args`` is ``vars(args)``.
    """

    def _str2bool(value):
        """Convert a command-line string into a bool, rejecting unknown text."""
        if isinstance(value, bool):
            return value
        text = value.strip().lower()
        if text in ('true', 't', 'yes', 'y', '1'):
            return True
        # The empty string stays falsy, as it was under ``type=bool``.
        if text in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')

    parser = ArgumentParser()
    parser.add_argument('--epochs', type=int, default=1)
    parser.add_argument('--log_info', type=_str2bool, default=True)
    parser.add_argument('--pbar', type=_str2bool, default=False)
    parser.add_argument('--batch_size', type=int, default=2048)
    parser.add_argument('--learning_rate', type=float, default=0.01)
    parser.add_argument('--num_gpus', type=int, default=-1)
    parser.add_argument('--parallel_mode', type=str, default="dp", choices=['dp', 'ddp', 'ddp2'])
    parser.add_argument('--dataset', type=str, default="Cora", choices=available_datasets)
    parser.add_argument('--use_normalization', action='store_false', default=True)
    parser.add_argument('-f')  # dummy option so the `-f <file>` passed by Jupyter is accepted
    args = parser.parse_args()

    dict_args = vars(args)

    return args, dict_args

# Parse the options once at import time; `args` is read (and `args.log_info`
# is overwritten) by the training and experiment functions defined below.
args, dict_args = get_configuration()

In [57]:
import multiprocessing
import pandas as pd
import os
from tqdm import tqdm
import torch
import argparse
from argparse import ArgumentParser
from pathlib import Path
import torch.nn.functional as F
import torch_geometric

In [58]:
import random
import numpy as np
import torch

# Single fixed seed shared by every random number generator seeded below.
seed = 123

# Seed Python's `random` module, NumPy's global RNG and PyTorch.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
# Bare expression as the last line of the cell — presumably so the notebook
# does not echo the return value of `torch.manual_seed`.
None

# Dataset

In [59]:
import torch.nn as nn
import numpy as np
from torch.nn import init
from random import shuffle, randint
import torch.nn.functional as F
from torch_geometric.datasets import Reddit, PPI, Planetoid
from itertools import combinations, combinations_with_replacement
from sklearn.metrics import f1_score, accuracy_score
from sklearn.decomposition import TruncatedSVD
#from torch_geometric.data import NeighborSampler
import matplotlib.pyplot as plt
import sys
from torch_geometric.data import Data
import logging
import time
import copy
import math
from torch_sparse import SparseTensor

from torch_geometric.loader import NeighborSampler, NeighborLoader
from ipynb.fs.full.AGSNodeSampler import WeightedNeighborLoader
from torch_geometric.utils import degree

In [60]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

## GNN model

In [61]:
from torch_geometric.nn import GCNConv, GATConv, SAGEConv, GINConv
from torch_geometric.nn import GraphConv, TransformerConv
from torch_geometric.utils import degree
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from ipynb.fs.full.SpatialConv import SpatialConv

In [62]:
# test = AGS_GCN(2, 2)
# #print(test)
# n=7
# x = torch.Tensor([[1,0],[1,0],[1,0],[0,1],[0,1],[0,1],[0,1]])
# y = torch.LongTensor([0,0,0, 1, 1, 1, 1])
# edge_index = torch.LongTensor([[1,2],[1,4],[1,5],[2,1],[3,6],[3,7],[4,5],[4,1],[4,6],[4,7],[5,1],[5,4],[5,6],[6,3],[6,4],[6,5],[6,7],[7,3],[7,4],[7,6]]).T
# edge_index = edge_index-1
# test(x,edge_index)

In [63]:
# import layers
# import scipy.sparse as sp
# from ipynb.fs.full.ACM.models.Test import GCN, normalize_tensor, sparse_mx_to_torch_sparse_tensor
# from torch_geometric.utils.convert import to_scipy_sparse_matrix

In [64]:
# test2  = GCN(nfeat=2,
#         nhid=2,
#         nclass=2,
#         nlayers=2,
#         nnodes=7,
#         dropout=0.2,
#         model_type='acmgcn',
#         structure_info=0,
#         variant=False,
#         init_layers_X=1,)

# test2 = test2.to(device)
# test2

In [65]:
#import ACM.modelgeom.layers as layers
from ACM.modelgeom.models import GCN
import scipy.sparse as sp
from torch_geometric.utils.convert import to_scipy_sparse_matrix
from ipynb.fs.full.ACM.models.Test import normalize_tensor, sparse_mx_to_torch_sparse_tensor

In [66]:
# test2  = GCN(nfeat=2,
#         nhid=2,
#         nclass=2,
#         nlayers=2,
#         nnodes=7,
#         dropout=0.2,
#         model_type='acmgcn',
#         structure_info=0,
#         variant=False,)

# test2 = test2.to(device)
# test2

In [67]:
# adj_low_unnormalized = to_scipy_sparse_matrix(edge_index)
# adj_low = normalize_tensor(sp.identity(n) + adj_low_unnormalized)
# adj_high = sp.identity(n) - adj_low
# adj_low = sparse_mx_to_torch_sparse_tensor(adj_low).to(device)
# adj_high = sparse_mx_to_torch_sparse_tensor(adj_high).to(device)
# adj_low_unnormalized = sparse_mx_to_torch_sparse_tensor(adj_low_unnormalized).to(device)

# test2(x.to(device), adj_low, adj_high, adj_low_unnormalized)

In [68]:
def ACMtest(model, data, mask, x, adj_low, adj_high, adj_low_unnormalized):
    """Evaluate ``model`` on the nodes selected by ``mask`` and return accuracy.

    Args:
        model: callable module invoked as
            ``model(x, adj_low, adj_high, adj_low_unnormalized)``; its output is
            indexed by node along dim 0 and by class along dim 1.
        data: object exposing the ground-truth labels as ``data.y``.
        mask: node selector used to index both the model output and ``data.y``
            (a boolean mask or a tensor of node indices).
        x, adj_low, adj_high, adj_low_unnormalized: forwarded to ``model``
            unchanged.

    Returns:
        A 0-dim tensor holding ``correct / evaluated``.  It is NaN when the
        selection is empty.
    """
    model.eval()

    with torch.no_grad():
        out = model(x, adj_low, adj_high, adj_low_unnormalized)
        out = F.log_softmax(out, dim=1)
        pred = out[mask].argmax(dim=1)

        # Compare on whatever device the predictions live on, instead of
        # relying on a notebook-level `device` global.
        correct = pred.eq(data.y[mask].to(pred.device))
        total_correct = correct.sum()

        # Number of evaluated nodes.  Taking it from `pred` avoids the
        # element-by-element Python `sum(mask)` and stays correct when `mask`
        # holds node indices rather than booleans.
        total_examples = pred.numel()

    return total_correct / total_examples

    

def ACMtrain(model, data, epochs=100, train_neighbors=[-1,10], test_neighbors=[-1,10]):
    """Train ``model`` full-batch on ``data`` and evaluate it after every epoch.

    Each epoch performs one optimisation step over all training nodes, then
    computes accuracy on the train / validation / test masks.

    Args:
        model: module called as ``model(x, adj_low, adj_high, adj_low_unnormalized)``.
        data: graph object providing ``x``, ``y``, ``edge_index``, ``num_nodes``
            and the ``train_mask`` / ``val_mask`` / ``test_mask`` attributes.
            It is moved to ``device`` and gains an ``edge_weight`` attribute.
        epochs: number of training epochs.
        train_neighbors, test_neighbors: only printed when ``args.log_info`` is
            set; they do not influence training here.

    Returns:
        tuple: ``(best_test_accuracy, iterations)``.  ``iterations`` is ``0``
        when ``args.log_info`` is true and ``epochs`` otherwise.

    NOTE(review): the reported accuracy is the maximum *test* accuracy over all
    epochs, i.e. the epoch is selected on the test set.
    """
    if args.log_info:
        print("Train neighbors: ", train_neighbors)
        print("Test neighbors: ", test_neighbors)

    # Learning rate and weight decay are fixed here; the `--learning_rate`
    # command-line option is not consulted.
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

    if data.y.ndim == 1:
        criterion = torch.nn.NLLLoss()
    else:
        criterion = torch.nn.BCEWithLogitsLoss()

    col = data.edge_index[1]
    # Norm by in-degree.  The attribute is stored on `data` but is not read
    # again inside this function.
    data.edge_weight = 1. / degree(col, data.num_nodes)[col]

    train_losses = []
    val_accuracies = []
    train_accuracies = []
    test_accuracies = []

    data = data.to(device)
    n = data.num_nodes
    x = data.x.to(device)

    # The three adjacency inputs the model is called with: the raw adjacency,
    # normalize_tensor(I + A) ("low") and I minus that matrix ("high").
    adj_low_unnormalized = to_scipy_sparse_matrix(data.edge_index)
    adj_low = normalize_tensor(sp.identity(n) + adj_low_unnormalized)
    adj_high = sp.identity(n) - adj_low
    adj_low = sparse_mx_to_torch_sparse_tensor(adj_low).to(device)
    adj_high = sparse_mx_to_torch_sparse_tensor(adj_high).to(device)
    adj_low_unnormalized = sparse_mx_to_torch_sparse_tensor(adj_low_unnormalized).to(device)

    if args.log_info:
        print(x.device, adj_low.device, adj_high.device, adj_low_unnormalized.device)

    num_iteration = epochs
    for epoch in range(1, epochs + 1):

        model.train()
        optimizer.zero_grad()

        out = model(x, adj_low, adj_high, adj_low_unnormalized)
        out = F.log_softmax(out, dim=1)
        loss = criterion(out[data.train_mask], data.y[data.train_mask])

        loss.backward()
        optimizer.step()

        # One full-batch step per epoch, so the epoch loss is just this loss.
        # The previous code multiplied and divided it by
        # `int(sum(data.train_mask))`, a per-element Python loop over the mask.
        epoch_loss = loss.item()
        train_losses.append(epoch_loss)

        model.eval()
        with torch.no_grad():
            out = model(x, adj_low, adj_high, adj_low_unnormalized)
            pred = out.argmax(dim=-1)
            correct = pred.eq(data.y)

        accs = [
            correct[mask].sum().item() / mask.sum().item()
            for _, mask in data('train_mask', 'val_mask', 'test_mask')
        ]

        train_accuracies.append(accs[0])
        val_accuracies.append(accs[1])
        test_accuracies.append(accs[2])
        # Spread of the last (up to) five training losses; only reported.
        std_dev = np.std(train_losses[-5:])

        if args.log_info:
            print(f'Epoch: {epoch:03d}, Train Loss: {epoch_loss:.4f}, Train: {accs[0]:.4f}, Val: {accs[1]:.4f}, Test: {accs[2]:.4f}, Std dev: {std_dev:.4f}')

    # `default=0` keeps `epochs=0` from raising on the empty history lists.
    best_acc = max(test_accuracies, default=0)

    if args.log_info:
        save_plot([train_losses, train_accuracies, val_accuracies, test_accuracies], labels=['Loss','Train','Validation','Test'], name='Results/ACMVal', yname='Accuracy', xname='Epoch')

        print ("Best Validation Accuracy, ", max(val_accuracies, default=0))
        print ("Best Test Accuracy, ", best_acc)
        return best_acc, 0

    return best_acc, num_iteration

In [69]:
def ACMperformanceSampler(data, dataset, num_classes, epochs=1, train_neighbors=[-1,10], test_neighbors=[-1,10]):
    """Build the ACM ``GCN`` model for ``dataset`` and train it with ``ACMtrain``.

    Args:
        data: graph handed to ``ACMtrain``; also supplies ``num_nodes``.
        dataset: supplies ``num_features`` for the input width.
        num_classes: output width of the model.
        epochs: number of epochs forwarded to ``ACMtrain``.
        train_neighbors, test_neighbors: forwarded to ``ACMtrain`` unchanged.

    Returns:
        tuple: ``(best_acc, num_iteration, model)`` — the two values returned
        by ``ACMtrain`` followed by the trained model.
    """
    # Names the author listed next to `model_type` (presumably the variants
    # the GCN class accepts): acmgcnpp acmgcn acmgcnp acmsgc acmsnowball
    gcn_settings = dict(
        nfeat=dataset.num_features,
        nhid=64,
        nclass=num_classes,
        nlayers=2,
        nnodes=data.num_nodes,
        dropout=0.2,
        model_type='acmgcnp',
        structure_info=1,
        variant=True,
    )
    model = GCN(**gcn_settings).to(device)

    if args.log_info:
        print(model)

    training_result = ACMtrain(
        model,
        data,
        epochs,
        train_neighbors=train_neighbors,
        test_neighbors=test_neighbors,
    )
    best_acc, num_iteration = training_result

    return best_acc, num_iteration, model

# Main

In [70]:
# args.log_info = False

# data, dataset = get_data('karate', DIR=None, log=False, h_score=True, split_no=0); print("")
# # # data = generate_synthetic(data, d=42, h=0.2, train=0.6, random_state=1, log=True)
# # # data.x = F.one_hot(data.y).float()
# # data
# best_acc, num_iteration, _ = ACMperformanceSampler(data, dataset, dataset.num_classes, epochs=150, train_neighbors=[-1,-1], test_neighbors=[-1,-1])
# print(best_acc, num_iteration)

# Experiments

In [71]:
def batch_experiments(num_run=1):
    """Run the ACM model ``num_run`` times on every listed dataset and log results.

    For each dataset a line ``<name> acc <mean> sd <std> itr <mean> sd <std>``
    is printed and appended to ``Results/ACMgcnp.txt``.  Graphs with more than
    100000 nodes are not trained; they are recorded with accuracy and
    iteration count ``-1``.

    Side effect: sets ``args.log_info`` to ``False``.

    Args:
        num_run: number of runs per dataset; run ``i`` requests split ``i``
            from ``get_data``.
    """
    ALL_DATASETs= [
        "Cornell",
        "Texas",
        "Wisconsin",
        "reed98",
        "amherst41",
        "penn94",
        "Roman-empire",
        "cornell5",
        "Squirrel",
        "johnshopkins55",
        "AmazonProducts",
        "Actor",
        "Minesweeper",
        "Questions",
        "Chameleon",
        "Tolokers",
        "Flickr",
        "Yelp",
        "Amazon-ratings",
        "genius",
        "cora",
        "CiteSeer",
        "dblp",
        "Computers",
        "PubMed",
        "pubmed",
        "Reddit",
        "cora_ml",
        "Cora",
        "Reddit2",
        "CS",
        "Photo",
        "Physics",
        "citeseer"
    ]

    #ALL_DATASETs= ["karate"]

    args.log_info = False

    for DATASET_NAME in ALL_DATASETs:
        print(DATASET_NAME, end=' ')

        # The context manager closes the file even if a run raises.
        with open("Results/ACMgcnp.txt", 'a+') as result_file:
            result_file.write(f'{DATASET_NAME} ')

            accs = []
            itrs = []

            for i in range(num_run):
                data, dataset = get_data(DATASET_NAME, DIR=None, log=False, h_score=False, split_no=i)

                # Multi-column labels are collapsed to their arg-max column.
                if len(data.y.shape) > 1:
                    data.y = data.y.argmax(dim=1)

                # The previous branching always ended up at max(label) + 1:
                # `dataset.num_classes` was kept only when it already equalled
                # that value.
                num_classes = int(torch.max(data.y).item()) + 1

                if data.num_nodes>100000:
                    accuracy, itr = -1, -1
                else:
                    accuracy, itr, _ =  ACMperformanceSampler(data, dataset, num_classes, epochs=150, train_neighbors=[8,4], test_neighbors=[8,4])

                accs.append(accuracy)
                itrs.append(itr)

            summary = f'acc {np.mean(accs):0.4f} sd {np.std(accs):0.4f} itr {int(np.mean(itrs)):d} sd {int(np.std(itrs)):d}'
            print(summary)
            result_file.write(summary + '\n')
                
# batch_experiments(num_run=5)

In [72]:
from ipynb.fs.full.Dataset import generate_synthetic2homophily
import torch_geometric.utils.homophily as homophily

## Comparison

In [74]:
def ablation(num_run = 1):
    """Train the ACM model on a synthetic two-homophily variant of each dataset.

    For every dataset and run, the graph produced by
    ``generate_synthetic2homophily`` is loaded from ``DIR + 'AGSGNNstruc/'``
    (or generated and saved there), symmetrised, described by its homophily
    scores and average degree, and then trained via ``ACMperformanceSampler``.
    The mean and standard deviation of the accuracies (in percent) and of the
    iteration counts are printed and appended to
    ``Results/AGM-GCNablation.txt``.

    Side effects: sets ``args.log_info`` to ``False`` and assigns the
    module-level global ``data_filename_extension``.

    Args:
        num_run: number of runs per dataset; run ``i`` requests split ``i``
            from ``get_data``.

    NOTE(review): the cache file name does not include the split index, so
    once the cache exists every run loads the same graph and the object
    returned by ``get_data`` for that split is replaced.
    """
    global data_filename_extension

    # Other datasets the author listed for this study: "Tolokers", "Computers", "Photo".
    ALL_DATASETs= ["Cora"]

    args.log_info = False

    filename = "Results/AGM-GCNablation.txt"

    # Arguments forwarded to generate_synthetic2homophily; they are also
    # concatenated to form the cache file name.
    d = 100
    h = 0.05
    h2 = 0.25
    ratio = 0.50
    train = 0.3
    balance = True
    random_state = 10

    cache_suffix = f'{d}{h}{h2}{ratio}{train}{random_state}{balance}'

    for DATASET_NAME in ALL_DATASETs:

        print(DATASET_NAME,"-",random_state, end=' ')

        with open(filename, 'a+') as result_file:
            result_file.write(f'{DATASET_NAME} ')

        accs = []
        itrs = []

        for i in range(num_run):
            data, dataset = get_data(DATASET_NAME, DIR=None, log=False, h_score=False, split_no=i)

            data_filename_extension = cache_suffix + '.weight'
            data_filename = DIR + 'AGSGNNstruc/' + DATASET_NAME + cache_suffix

            if os.path.exists(data_filename):
                # torch.load unpickles the file: only point this at cache
                # files written by the torch.save call below.
                data = torch.load(data_filename)
                print("loaded "+data_filename)
            else:
                data = generate_synthetic2homophily(data, d=d, h1=h, h2=h2, ratio=ratio, train=train, random_state=random_state, log=False, balance=balance)
                torch.save(data,data_filename)
                print("saved "+data_filename)

            # Make the graph undirected: add every reversed edge, then merge duplicates.
            row, col = data.edge_index
            data.edge_index = torch.stack((torch.cat((row, col),dim=0),torch.cat((col, row),dim=0)),dim=0)
            data.edge_index = torch_geometric.utils.coalesce(data.edge_index)

            print("Node Homophily:", homophily(data.edge_index, data.y, method='node'))
            print("Edge Homophily:", homophily(data.edge_index, data.y, method='edge'))
            print("Edge_insensitive Homophily:", homophily(data.edge_index, data.y, method='edge_insensitive'))
            print("Degree: ", data.num_edges / data.num_nodes)

            # Multi-column labels are collapsed to their arg-max column.
            if len(data.y.shape) > 1:
                data.y = data.y.argmax(dim=1)

            # The previous branching always ended up at max(label) + 1.
            num_classes = int(torch.max(data.y).item()) + 1

            # Shorter schedule for large graphs.  This value used to be
            # computed and then ignored in favour of a hard-coded 250.
            if data.num_nodes<100000:
                max_epochs = 250
            else:
                max_epochs = 20

            if DATASET_NAME in ['Squirrel', 'Chameleon','cornell5','penn94','johnshopkins55','amherst41']:
                # NOTE(review): `adj_feature` is not defined in the visible
                # part of this notebook — confirm it is in scope before
                # adding one of these datasets to ALL_DATASETs.
                data.x = torch.cat((data.x, adj_feature(data)), dim=1)
                if args.log_info:
                    print(data.x.shape)

            accuracy, itr, _ = ACMperformanceSampler(data, dataset, num_classes, epochs=max_epochs, train_neighbors=[8,4], test_neighbors=[8,4])

            accs.append(accuracy)
            itrs.append(itr)

        print(accs, itrs)

        # Built once so the file gets exactly what the console shows; the file
        # line used to scale the standard deviation by 10 instead of 100.
        summary = f'acc {np.mean(accs)*100:0.4f} \\pm {np.std(accs)*100:0.4f} itr {int(np.mean(itrs)):d} sd {int(np.std(itrs)):d}'
        print(summary)
        with open(filename, 'a+') as result_file:
            result_file.write(summary + '\n')

# Run the ablation study (five runs per dataset) and report its wall-clock
# duration in seconds.
st_time = time.time()
ablation(num_run=5)
en_time = time.time()

print("Runtime: ", en_time-st_time)

Cora - 10 loaded /scratch/gilbreth/das90/Dataset/AGSGNNstruc/Cora1000.050.250.50.310True
Node Homophily: 0.14723201096057892
Edge Homophily: 0.14734117686748505
Edge_insensitive Homophily: 0.005595000926405191
Degree:  191.96349206349205
loaded /scratch/gilbreth/das90/Dataset/AGSGNNstruc/Cora1000.050.250.50.310True
Node Homophily: 0.14723201096057892
Edge Homophily: 0.14734117686748505
Edge_insensitive Homophily: 0.005595000926405191
Degree:  191.96349206349205
loaded /scratch/gilbreth/das90/Dataset/AGSGNNstruc/Cora1000.050.250.50.310True
Node Homophily: 0.14723201096057892
Edge Homophily: 0.14734117686748505
Edge_insensitive Homophily: 0.005595000926405191
Degree:  191.96349206349205
loaded /scratch/gilbreth/das90/Dataset/AGSGNNstruc/Cora1000.050.250.50.310True
Node Homophily: 0.14723201096057892
Edge Homophily: 0.14734117686748505
Edge_insensitive Homophily: 0.005595000926405191
Degree:  191.96349206349205
loaded /scratch/gilbreth/das90/Dataset/AGSGNNstruc/Cora1000.050.250.50.310True