## Get Cuda and Processor information

In [1]:
import sys
sys.path.append('../Submodular')

import DeviceDir

DIR, RESULTS_DIR = DeviceDir.get_directory()
device, NUM_PROCESSORS = DeviceDir.get_device()

In [2]:
from ipynb.fs.full.Dataset import get_data
from ipynb.fs.full.Dataset import datasets as available_datasets
from ipynb.fs.full.Utils import save_plot

In [3]:
import argparse
from argparse import ArgumentParser

#set default arguments here
def get_configuration():
    parser = ArgumentParser()
    parser.add_argument('--epochs', type=int, default=1)
    parser.add_argument('--log_info', type=bool, default=True)
    parser.add_argument('--pbar', type=bool, default=False)
    parser.add_argument('--batch_size', type=int, default=2048)
    parser.add_argument('--learning_rate', type=float, default=0.01)
    parser.add_argument('--recompute', type=bool, default=False)
    parser.add_argument('--num_gpus', type=int, default=-1)
    parser.add_argument('--parallel_mode', type=str, default="dp", choices=['dp', 'ddp', 'ddp2'])
    parser.add_argument('--dataset', type=str, default="Cora", choices=available_datasets)
    parser.add_argument('--use_normalization', action='store_false', default=True)
    parser.add_argument('-f') ##dummy for jupyternotebook
    args = parser.parse_args()
    
    dict_args = vars(args)
    
    return args, dict_args

args, dict_args = get_configuration()

## libraries

In [4]:
import random
import numpy as np
import torch

seed = 123

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
None

In [5]:
import os
import math
import time
from tqdm import tqdm
import torch.nn as nn
from torch_geometric.data import Data, Dataset

## GNN model

In [6]:
import torch_geometric
from torch_geometric.nn import GCNConv, GATConv, SAGEConv, GINConv, ChebConv
from torch_geometric.nn import GraphConv, TransformerConv
from torch_geometric.utils import degree
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from ipynb.fs.full.SpatialConv import SpatialConv

### GNN option 1

In [7]:
GNNconv = SAGEConv

In [8]:
class GNNHomophily(torch.nn.Module):
    def __init__(self, num_features,num_classes, hidden_channels=16):
        super().__init__()        
        ##GNN layer
        if(GNNconv==GINConv):
            self.MLP1 = nn.Linear(num_features,hidden_channels)
            self.MLP2 = nn.Linear(hidden_channels,num_classes)
            self.conv1 = GNNconv(self.MLP1)
            self.conv2 = GNNconv(self.MLP2)                
        else:        
            self.conv1 = GNNconv(num_features, hidden_channels)
            self.conv2 = GNNconv(hidden_channels,num_classes)

    def forward(self, x, edge_index, edge_weight=None):
        x = self.conv1(x, edge_index, edge_weight)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index, edge_weight)
        return x
    
class GNNHeterophily(torch.nn.Module):
    def __init__(self, num_features,num_classes, hidden_channels=16):
        super().__init__()
        self.num_classes = num_classes
             
        self.conv1 = ChebConv(num_features, hidden_channels, K=2, normalization='sym')
        #self.conv2 = GNNconv(hidden_channels,hidden_channels)
        self.conv3 = ChebConv(hidden_channels,num_classes, K=2, normalization='sym')

    def forward(self, x, edge_index, edge_weight=None):
        x = self.conv1(x, edge_index, edge_weight)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training)
#         x = self.conv2(x, edge_index, edge_weight)
#         x = x.relu()
#         x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv3(x, edge_index, edge_weight)
        
        return x
    
class AGSGNN(torch.nn.Module):
    def __init__(self, num_features,num_classes, hidden_channels=16, dropout=0.5):
        super().__init__()
        self.num_classes = num_classes
        
        hidden = int(hidden_channels/2)
        
        self.gnn1 = GNNHomophily(num_features, hidden, hidden_channels)
        self.gnn2 = GNNHomophily(num_features, hidden, hidden_channels)        
        #self.gnn2 = GNNHeterophily(num_features, hidden, hidden_channels)
        self.p = dropout
        self.com_lin = nn.Linear(hidden*2, num_classes)
        
        
#         self.T = 2        
#         self.layer_norm_a1 =  nn.LayerNorm(num_classes)
#         self.layer_norm_s1 =  nn.LayerNorm(num_classes)
        
#         self.alpha_a1 = nn.Linear(num_classes, 1)
#         self.alpha_s1 = nn.Linear(num_classes, 1)
#         self.w1 = nn.Linear(self.T, self.T)
        
        #self.reset_parameters()
            
#     def reset_parameters(self):
#         std_att = 1. / math.sqrt(self.w1.weight.size(1))
#         std_att_vec = 1. / math.sqrt( self.alpha_a1.weight.size(1))
        
#         self.alpha_s1.weight.data.uniform_(-std_att, std_att)
#         self.alpha_i1.weight.data.uniform_(-std_att, std_att)
        
#         self.layer_norm_a1.reset_parameters()
#         self.layer_norm_s1.reset_parameters()        
        
    def forward(self, batch_data):
        
        #out = model(batch_data.x, batch_data.edge_index, batch_data.weight)
        #out = model(batch_data.x, batch_data.edge_index, batch_data.edge_weight)
        #out = model(batch_data.x, batch_data.edge_index)
        
        x1 = self.gnn1(batch_data[0].x, batch_data[0].edge_index)
        #return x1        
        
        x2 = self.gnn2(batch_data[1].x, batch_data[1].edge_index)
        #return x2
        
        a1 = F.relu(x1)
        #a1 = self.layer_norm_a1(a1)
        a1 = F.dropout(a1, p=self.p, training=self.training)
        
        s1 = F.relu(x2)
        #s1 = self.layer_norm_s1(s1)
        s1 = F.dropout(s1, p=self.p, training=self.training)
        
        used = batch_data[0].batch_size
        
        x = torch.cat([a1[:used,:], s1[:used,:]], dim=-1)
        x = self.com_lin(x)
        
        
#         ala1 = torch.sigmoid(self.alpha_a1(a1))
#         als1 = torch.sigmoid(self.alpha_s1(s1))        
        
#         alpha1 = F.softmax(self.w1(torch.cat([ala1, als1],dim=-1)/self.T), dim=1)                
#         x = torch.mm(torch.diag(alpha1[:,0]),a1) + torch.mm(torch.diag(alpha1[:,1]),s1)
        
        #print(x.shape)
        
        return x

## GNN Training and Testing

In [9]:
from torch_geometric.loader import NeighborSampler, NeighborLoader
from ipynb.fs.full.AGSNodeSampler import WeightedNeighborLoader

In [10]:
def test(model, loader, mask, name='Train'):    
    if args.log_info:    
        pbar = tqdm(total=sum(mask).item())
        pbar.set_description(f'Evaluating {name}')
    
    model.eval()
    
    total_correct=0
    total_examples=0
    
    sigmoid = nn.Sigmoid()    
    
    y_true = []
    y_pred = []
    
    with torch.no_grad():                  
    
        for i,batch_data in enumerate(loader):
            
            print(i)
            
            batch_data = [b.to(device) for b in batch_data]
            used = batch_data[0].batch_size
            
            out = model(batch_data)
                   
            out=out[:used,:]
            pred = out.argmax(dim=1)            

            y_true.append(batch_data[0].y[:used].detach().cpu().numpy())
            y_pred.append(pred.detach().cpu().numpy())
            
            if args.log_info:
                pbar.update(used)
              
    if args.log_info:
        pbar.close()
    
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)
    
    acc = accuracy_score(y_true, y_pred)
                    
    return acc

In [11]:
def train(DATASET_NAME, model, data, epochs=100, train_neighbors=[-1,10], test_neighbors=[-1,10]):
    
    if args.log_info:
        print("Train neighbors: ", train_neighbors)
        print("Test neighbors: ", test_neighbors)
        
#     optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
    optimizer = torch.optim.Adam(model.parameters())
    
    if data.y.ndim == 1:
        criterion = torch.nn.CrossEntropyLoss()
    else:
        criterion = torch.nn.BCEWithLogitsLoss()
    
    batch_size=512         
    worker = 0
    
    if data.num_nodes>=50000:
        worker = 8
    else:
        worker = min(8,int(sum(data.train_mask)/batch_size))
        
    if args.log_info:
        print("Worker: ", worker)
        
#     weight_func=['knn','submodular']; 
#     weight_func=['fastlink','fastlink']; 
    weight_func=['random', 'random'];  worker = 0;
#     weight_func=['fastlink', 'fastlink'];  #worker = 2;
    params={
        'knn':{'metric':'cosine'},
        'submodular':{'metric':'cosine'},
        'link-nn':{'value':'min'},
        'link-sub':{'value':'max'},
    }    
    
    sampler_dir = DIR+'AGSGIN2struc/'+DATASET_NAME
#     if not os.path.exists(sampler_dir):
#         os.makedirs(sampler_dir)
    
    start = time.time()    
    loader = WeightedNeighborLoader(data, input_nodes=data.train_mask,num_neighbors=train_neighbors, 
                              batch_size=batch_size, shuffle=True, num_workers=worker, drop_last=False, 
                              weight_func=weight_func, params=params, log=args.log_info,
                                    directed=False, replace = True,
                                    save_dir = sampler_dir,recompute = args.recompute)

    train_loader = WeightedNeighborLoader(data, input_nodes=data.train_mask,num_neighbors=train_neighbors, 
                              batch_size=batch_size, shuffle=False, num_workers=worker, drop_last=False, 
                              weight_func=weight_func, params=params, log=args.log_info,
                                          directed=False, replace = True,
                                          save_dir = sampler_dir,recompute = False)
    
    val_loader = WeightedNeighborLoader(data, input_nodes=data.val_mask,num_neighbors=test_neighbors, 
                              batch_size=batch_size, shuffle=False, num_workers=min(8,int(sum(data.val_mask)/batch_size)), 
                                        drop_last=False, 
                              weight_func=weight_func, params=params,log=args.log_info, directed=False, replace = True,
                                        save_dir = sampler_dir,recompute = False)
    
    test_loader = WeightedNeighborLoader(data, input_nodes=data.test_mask,num_neighbors=test_neighbors, 
                              batch_size=batch_size, shuffle=False, num_workers=min(8,int(sum(data.test_mask)/batch_size)), drop_last=False, 
                              weight_func=weight_func, params=params, log=args.log_info, directed=False, replace = True,
                                         save_dir = sampler_dir,recompute = False)
    
    top_k_accs = []    
    best_acc=0  
    
    train_losses=[]
    val_accuracies=[]
    train_accuracies=[]
    test_accuracies=[]
    
    num_iteration = epochs
    
    end = time.time()
    if args.log_info:
        print("Total initialization time: ", end-start)
    
    start = time.time()
    
    for epoch in range(1,epochs+1):
        
        
        
#         if args.log_info:
#             pbar = tqdm(total=int(sum(data.train_mask)))
#             pbar.set_description(f'Epoch {epoch:02d}')
        
#         model.train()
#         total_loss = total_examples = 0
        
#         for i,batch_data in enumerate(loader):            
#             #print(batch_data)
            
#             batch_data = [b.to(device) for b in batch_data]
#             used = batch_data[0].batch_size #int(sum(batch_data.train_mask))       
            
#             optimizer.zero_grad()            
#             out = model(batch_data)
#             #out = F.log_softmax(out, dim=1)                 
#             #loss = F.nll_loss(out[batch_data[0].train_mask], batch_data[0].y[batch_data[0].train_mask])
#             #loss = F.cross_entropy(out[:used], batch_data[0].y[:used])
#             loss = criterion(out[:used], batch_data[0].y[:used])
            
#             loss.backward()
#             optimizer.step()
                        
#             total_loss += loss.item() * used
#             total_examples += used
            
#             if args.log_info:
#                 pbar.update(used)
#         if args.log_info:
#             pbar.close()
        
    
        loss = 0
        if epoch%10 == 0:        
        #if True:
            loss=total_loss / total_examples
            train_losses.append(loss)

            #print(f'Epoch: {epoch:03d}, Train Loss: {loss:.4f}', end = ', ')                

            if args.log_info:
                train_acc=test(model, train_loader,data.train_mask,'Train')            
                train_accuracies.append(train_acc.item())        
            else:
                train_acc = 0 ; train_accuracies.append(train_acc)

            if args.log_info:
                val_acc = test(model, val_loader,data.val_mask,'Validation')
                val_accuracies.append(val_acc.item())
            else:
                val_acc = 0 ; val_accuracies.append(val_acc)

            test_acc = test(model, test_loader,data.test_mask,'Test')
            test_accuracies.append(test_acc.item())
            #print(f'Epoch: {epoch:03d}, Test: {test_acc:.4f}')

            std_dev = np.std(train_losses[-5:])
            #print(f'Epoch: {epoch:03d}, Std dev: {std_dev:.4f}')

            if args.log_info:
                print(f'Epoch: {epoch:03d}, Train Loss: {loss:.4f}, Train: {train_acc:.4f}, Val: {val_acc:.4f}, Test: {test_acc:.4f}, Std dev: {std_dev:.4f}')

#             if epoch>=5 and std_dev<=1e-3:
#                 num_iteration = epoch

#                 if args.log_info:                
#                     print("Iteration for convergence: ", epoch)
#                 break
        
    if args.log_info:
        #save_plot([val_accuracies], labels=['Validation'], name='Plots/Validation', yname='Accuracy', xname='Epoch')    
        save_plot([train_losses, train_accuracies, val_accuracies, test_accuracies], labels=['Loss','Train','Validation','Test'], name='Results/AGSNSVal', yname='Accuracy', xname='Epoch')
        
        print ("Best Validation Accuracy, ",max(val_accuracies))
        print ("Best Test Accuracy, ",max(test_accuracies))
        
    best_acc = max(test_accuracies)
    
    end = time.time()
    if args.log_info:
        print("Total epoch time: ", end-start)    
    
    return best_acc, num_iteration

In [12]:
def AGSNSperformanceSampler(DATASET_NAME, data, dataset, num_classes, epochs=1, train_neighbors=[-1,-1], test_neighbors=[-1,-1]):        
    
    model = AGSGNN(data.x.shape[1], num_classes, hidden_channels=64).to(device)
    
    if args.log_info: print(model)    
    
    best_acc, num_iteration = train(DATASET_NAME, model, data, epochs, train_neighbors=train_neighbors, test_neighbors=test_neighbors)    
    
    return best_acc, num_iteration, model

In [13]:
def adj_feature(data):    
    adj_mat = torch.zeros((data.num_nodes,data.num_nodes))
    edges = data.edge_index.t()
    adj_mat[edges[:,0], edges[:,1]] = 1
    return adj_mat

# adj_feature(data)
# data.x.shape

In [14]:
from torch_geometric.utils import add_self_loops

In [15]:
args.log_info = True
DATASET_NAME = 'wiki'
data, dataset = get_data(DATASET_NAME, DIR=None, log=False, h_score=True, split_no=0, random_state=0); print("")
print(data)

# (row, col) = data.edge_index
# data.edge_index = torch.stack((torch.cat((row, col),dim=0),torch.cat((col, row),dim=0)),dim=0)
# data.edge_index = torch_geometric.utils.coalesce(data.edge_index)
# print(data)

args.recompute = False

if len(data.y.shape) > 1:
    data.y = data.y.argmax(dim=1)        
    num_classes = torch.max(data.y).item()+1
else:
    num_classes = dataset.num_classes

if num_classes!= torch.max(data.y)+1:
    num_classes = torch.max(data.y).item()+1
    
# data.edge_index, _ = add_self_loops(data.edge_index)            
# data.x = torch.cat((data.x, adj_feature(data)), dim=1)
# if args.log_info == True:
#     print(data.x.shape)

# if DATASET_NAME in ['Cornell', 'cornell5']:
#     data.edge_index, _ = add_self_loops(data.edge_index)            
    
# if DATASET_NAME in ['Squirrel', 'Chameleon', 'amherst41',
#                     'Cornell','cornell5', 'johnshopkins55']:
#     data.x = torch.cat((data.x, adj_feature(data)), dim=1)
#     if args.log_info == True:
#         print(data.x.shape)

best_acc, num_iteration, _ =  AGSNSperformanceSampler(DATASET_NAME, data, dataset, num_classes, epochs=50, train_neighbors=[8,4], test_neighbors=[8,4])
print(best_acc, num_iteration)

N  1925342  E  303434860  d  157.6004990282246 0.2775627076625824 0.3893907368183136 -1 -0.07123160362243652 
Data(x=[1925342, 600], edge_index=[2, 303434860], y=[1925342], train_mask=[1925342], val_mask=[1925342], test_mask=[1925342])
AGSGNN(
  (gnn1): GNNHomophily(
    (conv1): SAGEConv(600, 64)
    (conv2): SAGEConv(64, 32)
  )
  (gnn2): GNNHomophily(
    (conv1): SAGEConv(600, 64)
    (conv2): SAGEConv(64, 32)
  )
  (com_lin): Linear(in_features=64, out_features=6, bias=True)
)
Train neighbors:  [8, 4]
Test neighbors:  [8, 4]
Worker:  8


TypeError: exceptions must derive from BaseException

# Batch Experiments

In [None]:
def batch_experiments(num_run=1):
    
    ALL_DATASETs= [
        'snap-patents',
        'wiki',
        'genius',
        'pokec',
        'Yelp',
        'twitch-gamer',
        'arxiv-year',        
        'AmazonProducts',   
        'Yelp',
        'Reddit',
        'Reddit2',
    ]     
    
    ALL_DATASETs= [        
        'wiki',
        'genius',
        'pokec',
        'Yelp',
        'twitch-gamer',
        'arxiv-year',        
        'AmazonProducts',        
    ]

    args.log_info = False
    
    filename = "Results/AGS-NS-GIN2.txt"
    
    for DATASET_NAME in ALL_DATASETs:  
        print(DATASET_NAME, end=' ')
        
        
        result_file = open(filename,'a+')        
        result_file.write(f'{DATASET_NAME} ')
        result_file.close()
                
        accs = []
        itrs = []
                
        for i in range(num_run):
            data, dataset = get_data(DATASET_NAME, DIR=None, log=False, h_score=False, split_no=i, random_state=i)   
            
#             #optional for making undirected graph
#             (row, col) = data.edge_index
#             data.edge_index = torch.stack((torch.cat((row, col),dim=0),torch.cat((col, row),dim=0)),dim=0)
#             data.edge_index = torch_geometric.utils.coalesce(data.edge_index)
                    
            if len(data.y.shape) > 1:
                data.y = data.y.argmax(dim=1)        
                num_classes = torch.max(data.y).item()+1
            else:
                num_classes = dataset.num_classes
            
            if num_classes!= torch.max(data.y)+1:
                num_classes = torch.max(data.y).item()+1
                
            if data.num_nodes<100000:
                max_epochs = 150
            else:
                max_epochs = 20
                
            if DATASET_NAME in ['Squirrel', 'Chameleon','cornell5','penn94','johnshopkins55']:
                data.x = torch.cat((data.x, adj_feature(data)), dim=1)
                if args.log_info == True:
                    print(data.x.shape)
                              
            accuracy, itr, _ = AGSNSperformanceSampler(DATASET_NAME, data, dataset, num_classes, epochs=max_epochs, train_neighbors=[8,4], test_neighbors=[8,4])
            
            accs.append(accuracy)
            itrs.append(itr)
            #print(itr, accuracy)
                        
        #print(accs, itrs)
        print(f'acc {np.mean(accs):0.4f} sd {np.std(accs):0.4f} itr {int(np.mean(itrs)):d} sd {int(np.std(itrs)):d}')
        result_file = open(filename,'a+')
        result_file.write(f'acc {np.mean(accs):0.4f} sd {np.std(accs):0.4f} itr {int(np.mean(itrs)):d} sd {int(np.std(itrs)):d}\n')
        result_file.close()
                
# batch_experiments(num_run=1)

## View Learned Representation

In [None]:
# if __name__ == '__main__':    
    
#     n=7
#     x = torch.Tensor([[1,0],[1,0],[1,0],[0,1],[0,1],[0,1],[0,1]])
#     y = torch.LongTensor([0,0,0, 1, 1, 1, 1])
#     edge_index = torch.LongTensor([[1,2],[1,4],[1,5],[2,1],[3,6],[3,7],[4,5],[4,1],[4,6],[4,7],[5,1],[5,4],[5,6],[6,3],[6,4],[6,5],[6,7],[7,3],[7,4],[7,6]]).T
#     edge_index = edge_index-1
    
#     mask = torch.zeros(n, dtype=torch.bool)
#     mask[[1,3]] = True
    
#     test_data = Data(x = x, y = y, edge_index = edge_index, train_mask = mask, test_mask = mask, val_mask = mask)    
#     print(test_data)
    
    
#     None

In [None]:
# from sklearn.manifold import TSNE
# import matplotlib.pyplot as plt

In [None]:
# model.eval()
# #X = model(data.x.to(device),data.edge_index.to(device), data.weight.to(device))
# X = model(data.x.to(device),data.edge_index.to(device))
# X = X.detach().to('cpu')
# y = data.y.to('cpu')
# X.shape

In [None]:
# plt.figure(figsize=(10, 10))

# # Create a t-SNE model with 2 components and a perplexity of 30
# tsne = TSNE(n_components=2, perplexity=30, random_state=42, learning_rate='auto', init='random')

# # Fit and transform the data to the 2D t-SNE space
# X_tsne = tsne.fit_transform(X)

# # Plot the data in the 2D t-SNE space, colored by class
# plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y)
# plt.show()