## Get CUDA and processor information

In [40]:
import sys
sys.path.append('../Submodular')

import DeviceDir

# DeviceDir is a project-local module found via the sys.path entry added above.
# get_directory() yields two values bound to DIR and RESULTS_DIR; get_device()
# yields the `device` that batches and the model are moved to further down,
# plus NUM_PROCESSORS (exact semantics live in DeviceDir, not visible here).
DIR, RESULTS_DIR = DeviceDir.get_directory()
device, NUM_PROCESSORS = DeviceDir.get_device()

In [41]:
from ipynb.fs.full.Dataset import get_data
from ipynb.fs.full.Dataset import datasets as available_datasets
from ipynb.fs.full.Utils import save_plot

In [42]:
import argparse
from argparse import ArgumentParser

#set default arguments here
def _str2bool(value):
    """Parse a command-line token into a real boolean.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is ``True``
    because any non-empty string is truthy, so the flag could never be disabled.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ("true", "t", "yes", "y", "1"):
        return True
    if token in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def get_configuration(argv=None):
    """Build the run configuration from command-line style arguments.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is the original behaviour.

    Returns:
        A ``(args, dict_args)`` tuple: the parsed ``argparse.Namespace`` and the
        same values exposed as a dictionary (``vars(args)``, i.e. a live view of
        the namespace rather than a copy).
    """
    parser = ArgumentParser()
    parser.add_argument('--epochs', type=int, default=1)
    # Boolean options go through _str2bool so "--log_info False" really is False.
    parser.add_argument('--log_info', type=_str2bool, default=True)
    parser.add_argument('--pbar', type=_str2bool, default=False)
    parser.add_argument('--batch_size', type=int, default=2048)
    parser.add_argument('--learning_rate', type=float, default=0.01)
    parser.add_argument('--num_gpus', type=int, default=-1)
    parser.add_argument('--parallel_mode', type=str, default="dp", choices=['dp', 'ddp', 'ddp2'])
    parser.add_argument('--dataset', type=str, default="Cora", choices=available_datasets)
    # NOTE: store_false means that *passing* --use_normalization turns it off.
    parser.add_argument('--use_normalization', action='store_false', default=True)
    parser.add_argument('-f')  # dummy option so the parser accepts the -f argument seen under Jupyter
    args = parser.parse_args(argv)

    dict_args = vars(args)

    return args, dict_args

# Parsed once when this cell runs; `args` is then read as a module-level global
# by test(), train(), AGSNSperformanceSampler() and batch_experiments().
args, dict_args = get_configuration()

## Libraries

In [43]:
import random
import numpy as np
import torch

# One fixed seed shared by Python's `random`, NumPy and PyTorch.
seed = 123

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
# Bare expression: presumably here so the notebook cell does not echo the
# return value of the call above.
None

In [44]:
import os
import math
import time
from tqdm import tqdm
import torch.nn as nn
from torch_geometric.data import Data, Dataset

## GNN model

In [45]:
import torch_geometric
from torch_geometric.nn import GCNConv, GATConv, SAGEConv, GINConv, ChebConv
from torch_geometric.nn import GraphConv, TransformerConv
from torch_geometric.utils import degree
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, f1_score
from ipynb.fs.full.SpatialConv import SpatialConv

### GNN option 1

In [46]:
# Convolution layer class instantiated by AGSGSAGE below. It is called as
# conv(x, edge_index, edge_weight), so a replacement must accept that call.
GNNconv = SAGEConv

In [47]:
class AGSGSAGE(torch.nn.Module):
    """Two-layer graph network built from the module-level ``GNNconv`` layer.

    Layout: conv -> ReLU -> dropout(p=0.5) -> conv. The second layer emits one
    value per class and no activation is applied to it.
    """

    def __init__(self, num_features, num_classes, hidden_channels=16):
        super().__init__()
        self.num_classes = num_classes

        # Attribute names stay conv1 / conv3 (there is no active conv2 layer)
        # so parameter names are unchanged.
        self.conv1 = GNNconv(num_features, hidden_channels)
        self.conv3 = GNNconv(hidden_channels, num_classes)

    def forward(self, batch_data):
        """Return the output of the second convolution for every node in ``batch_data``.

        ``batch_data`` must expose ``x``, ``edge_index`` and ``edge_weight``;
        the callers in this file set ``edge_weight`` to ``None`` beforehand.
        """
        x, edge_index, edge_weight = (
            batch_data.x,
            batch_data.edge_index,
            batch_data.edge_weight,
        )

        hidden = F.relu(self.conv1(x, edge_index, edge_weight))
        # Dropout only has an effect while the module is in training mode.
        hidden = F.dropout(hidden, p=0.5, training=self.training)

        return self.conv3(hidden, edge_index, edge_weight)

## GNN Training and Testing

In [48]:
from torch_geometric.loader import NeighborSampler, NeighborLoader
# from ipynb.fs.full.a1AGS_Node_Sampler_Fast import WeightedNeighborLoader

In [49]:
def test(model, loader, mask, name='Train'):
    """Compute classification accuracy of ``model`` over the batches of ``loader``.

    Args:
        model: Module called as ``model(batch_data)``; it is switched to eval mode.
        loader: Iterable of mini-batches exposing ``batch_size``, ``y`` and ``to()``.
        mask: Boolean tensor; only its number of ``True`` entries is used, as the
            progress-bar total.
        name: Label shown in the progress-bar description.

    Returns:
        The value returned by ``sklearn.metrics.accuracy_score`` over all scored
        nodes.
    """
    pbar = None
    if args.log_info:
        # Tensor.sum() reduces in one call; the builtin sum() would iterate the
        # mask element by element in Python.
        pbar = tqdm(total=int(mask.sum()))
        pbar.set_description(f'Evaluating {name}')

    model.eval()

    y_true = []
    y_pred = []

    with torch.no_grad():
        for batch_data in loader:
            batch_data = batch_data.to(device)
            batch_data.edge_weight = None
            # Only the first `batch_size` rows are scored; the remaining rows of
            # the batch are skipped (presumably sampled neighbours - see the
            # NeighborLoader documentation).
            used = batch_data.batch_size

            out = model(batch_data)[:used, :]
            pred = out.argmax(dim=1)

            y_true.append(batch_data.y[:used].detach().cpu().numpy())
            y_pred.append(pred.detach().cpu().numpy())

            if pbar is not None:
                pbar.update(used)

    if pbar is not None:
        pbar.close()

    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)

    acc = accuracy_score(y_true, y_pred)

    return acc

In [50]:
def train(DATASET_NAME, model, data, epochs=100, train_neighbors=[-1,10], test_neighbors=[-1,10]):
    """Train ``model`` with neighbour-sampled mini-batches and track accuracy.

    Args:
        DATASET_NAME: Dataset identifier. Not used by the current body; kept so
            existing callers keep working.
        model: Module called as ``model(batch_data)``.
        data: Graph object exposing ``y``, ``num_nodes`` and the boolean
            ``train_mask`` / ``val_mask`` / ``test_mask`` tensors.
        epochs: Number of training epochs.
        train_neighbors: ``num_neighbors`` passed to the training loaders.
            The default list is never mutated here.
        test_neighbors: ``num_neighbors`` passed to the validation/test loaders.

    Returns:
        ``(best_acc, num_iteration)``: the highest recorded test accuracy
        (``0`` when no evaluation ran) and the number of epochs, which always
        equals ``epochs`` because early stopping is disabled.

    Notes:
        * Evaluation runs every 10th epoch and additionally after the final
          epoch, so at least one accuracy is recorded whenever ``epochs >= 1``.
        * Train/validation accuracy is only computed when ``args.log_info`` is
          set; otherwise ``0`` is recorded in their place.
        * The optimiser uses Adam's defaults and the batch size is fixed at
          4096; ``args.learning_rate`` and ``args.batch_size`` are not read.
        * The commented import in this file suggests a WeightedNeighborLoader
          variant is an alternative to the plain NeighborLoader used below.
    """
    eval_every = 10

    if args.log_info:
        print("Train neighbors: ", train_neighbors)
        print("Test neighbors: ", test_neighbors)

    optimizer = torch.optim.Adam(model.parameters())

    # 1-D labels are treated as class indices, anything else as multi-label targets.
    if data.y.ndim == 1:
        criterion = torch.nn.CrossEntropyLoss()
    else:
        criterion = torch.nn.BCEWithLogitsLoss()

    batch_size = 4096
    # Loader worker processes are only used on large graphs.
    worker = 8 if data.num_nodes >= 100000 else 0

    if args.log_info:
        print("Worker: ", worker)

    def make_loader(mask, fanout, shuffle):
        """Build a NeighborLoader over the nodes selected by ``mask``."""
        return NeighborLoader(data, input_nodes=mask, num_neighbors=fanout,
                              batch_size=batch_size, shuffle=shuffle, num_workers=worker,
                              drop_last=False, directed=True, replace=False)

    start = time.time()
    loader = make_loader(data.train_mask, train_neighbors, True)         # shuffled, for optimisation
    train_loader = make_loader(data.train_mask, train_neighbors, False)  # ordered, for evaluation
    val_loader = make_loader(data.val_mask, test_neighbors, False)
    test_loader = make_loader(data.test_mask, test_neighbors, False)

    train_losses = []
    val_accuracies = []
    train_accuracies = []
    test_accuracies = []
    training_times = []

    num_iteration = epochs

    # Counted once with Tensor.sum(); the builtin sum() would walk the mask in
    # Python on every epoch.
    num_train = int(data.train_mask.sum()) if args.log_info else 0

    end = time.time()
    if args.log_info:
        print("Total initialization time: ", end-start)

    start = time.time()

    for epoch in range(1, epochs + 1):

        pbar = None
        if args.log_info:
            pbar = tqdm(total=num_train)
            pbar.set_description(f'Epoch {epoch:02d}')

        model.train()
        total_loss = total_examples = 0

        epoch_start = time.time()

        for batch_data in loader:
            batch_data = batch_data.to(device)
            batch_data.edge_weight = None
            # The loss is taken over the first `batch_size` rows only.
            used = batch_data.batch_size

            optimizer.zero_grad()
            out = model(batch_data)
            loss = criterion(out[:used], batch_data.y[:used])

            loss.backward()
            optimizer.step()

            total_loss += loss.item() * used
            total_examples += used

            if pbar is not None:
                pbar.update(used)

        if pbar is not None:
            pbar.close()

        training_times.append(time.time() - epoch_start)

        # Evaluate periodically and always after the last epoch.
        if epoch % eval_every != 0 and epoch != epochs:
            continue

        # max(..., 1) avoids a ZeroDivisionError when the loader yielded nothing.
        epoch_loss = total_loss / max(total_examples, 1)
        train_losses.append(epoch_loss)

        # float() accepts both Python floats and NumPy scalars, whichever
        # accuracy_score returns.
        if args.log_info:
            train_acc = float(test(model, train_loader, data.train_mask, 'Train'))
            val_acc = float(test(model, val_loader, data.val_mask, 'Validation'))
        else:
            train_acc = 0
            val_acc = 0
        train_accuracies.append(train_acc)
        val_accuracies.append(val_acc)

        test_acc = float(test(model, test_loader, data.test_mask, 'Test'))
        test_accuracies.append(test_acc)

        std_dev = np.std(train_losses[-5:])

        if args.log_info:
            print(f'Epoch: {epoch:03d}, Train Loss: {epoch_loss:.4f}, Train: {train_acc:.4f}, Val: {val_acc:.4f}, Test: {test_acc:.4f}, Std dev: {std_dev:.4f}')

            # Plot and report only here, where the metric lists are guaranteed
            # to be non-empty (max() of an empty list raises ValueError).
            save_plot([train_losses, train_accuracies, val_accuracies, test_accuracies], labels=['Loss','Train','Validation','Test'], name='Results/AGSNSVal', yname='Accuracy', xname='Epoch')

            print ("Best Validation Accuracy, ",max(val_accuracies))
            print ("Best Test Accuracy, ",max(test_accuracies))

        # Early stopping on std_dev is intentionally disabled, so
        # num_iteration stays equal to `epochs`.

    best_acc = max(test_accuracies, default=0)

    end = time.time()
    if args.log_info:
        print("Total epoch time: ", end-start)

    # Append the raw series to the runtime log; create the folder on first use.
    os.makedirs("Runtime", exist_ok=True)
    with open("Runtime/GSAGE_loader.txt",'a+') as acc_file:
        acc_file.write(str(train_losses))
        acc_file.write(str(train_accuracies))
        acc_file.write(str(val_accuracies))
        acc_file.write(str(test_accuracies))
        acc_file.write(str(training_times))
        acc_file.write(str(np.mean(training_times)))
        acc_file.write(f'\nworker {worker:1d} avg epoch runtime {np.mean(training_times):0.8f}')

    return best_acc, num_iteration

In [51]:
def AGSNSperformanceSampler(DATASET_NAME, data, dataset, num_classes, epochs=1, train_neighbors=[-1,-1], test_neighbors=[-1,-1]):        
    """Create a fresh AGSGSAGE model for ``data`` and train it.

    The model's input width is the second dimension of ``data.x`` and it uses
    256 hidden channels. ``dataset`` is accepted but not used in this function.

    Returns:
        ``(best_acc, num_iteration, model)`` where the first two values come
        straight from ``train`` and ``model`` is the trained module.
    """
    in_channels = data.x.shape[1]
    model = AGSGSAGE(in_channels, num_classes, hidden_channels=256)
    model = model.to(device)

    if args.log_info:
        print(model)

    outcome = train(
        DATASET_NAME,
        model,
        data,
        epochs,
        train_neighbors=train_neighbors,
        test_neighbors=test_neighbors,
    )
    best_acc, num_iteration = outcome

    return best_acc, num_iteration, model

In [52]:
def adj_feature(data):
    """Return the dense adjacency matrix of ``data`` as a float tensor.

    Entry ``[i, j]`` is 1 when ``data.edge_index`` contains the edge ``i -> j``
    and 0 otherwise; duplicate edges still yield 1.

    Args:
        data: Object exposing ``num_nodes`` and a ``(2, num_edges)`` integer
            tensor ``edge_index``.

    Returns:
        A ``(num_nodes, num_nodes)`` tensor on the same device as
        ``data.edge_index``. Memory grows quadratically with ``num_nodes``.
    """
    edge_index = data.edge_index
    # Allocate on the index tensor's device so the assignment below also works
    # when the graph has been moved off the CPU.
    adj_mat = torch.zeros((data.num_nodes, data.num_nodes), device=edge_index.device)
    sources, targets = edge_index[0], edge_index[1]
    adj_mat[sources, targets] = 1
    return adj_mat

# adj_feature(data)
# data.x.shape

In [53]:
# args.log_info = True

# DATASET_NAME = 'karate'
# data, dataset = get_data(DATASET_NAME,DIR=None, log=False, h_score=True, split_no=0)
# print(data)

# # (row, col) = data.edge_index
# # data.edge_index = torch.stack((torch.cat((row, col),dim=0),torch.cat((col, row),dim=0)),dim=0)
# # data.edge_index = torch_geometric.utils.coalesce(data.edge_index)
# # print(data)

# # if DATASET_NAME in ['Squirrel', 'Chameleon']:
# #     data.x = torch.cat((data.x, adj_feature(data)), dim=1)
# #     if args.log_info == True:
# #         print(data.x.shape)
    
# best_acc, num_iteration, _ =  AGSNSperformanceSampler(DATASET_NAME, data, dataset, dataset.num_classes, epochs=150, train_neighbors=[8,4], test_neighbors=[8,4])
# print(best_acc, num_iteration)

# Batch Experiments

In [59]:
def batch_experiments(num_run=1):
    """Train and evaluate on every selected dataset and log summary statistics.

    For each dataset, ``num_run`` independent runs are executed (run ``i`` uses
    ``split_no=i`` and ``random_state=i``). The mean and standard deviation of
    the best test accuracy and of the epoch count are printed and appended to
    ``Results/GSAGE_loader.txt``; the dataset name is also appended to
    ``Runtime/GSAGE_loader.txt``.

    Args:
        num_run: Number of runs per dataset.

    Side effects:
        Sets the global ``args.log_info`` to ``False`` for the rest of the session.
    """
    ALL_DATASETs = [
        "Reddit2",
        # Other dataset names used with this loop (move above to enable):
        #   "Roman-empire", "Texas", "Squirrel", "Chameleon", "Cornell", "Actor",
        #   "Wisconsin", "Flickr", "Amazon-ratings", "reed98", "amherst41",
        #   "genius", "AmazonProducts", "cornell5", "penn94", "johnshopkins55",
        #   "Yelp", "cora", "Tolokers", "Minesweeper", "CiteSeer", "Computers",
        #   "PubMed", "pubmed", "Reddit", "cora_ml", "dblp", "Cora", "CS",
        #   "Photo", "Questions", "Physics", "citeseer", "pokec", "arxiv-year",
        #   "snap-patents", "twitch-gamer", "wiki", "karate"
    ]

    args.log_info = False

    filename = "Results/GSAGE_loader.txt"
    runtime_filename = "Runtime/GSAGE_loader.txt"

    # Appending fails if the folder is missing, so create both up front.
    for path in (filename, runtime_filename):
        os.makedirs(os.path.dirname(path), exist_ok=True)

    for DATASET_NAME in ALL_DATASETs:
        print(DATASET_NAME, end=' ')

        with open(filename, 'a+') as result_file:
            result_file.write(f'{DATASET_NAME} ')

        with open(runtime_filename, 'a+') as acc_file:
            acc_file.write(f'{DATASET_NAME}\n')

        accs = []
        itrs = []

        for i in range(num_run):
            data, dataset = get_data(DATASET_NAME, DIR=None, log=False, h_score=False, split_no=i, random_state=i)

            # Multi-column labels are collapsed to a single class index.
            if data.y.ndim > 1:
                data.y = data.y.argmax(dim=1)

            # The class count is always derived from the labels themselves
            # (largest label + 1), regardless of what the dataset object reports.
            num_classes = int(data.y.max()) + 1

            # Fewer epochs on large graphs.
            max_epochs = 150 if data.num_nodes < 100000 else 50

            if DATASET_NAME in ('Squirrel', 'Chameleon'):
                # Append each node's adjacency row to its features.
                data.x = torch.cat((data.x, adj_feature(data)), dim=1)
                if args.log_info:
                    print(data.x.shape)

            accuracy, itr, _ = AGSNSperformanceSampler(DATASET_NAME, data, dataset, num_classes, epochs=max_epochs, train_neighbors=[8,4], test_neighbors=[8,4])

            accs.append(accuracy)
            itrs.append(itr)

        # Format once; the same text goes to stdout and to the results file.
        summary = f'acc {np.mean(accs):0.4f} sd {np.std(accs):0.4f} itr {int(np.mean(itrs)):d} sd {int(np.std(itrs)):d}'
        print(summary)
        with open(filename, 'a+') as result_file:
            result_file.write(summary + '\n')
                
# batch_experiments(num_run=5)

Reddit2 acc 0.8894 sd 0.0032 itr 50 sd 0


## View Learned Representation

In [55]:
# if __name__ == '__main__':    
    
#     n=7
#     x = torch.Tensor([[1,0],[1,0],[1,0],[0,1],[0,1],[0,1],[0,1]])
#     y = torch.LongTensor([0,0,0, 1, 1, 1, 1])
#     edge_index = torch.LongTensor([[1,2],[1,4],[1,5],[2,1],[3,6],[3,7],[4,5],[4,1],[4,6],[4,7],[5,1],[5,4],[5,6],[6,3],[6,4],[6,5],[6,7],[7,3],[7,4],[7,6]]).T
#     edge_index = edge_index-1
    
#     mask = torch.zeros(n, dtype=torch.bool)
#     mask[[1,3]] = True
    
#     test_data = Data(x = x, y = y, edge_index = edge_index, train_mask = mask, test_mask = mask, val_mask = mask)    
#     print(test_data)
    
    
#     None

In [56]:
# from sklearn.manifold import TSNE
# import matplotlib.pyplot as plt

In [57]:
# model.eval()
# #X = model(data.x.to(device),data.edge_index.to(device), data.weight.to(device))
# X = model(data.x.to(device),data.edge_index.to(device))
# X = X.detach().to('cpu')
# y = data.y.to('cpu')
# X.shape

In [58]:
# plt.figure(figsize=(10, 10))

# # Create a t-SNE model with 2 components and a perplexity of 30
# tsne = TSNE(n_components=2, perplexity=30, random_state=42, learning_rate='auto', init='random')

# # Fit and transform the data to the 2D t-SNE space
# X_tsne = tsne.fit_transform(X)

# # Plot the data in the 2D t-SNE space, colored by class
# plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y)
# plt.show()