In [1]:
import os
import sys
# When the working directory's path does not end in "Submodular", add the sibling
# '../Submodular' directory to the import search path. The path is relative, so it
# is resolved against the current working directory at import time.
if not os.getcwd().endswith("Submodular"):
    sys.path.append('../Submodular')    

In [2]:
import DeviceDir

# DeviceDir is a project-local module that is not shown in this notebook.
# NOTE(review): presumably DIR / RESULTS_DIR are data and output locations and
# `device` is the torch device that models and batches are moved to below -- confirm
# against DeviceDir.
DIR, RESULTS_DIR = DeviceDir.get_directory()
device, NUM_PROCESSORS = DeviceDir.get_device()

# Dataset

In [3]:
from ipynb.fs.full.Dataset import get_data
from ipynb.fs.full.Dataset import datasets as available_datasets
from ipynb.fs.full.Utils import save_plot

In [4]:
import argparse
from argparse import ArgumentParser

#set default arguments here
def _str2bool(value):
    """Convert a command-line token into a real boolean.

    argparse's ``type=bool`` is a trap: it calls ``bool("False")``, and every
    non-empty string is truthy, so such a flag can never be switched off from the
    command line. This parser accepts the usual spellings instead.

    Raises:
        argparse.ArgumentTypeError: if ``value`` is not a recognised boolean spelling.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    if token in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')


def get_configuration():
    """Parse the experiment settings from the command line.

    The defaults are defined here, so calling this with no arguments yields the
    default configuration.

    Returns:
        tuple: ``(args, dict_args)`` where ``args`` is the parsed
        ``argparse.Namespace`` and ``dict_args`` is ``vars(args)``, i.e. a dict
        view of the same namespace.
    """
    parser = ArgumentParser()
    parser.add_argument('--epochs', type=int, default=1)
    # Boolean options go through _str2bool so that "--log_info False" really is False.
    parser.add_argument('--log_info', type=_str2bool, default=True)
    parser.add_argument('--pbar', type=_str2bool, default=False)
    parser.add_argument('--batch_size', type=int, default=2048)
    parser.add_argument('--learning_rate', type=float, default=0.01)
    parser.add_argument('--num_gpus', type=int, default=-1)
    parser.add_argument('--parallel_mode', type=str, default="dp", choices=['dp', 'ddp', 'ddp2'])
    parser.add_argument('--dataset', type=str, default="Cora", choices=available_datasets)
    # store_false: normalization is on by default and passing the flag turns it OFF.
    parser.add_argument('--use_normalization', action='store_false', default=True)
    # Dummy option: absorbs the "-f <file>" argument present when run inside a Jupyter notebook.
    parser.add_argument('-f')
    args = parser.parse_args()

    dict_args = vars(args)

    return args, dict_args

args, dict_args = get_configuration()

In [5]:
import torch.nn as nn
import numpy as np
from torch.nn import init
from random import shuffle, randint
import torch.nn.functional as F
from itertools import combinations, combinations_with_replacement
from sklearn.metrics import f1_score, accuracy_score
from sklearn.decomposition import TruncatedSVD
#from torch_geometric.data import NeighborSampler
import matplotlib.pyplot as plt
import sys
from torch_geometric.data import Data
import logging
import time
import copy
from tqdm import tqdm
from torch_geometric.loader import NeighborSampler, NeighborLoader

In [6]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [7]:
import random
import numpy as np
import torch

# Fixed seed shared by every random number generator used in this notebook.
seed = 123

# Seed Python's `random`, NumPy's legacy global RNG and PyTorch.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
# Trailing None keeps the cell from displaying the value returned by torch.manual_seed.
None

## GCNConv, GATConv, GINConv, and SAGEConv models

In [8]:
from torch_geometric.nn import GCNConv, GATConv, GINConv, SAGEConv
#GNNconv = GCNConv
class GCN(torch.nn.Module):
    """Two-layer graph network with a pluggable convolution operator.

    Despite the class name, the operator used for both layers is chosen through
    ``GNNconv``. The forward pass is conv -> ReLU -> dropout(p=0.5) -> conv and
    returns the raw output of the second convolution.

    Args:
        num_features: Size of each input node feature vector.
        num_classes: Output width of the second layer.
        hidden_channels: Width of the hidden layer.
        GNNconv: Convolution class used to build both layers.
    """

    def __init__(self, num_features,num_classes, hidden_channels=16, GNNconv = GCNConv):
        super().__init__()
        if GNNconv is GINConv:
            # GINConv is built around a user-supplied network rather than channel sizes.
            self.MLP1 = nn.Linear(num_features, hidden_channels)
            self.MLP2 = nn.Linear(hidden_channels, num_classes)
            self.conv1 = GNNconv(self.MLP1)
            self.conv2 = GNNconv(self.MLP2)
        else:
            self.conv1 = GNNconv(num_features, hidden_channels)
            self.conv2 = GNNconv(hidden_channels, num_classes)

        # The third positional argument of GINConv.forward / SAGEConv.forward is
        # `size`, not an edge weight, so a weight tensor must never be passed to them.
        self._takes_edge_weight = GNNconv not in (GINConv, SAGEConv)

    def _apply_conv(self, conv, x, edge_index, edge_weight):
        """Run one convolution, passing ``edge_weight`` only where it is accepted."""
        if self._takes_edge_weight:
            return conv(x, edge_index, edge_weight)
        return conv(x, edge_index)

    def forward(self, x, edge_index, edge_weight=None):
        """Compute per-node outputs.

        Args:
            x: Node feature matrix.
            edge_index: Graph connectivity.
            edge_weight: Optional edge weights. Ignored when the model was built
                with GINConv or SAGEConv, which have no edge-weight input.

        Returns:
            The output of the second convolution layer.
        """
        x = self._apply_conv(self.conv1, x, edge_index, edge_weight)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training)
        x = self._apply_conv(self.conv2, x, edge_index, edge_weight)
        return x
    


class GAT(torch.nn.Module):
    """Two-layer graph attention network.

    The first layer uses ``heads`` attention heads, so the second layer takes
    ``hidden_channels * heads`` input channels and uses a single head. Dropout
    with p=0.6 is applied to the input of each layer.

    Args:
        num_features: Size of each input node feature vector.
        num_classes: Output width of the second layer.
        hidden_channels: Per-head width of the first layer.
        heads: Number of attention heads in the first layer.
    """

    def __init__(self, num_features, num_classes, hidden_channels, heads):
        super().__init__()
        # TODO (carried over from the original): edge_dim=1 on both layers is still unreviewed.
        self.conv1 = GATConv(
            num_features,
            hidden_channels,
            heads,
            edge_dim=1,
        )
        second_layer_inputs = hidden_channels * heads
        self.conv2 = GATConv(
            second_layer_inputs,
            num_classes,
            heads=1,
            concat=True,
            edge_dim=1,
        )

    def forward(self, x, edge_index, edge_attr=None):
        """Return the second attention layer's output for every node."""
        dropped_input = F.dropout(x, p=0.6, training=self.training)
        hidden = F.elu(self.conv1(dropped_input, edge_index, edge_attr))
        dropped_hidden = F.dropout(hidden, p=0.6, training=self.training)
        return self.conv2(dropped_hidden, edge_index, edge_attr)

In [9]:
def test(model, loader, mask, name='Train'):
    
    if args.log_info:
        pbar = tqdm(total=sum(mask).item())
        pbar.set_description(f'Evaluating {name}')

    model.eval()
    
    total_correct=0
    total_examples=0
    
    with torch.no_grad():                  
    
        for i,batch_data in enumerate(loader):
            batch_data = batch_data.to(device)
            out = model(batch_data.x, batch_data.edge_index,batch_data.edge_weight)
            out=out[:batch_data.batch_size,:]
            pred = out.argmax(dim=-1)            
            correct = pred.eq(batch_data.y[:batch_data.batch_size].to(device))

            total_correct+=correct.sum()
            total_examples+=batch_data.batch_size
            
            if args.log_info:
                pbar.update(batch_data.batch_size)
    
    if args.log_info:
        pbar.close()

    return total_correct.item()/total_examples

In [10]:
def train(model, data, epochs=100, train_neighbors=[-1,10], test_neighbors=[-1,-1]):
    """Train ``model`` on neighbor-sampled mini-batches and track accuracy per epoch.

    Uses Adam (lr=0.01, weight_decay=5e-4) with cross-entropy loss and a fixed
    batch size of 1024. After every epoch the validation and test splits are
    evaluated with ``test()``. Training stops early once at least five epochs
    have run and the standard deviation of the last five epoch losses is <= 1e-3.

    Args:
        model: Module called as ``model(x, edge_index, edge_weight)``.
        data: Graph object providing ``train_mask``, ``val_mask`` and ``test_mask``.
        epochs: Maximum number of epochs.
        train_neighbors: ``num_neighbors`` setting for the training loader.
        test_neighbors: ``num_neighbors`` setting for the validation/test loaders.

    Returns:
        tuple: ``(best_acc, num_iteration)`` -- the highest test accuracy seen in
        any epoch, and the epoch at which early stopping fired (``epochs`` if it
        never did).
    """
    if args.log_info:
        print("Train neighbors: ", train_neighbors)
        print("Test neighbors: ", test_neighbors)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
    criterion = torch.nn.CrossEntropyLoss()

    batch_size=1024
    loader = NeighborLoader(data, input_nodes=data.train_mask,num_neighbors=train_neighbors,
                            batch_size=batch_size, shuffle=True, num_workers=0)
    val_loader = NeighborLoader(data,input_nodes=data.val_mask, num_neighbors=test_neighbors,
                                batch_size=batch_size,shuffle=False, num_workers=0)
    test_loader = NeighborLoader(data, input_nodes=data.test_mask,num_neighbors=test_neighbors,
                                 batch_size=batch_size,shuffle=False, num_workers=0)

    # Loop-invariant; tensor reduction instead of the element-by-element built-in sum().
    num_train_nodes = int(data.train_mask.sum()) if args.log_info else None

    best_acc=0
    num_iteration = epochs
    train_losses = []
    for epoch in range(1,epochs+1):

        if args.log_info:
            pbar = tqdm(total=num_train_nodes)
            pbar.set_description(f'Epoch {epoch:02d}')

        model.train()
        total_loss = total_examples = 0

        for batch_data in loader:
            batch_data = batch_data.to(device)
            optimizer.zero_grad()
            # Use the sampled sub-graph's own edge weights (as test() does); the
            # full-graph data.edge_weight does not line up with batch_data.edge_index.
            out = model(batch_data.x, batch_data.edge_index, batch_data.edge_weight)

            # The loss covers every training node present in the sampled sub-graph.
            batch_train_mask = batch_data.train_mask
            loss = criterion(out[batch_train_mask], batch_data.y[batch_train_mask])

            loss.backward()
            optimizer.step()

            # Weight each batch loss by its number of contributing nodes.
            num_loss_nodes = int(batch_train_mask.sum())
            total_loss += loss.item() * num_loss_nodes
            total_examples += num_loss_nodes

            if args.log_info:
                # Advance by the real number of seed nodes; the last (or only)
                # batch is usually smaller than the configured batch size.
                pbar.update(batch_data.batch_size)

        if args.log_info:
            pbar.close()

        loss=total_loss / total_examples
        train_losses.append(loss)

        # Training accuracy is not evaluated; 0 is a placeholder for the log line.
        train_acc=0
        val_acc = test(model, val_loader,data.val_mask,'Validation')
        test_acc = test(model, test_loader,data.test_mask,'Test')

        # NOTE(review): the best score is picked directly on the test split rather
        # than at the best validation epoch -- confirm this is the intended protocol.
        if test_acc>best_acc:
            best_acc=test_acc

        std_dev = np.std(train_losses[-5:])

        if args.log_info:
            print(f'Epoch: {epoch:03d}, Train Loss: {loss:.4f}, Train: {train_acc:.4f}, Val: {val_acc:.4f}, Test: {test_acc:.4f}, Std dev: {std_dev:.4f}')

        # Convergence criterion: the loss has been flat over the last five epochs.
        if epoch>=5 and std_dev<=1e-3:
            num_iteration = epoch

            if args.log_info:
                print("Iteration for convergence: ", epoch)
            break

    return best_acc, num_iteration


In [11]:
def _train_on_device(model, data, epochs, train_neighbors, test_neighbors):
    """Move ``model`` to the device, optionally print it, train it and return ``(best_acc, num_iteration, model)``."""
    model = model.to(device)
    if args.log_info:
        print(model)

    best_acc, num_iteration = train(model, data, epochs, train_neighbors=train_neighbors, test_neighbors=test_neighbors)
    return best_acc, num_iteration, model


def GCNperformanceSampler(data, dataset, num_classes, epochs=1, train_neighbors=[-1,-1], test_neighbors=[-1,-1]):
    """Train a 256-hidden-channel GCNConv model.

    Args:
        data: Graph passed on to ``train``.
        dataset: Source of ``num_features``.
        num_classes: Output width of the model.
        epochs: Maximum number of training epochs.
        train_neighbors: Neighbor sampling sizes for training.
        test_neighbors: Neighbor sampling sizes for validation/test.

    Returns:
        tuple: ``(best_acc, num_iteration, model)``.
    """
    model = GCN(dataset.num_features, num_classes, hidden_channels=256)
    return _train_on_device(model, data, epochs, train_neighbors, test_neighbors)


def GATperformanceSampler(data, dataset, num_classes, epochs=1, train_neighbors=[-1,1], test_neighbors=[-1,-1]):
    """Train a GAT model (64 hidden channels, 4 heads); same arguments and return value as ``GCNperformanceSampler``."""
    model = GAT(dataset.num_features, num_classes, hidden_channels=64, heads=4)
    return _train_on_device(model, data, epochs, train_neighbors, test_neighbors)


def GINperformanceSampler(data, dataset, num_classes, epochs=1, train_neighbors=[-1,1], test_neighbors=[-1,-1]):
    """Train a 256-hidden-channel GINConv model; same arguments and return value as ``GCNperformanceSampler``."""
    model = GCN(dataset.num_features, num_classes, hidden_channels=256, GNNconv = GINConv)
    return _train_on_device(model, data, epochs, train_neighbors, test_neighbors)


def GSAGEperformanceSampler(data, dataset, num_classes, epochs=1, train_neighbors=[-1,1], test_neighbors=[-1,-1]):
    """Train a 256-hidden-channel SAGEConv model; same arguments and return value as ``GCNperformanceSampler``."""
    model = GCN(dataset.num_features, num_classes, hidden_channels=256, GNNconv = SAGEConv)
    return _train_on_device(model, data, epochs, train_neighbors, test_neighbors)

In [12]:
# Quick end-to-end check on the small 'karate' graph with verbose logging enabled.
args.log_info = True
DATASET_NAME = 'karate'

data, dataset = get_data(
    DATASET_NAME,
    DIR=None,
    log=False,
    h_score=True,
    split_no=0,
)
print("")
print(data)

# To try another architecture, swap in GATperformanceSampler, GINperformanceSampler
# or GSAGEperformanceSampler -- they take the same arguments.
best_acc, num_iteration, _ = GCNperformanceSampler(
    data,
    dataset,
    dataset.num_classes,
    epochs=5,
    train_neighbors=[25, 10],
)
print(best_acc, num_iteration)

N  34  E  156  d  4.588235294117647 0.8020520210266113 0.7564102411270142 0.6170591711997986 -0.4756128787994385 
Data(x=[34, 34], edge_index=[2, 156], y=[34], train_mask=[34], val_mask=[34], test_mask=[34])
GCN(
  (conv1): GCNConv(34, 256)
  (conv2): GCNConv(256, 4)
)
Train neighbors:  [25, 10]
Test neighbors:  [-1, -1]


Epoch 01: : 1024it [00:00, 50476.18it/s]       
Evaluating Validation: 100%|██████████| 30/30 [00:00<00:00, 13641.49it/s]
Evaluating Test: 100%|██████████| 30/30 [00:00<00:00, 15925.72it/s]


Epoch: 001, Train Loss: 1.3774, Train: 0.0000, Val: 0.3667, Test: 0.3667, Std dev: 0.0000


Epoch 02: : 1024it [00:00, 280588.44it/s]      
Evaluating Validation: 100%|██████████| 30/30 [00:00<00:00, 15773.99it/s]
Evaluating Test: 100%|██████████| 30/30 [00:00<00:00, 15927.74it/s]


Epoch: 002, Train Loss: 1.2987, Train: 0.0000, Val: 0.3667, Test: 0.3667, Std dev: 0.0394


Epoch 03: : 1024it [00:00, 301084.28it/s]      
Evaluating Validation: 100%|██████████| 30/30 [00:00<00:00, 14768.68it/s]
Evaluating Test: 100%|██████████| 30/30 [00:00<00:00, 15465.72it/s]


Epoch: 003, Train Loss: 1.2437, Train: 0.0000, Val: 0.4333, Test: 0.4333, Std dev: 0.0549


Epoch 04: : 1024it [00:00, 265514.79it/s]      
Evaluating Validation: 100%|██████████| 30/30 [00:00<00:00, 15006.45it/s]
Evaluating Test: 100%|██████████| 30/30 [00:00<00:00, 15263.11it/s]


Epoch: 004, Train Loss: 1.1519, Train: 0.0000, Val: 0.4333, Test: 0.4333, Std dev: 0.0821


Epoch 05: : 1024it [00:00, 279893.60it/s]      
Evaluating Validation: 100%|██████████| 30/30 [00:00<00:00, 14910.43it/s]
Evaluating Test: 100%|██████████| 30/30 [00:00<00:00, 1479.98it/s]

Epoch: 005, Train Loss: 1.0792, Train: 0.0000, Val: 0.5333, Test: 0.5333, Std dev: 0.1053
0.5333333333333333 5





In [13]:
def batch_experiments(method_name = 'GCN', num_run=1):
    """Run one architecture over a fixed list of datasets and log summary statistics.

    For every dataset, ``num_run`` splits are trained; the mean and standard
    deviation of the returned accuracies and iteration counts are printed and
    appended as one line to ``Results/<method_name>.txt``. A dataset with more than
    100000 nodes is not trained; it is recorded with accuracy and iterations of -1.

    Side effect: sets the global ``args.log_info`` to ``False`` and leaves it so.

    Args:
        method_name: One of ``'GCN'``, ``'GAT'``, ``'GIN'``, ``'GSAGE'``.
        num_run: Number of splits (``split_no`` 0 .. num_run-1) per dataset.

    Raises:
        ValueError: if ``method_name`` is unknown or ``num_run`` is smaller than 1.
    """
    samplers = {
        'GCN': GCNperformanceSampler,
        'GAT': GATperformanceSampler,
        'GIN': GINperformanceSampler,
        'GSAGE': GSAGEperformanceSampler,
    }
    # Fail before any dataset is loaded instead of hitting a NameError mid-run.
    if method_name not in samplers:
        raise ValueError(f'unknown method_name {method_name!r}; expected one of {sorted(samplers)}')
    if num_run < 1:
        raise ValueError(f'num_run must be at least 1, got {num_run}')
    run_sampler = samplers[method_name]

    ALL_DATASETs= [
        "Roman-empire","Texas","Squirrel","Chameleon",
        "Cornell","Actor","Wisconsin","Flickr","Amazon-ratings","reed98","amherst41","genius",
        #"AmazonProducts",
        "cornell5","penn94","johnshopkins55",
#         "Yelp",
#         "cora","Tolokers","Minesweeper",
#         "CiteSeer","Computers","PubMed","pubmed",
#         #"Reddit",
#         "cora_ml","dblp",
#         #"Reddit2",
#         "Cora","CS","Photo","Questions","Physics","citeseer",
#         "Reddit", #remove this later
#         "Reddit2",#remove this later
#         "Yelp", #remove this later
#         "AmazonProducts",#remove this later
    ]

    #ALL_DATASETs= ["karate"]

    # Silence the per-epoch logging and progress bars during batch runs.
    args.log_info = False

    # The output directory may not exist on a fresh checkout.
    os.makedirs("Results", exist_ok=True)
    result_path = "Results/"+method_name+".txt"

    for DATASET_NAME in ALL_DATASETs:
        print(DATASET_NAME, end=' ')

        accs = []
        itrs = []

        for i in range(num_run):
            data, dataset = get_data(DATASET_NAME, DIR=None, log=False, h_score=False, split_no=i)

            # Graphs above the size limit are skipped and marked with -1.
            if data.num_nodes>100000:
                accs.append(-1)
                itrs.append(-1)
                break

            # Labels with more than one dimension are collapsed to class indices.
            if len(data.y.shape) > 1:
                data.y = data.y.argmax(dim=1)

            # The class count always follows the largest label actually present
            # (the original branches reduced to exactly this value).
            num_classes = torch.max(data.y).item()+1

            # The size check above already excludes graphs above 100000 nodes, so
            # the 50-epoch branch is only reached for exactly 100000 nodes.
            if data.num_nodes<100000:
                max_epochs = 150
            else:
                max_epochs = 50

            accuracy, itr, _ = run_sampler(data, dataset, num_classes, epochs=max_epochs,
                                           train_neighbors=[8,4], test_neighbors=[8,4])

            accs.append(accuracy)
            itrs.append(itr)

        print(f'acc {np.mean(accs):0.4f} sd {np.std(accs):0.4f}, itr {int(np.mean(itrs)):d} sd {int(np.std(itrs)):d}')

        # Write the complete line in one go so a failed run leaves no partial entry,
        # and let the context manager close the file even if the write raises.
        with open(result_path, 'a+') as result_file:
            result_file.write(f'{DATASET_NAME} ')
            result_file.write(f'acc {np.mean(accs):0.4f} sd {np.std(accs):0.4f} itr {int(np.mean(itrs)):d} sd {int(np.std(itrs)):d}\n')
                
# batch_experiments(method_name = 'GCN', num_run=5)

In [14]:
# for method_name in ['GAT','GIN']:
#     batch_experiments(method_name = method_name, num_run=5)