# Test Dataset to check

In [14]:
import multiprocessing

# Number of CPUs reported by multiprocessing.cpu_count(). IdealWeight uses this
# value both as the number of node blocks and as the worker-pool size in its
# multiprocessing path.
NUM_PROCESSORS=multiprocessing.cpu_count()
print("Cpu count: ",NUM_PROCESSORS)

Cpu count:  32


In [15]:
# As it turned out, an interactive shell (like Jupyter) cannot handle CPU multiprocessing well,
# so we check which medium the code is running in.
# We write the code in Jupyter for understanding purposes, but the final execution will be in a shell.
from ipynb.fs.full.Utils import isnotebook
from ipynb.fs.full.Dataset import get_data

import networkx as nx
from torch_geometric.utils import to_networkx, from_networkx
import torch_geometric.utils.homophily as homophily
import copy

In [16]:
import torch
import torch.nn as nn
from torch_sparse import SparseTensor
from tqdm import tqdm
import math
import time

import random
random.seed(12345)
import numpy as np
np.random.seed(12345)

In [17]:
from joblib import Parallel, delayed
from multiprocessing.pool import ThreadPool, Pool

## Ideal Ranking

In [18]:
class IdealWeight():
    """Label-based ("ideal") edge weights for a graph.

    An edge receives weight 1 when the labels (``data.y``) of its two endpoints
    are equal and 0.01 otherwise. The result is a 1-D tensor with one entry per
    edge, ordered like the columns of ``data.edge_index``.

    Parameters
    ----------
    data : object exposing ``num_nodes``, ``num_edges``, ``edge_index`` and ``y``
        The graph to weight.
    log : bool, default True
        When true, show a tqdm progress bar (and print the pool size in the
        multiprocessing path).
    """

    # Graphs with at least this many nodes are processed with a worker pool.
    MULTIPROCESS_MIN_NODES = 10000

    def __init__(self, data, log=True):
        """Store the graph and build the edge-id adjacency used for row lookups."""
        self.N = N = data.num_nodes
        self.E = E = data.num_edges
        self.data = data
        self.log = log

        # Each stored value is the position of that edge in data.edge_index, so
        # slicing row u returns the ids of the edges whose source is u.
        self.adj = SparseTensor(
            row=data.edge_index[0], col=data.edge_index[1],
            value=torch.arange(E, device=data.edge_index.device),
            sparse_sizes=(N, N))

    def node_weight(self, u):
        """Return the weights and edge ids of the edges in row ``u`` of ``self.adj``.

        Returns
        -------
        (S_G, S_edge) : tuple of lists
            ``S_G[i]`` is 1 when the i-th neighbour has the same label as ``u``
            and 0.01 otherwise; ``S_edge[i]`` is the id of that edge.
        """
        _, col, edge_ids = self.adj[u, :].coo()
        y_true = self.data.y[u]
        y_neighbor = self.data.y[col.tolist()]

        # 1 for a label match, 0 for a mismatch; mismatches are then lifted to
        # a small positive weight instead of zero.
        gains = (y_neighbor == y_true).int().tolist()
        S_G = [g if g > 0 else 0.01 for g in gains]
        S_edge = edge_ids.tolist()

        return S_G, S_edge

    def _progress_bar(self, total):
        """Return a tqdm bar labelled 'Nodes', or None when logging is disabled."""
        if not self.log:
            return None
        pbar = tqdm(total=total)
        pbar.set_description('Nodes')
        return pbar

    def _scatter_weights(self, edge_weight, edge_index):
        """Place ``edge_weight[i]`` at position ``edge_index[i]`` of a new tensor.

        Raises AssertionError when the number of collected edges differs from
        ``self.E`` (i.e. some edge was missed or visited twice).
        """
        assert len(edge_index)==self.E

        weight = torch.zeros(len(edge_index))
        if edge_index:
            positions = torch.tensor(edge_index, dtype=torch.long)
            weight[positions] = torch.tensor(edge_weight, dtype=weight.dtype)

        return weight

    def _node_blocks(self, num_blocks):
        """Split ``range(self.N)`` into at most ``num_blocks`` contiguous, non-empty lists."""
        chunks = np.array_split(np.arange(self.N), num_blocks)
        # Empty chunks appear when there are fewer nodes than blocks; sending
        # them to a worker would be pure overhead.
        return [chunk.tolist() for chunk in chunks if chunk.size]

    def get_ideal_weight(self):
        """Compute all edge weights sequentially in the current process."""
        edge_weight = []
        edge_index = []

        pbar = self._progress_bar(self.N)
        try:
            for u in range(self.N):
                weight, e_index = self.node_weight(u)
                edge_weight.extend(weight)
                edge_index.extend(e_index)
                if pbar is not None:
                    pbar.update(1)
        finally:
            # Close the bar even when a node fails, so the output stays clean.
            if pbar is not None:
                pbar.close()

        return self._scatter_weights(edge_weight, edge_index)

    def process_block(self, list_u):
        """Compute weights for a list of nodes (the unit of work of one pool task).

        Returns ``(edge_weight, edge_index, len(list_u))``; the last item lets
        the caller advance its progress bar by the number of nodes processed.
        """
        edge_weight = []
        edge_index = []

        for u in list_u:
            weight, e_index = self.node_weight(u)
            edge_weight.extend(weight)
            edge_index.extend(e_index)

        return edge_weight, edge_index, len(list_u)

    #multiprocessing
    def get_ideal_weight_multiproces(self):
        """Compute all edge weights with a pool of ``NUM_PROCESSORS`` worker processes.

        Nodes are split into contiguous blocks, each block is handled by
        ``process_block`` in a worker, and the partial results are merged in
        whatever order they finish (the scatter by edge id makes order irrelevant).
        """
        edge_weight = []
        edge_index = []

        N = self.N
        nodes = self._node_blocks(NUM_PROCESSORS)

        pool_size = NUM_PROCESSORS
        if self.log:
            print("Pool Size: ", pool_size)

        # The context manager shuts the workers down on exit; without it every
        # call would leave pool_size processes behind.
        with Pool(pool_size) as pool:
            pbar = self._progress_bar(N)
            try:
                for (weight, e_index, num_el) in pool.imap_unordered(self.process_block, nodes):
                    edge_weight.extend(weight)
                    edge_index.extend(e_index)

                    if pbar is not None:
                        pbar.update(num_el)
            finally:
                if pbar is not None:
                    pbar.close()

        return self._scatter_weights(edge_weight, edge_index)

    def compute_weights(self):
        """Return the edge-weight tensor, choosing the strategy by graph size.

        Graphs with fewer than ``MULTIPROCESS_MIN_NODES`` nodes are processed
        sequentially; larger ones use the multiprocessing path.
        """
        if self.data.num_nodes < self.MULTIPROCESS_MIN_NODES:
            weight = self.get_ideal_weight()
        else:
            weight = self.get_ideal_weight_multiproces()

        return weight

# Main

In [19]:
if __name__ == '__main__':
    # Keep this guard: compute_weights() may start a process pool, and with the
    # 'spawn' start method worker processes re-import the main module.

    data, dataset = get_data('genius')

    ideal_weight = IdealWeight(data)

    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # elapsed time; time.time() follows the wall clock and can jump.
    start = time.perf_counter()
    data.weight = ideal_weight.compute_weights()
    elapsed = time.perf_counter() - start
    print("Execution time: ", elapsed)

    # Disabled experiment: drop edges with weight < 1 and report the homophily
    # of the remaining graph.
#     if 'weight' in data:
#         cp_data= copy.deepcopy(data)
#         G = to_networkx(cp_data, to_undirected=True, edge_attrs=['weight'])
#         to_remove = [(a,b) for a, b, attrs in G.edges(data=True) if attrs["weight"] <1.0 ]
#         G.remove_edges_from(to_remove)
#         updated_data = from_networkx(G)

#         print("Node Homophily:", homophily(updated_data.edge_index, cp_data.y, method='node'))
#         print("Edge Homophily:", homophily(updated_data.edge_index, cp_data.y, method='edge'))
#         print("Edge_insensitive Homophily:", homophily(updated_data.edge_index, cp_data.y, method='edge_insensitive'))

Data directory:  /scratch/gilbreth/das90/Dataset/
Result directory: /scratch/gilbreth/das90/Dataset/RESULTS/

Dataset: Genius(1):
Number of graphs: 1
Number of features: 12
Number of classes: 2

Data(x=[421961, 12], edge_index=[2, 984979], y=[421961], train_mask=[421961], val_mask=[421961], test_mask=[421961])
Number of nodes: 421961
Number of edges: 984979
Average node degree: 2.33
Number of training nodes: 253176
Training node label rate: 0.60
Has isolated nodes: True
Has self-loops: False
Is undirected: False
Pool Size:  32


Nodes: 100%|██████████| 421961/421961 [00:04<00:00, 91556.74it/s] 


Execution time:  4.990048170089722
