# Test Dataset to check

In [138]:
import random
import multiprocessing
import pandas as pd
import os
from tqdm import tqdm
import argparse
from argparse import ArgumentParser
import DeviceDir

DIR, RESULTS_DIR = DeviceDir.get_directory()
device, NUM_PROCESSORS = DeviceDir.get_device()

In [139]:
# As it turned out, interactive shells (like Jupyter) cannot handle CPU multiprocessing well, so check which medium the code is running in.
# We write the code in Jupyter for understanding purposes, but the final execution will be in a shell.
from ipynb.fs.full.Utils import isnotebook
from ipynb.fs.full.Dataset import get_data, generate_synthetic
import networkx as nx
from torch_geometric.data import Data
from torch_geometric.utils import to_networkx, from_networkx
import torch_geometric.utils.homophily as homophily
import copy
import ipynb.fs.full.utils.MoonGraph as MoonGraph
import logging
from sklearn.metrics import f1_score, accuracy_score
from torch_geometric.utils import add_self_loops
import heapq

In [140]:
import argparse
from argparse import ArgumentParser
from ipynb.fs.full.Dataset import datasets as available_datasets

#set default arguments here
def _str2bool(value):
    """Convert a command-line token into a real boolean.

    argparse's `type=bool` calls `bool()` on the raw string, so every non-empty
    value (including "False" and "0") becomes True. This parser accepts the usual
    spellings and rejects anything else with an argparse error.
    """
    if isinstance(value, bool):
        return value
    lowered = str(value).strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    if lowered in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def get_configuration():
    """Build the argument parser, parse the command line and return the settings.

    Returns:
        (args, dict_args): the parsed `argparse.Namespace` and the same values as a dict
        (`vars(args)`).
    """
    parser = ArgumentParser()
    # Boolean flags go through _str2bool so that "--log_info False" really disables logging.
    parser.add_argument('--log_info', type=_str2bool, default=True)
    parser.add_argument('--pbar', type=_str2bool, default=False)
    parser.add_argument('--num_worker', type=int, default=0)
    parser.add_argument('--dataset', type=str, default="karate", choices=available_datasets)
    parser.add_argument('--epochs', type=int, default=150)
    parser.add_argument('--batch_size', type=int, default=4096)
    parser.add_argument('--num_neurons', type=int, default=64)
    parser.add_argument('-f') ##dummy for jupyternotebook
    args = parser.parse_args()

    dict_args = vars(args)

    return args, dict_args

args, dict_args = get_configuration()

In [141]:
import torch
import torch.nn as nn
from torch_sparse import SparseTensor
from tqdm import tqdm
import math
import time
import torch.nn.functional as F

import random
random.seed(12345)
import numpy as np
np.random.seed(12345)

In [142]:
import sklearn
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from multiprocessing.pool import ThreadPool, Pool

In [143]:
from torch_geometric.nn import GCNConv, GATConv, SAGEConv, GINConv

In [144]:
# data, dataset = get_data('Cora', log=False)
# data

# adj_mat = torch.zeros((data.num_nodes,data.num_nodes))
# edges = data.edge_index.t()
# adj_mat[edges[:,0], edges[:,1]] = 1

# adj_train = adj_mat[data.train_mask].t()[data.train_mask].t()
# adj_validation = adj_mat[data.val_mask].t()[data.val_mask].t()
# adj_test = adj_mat[data.test_mask].t()[data.test_mask].t()

# print(adj_train.shape)
# print(adj_validation.shape)
# print(adj_test.shape)

# G_train=Data(edge_index=(adj_train.nonzero()).t(), x=data.x[data.train_mask], y=data.y[data.train_mask])
# G_val=Data(edge_index=(adj_validation.nonzero()).t(), x=data.x[data.val_mask], y=data.y[data.val_mask])
# G_test=Data(edge_index=(adj_test.nonzero()).t(), x=data.x[data.test_mask], y=data.y[data.test_mask])
# G_train

In [145]:
def corrupt_adj(adj_mat, task, percent=2):
    """Return a corrupted copy of a dense adjacency matrix.

    `percent` % of the upper-triangular edges are removed and (up to) the same number of
    node pairs that are not connected are switched on. Both directions of every touched
    pair are updated; `adj_mat` itself is not modified.

    Args:
        adj_mat: square 2-D tensor; non-zero entries are treated as edges.
        task: only 'link' is supported.
        percent: percentage of upper-triangular edges to corrupt.

    Returns:
        (adj_mat_corrupted, false_edges, false_non_edges) where `false_edges` lists the
        `[i, j]` pairs that were added and `false_non_edges` the `[i, j]` pairs that were removed.

    Raises:
        ValueError: if `task` is not 'link' (previously this fell through to an
        unbound-variable error at the return statement).
    """
    if task != 'link':
        raise ValueError(f"corrupt_adj only supports task='link', got {task!r}")

    edges = adj_mat.triu().nonzero()
    num_edges = edges.shape[0]
    num_to_corrupt = int(percent/100.0 * num_edges)
    adj_mat_corrupted = adj_mat.clone()
    false_edges, false_non_edges = [], []

    if num_to_corrupt == 0:
        # Nothing to corrupt (also covers an edgeless graph); no random numbers are drawn.
        return adj_mat_corrupted, false_edges, false_non_edges

    #Edge Corruption
    random_corruption = np.random.randint(num_edges, size=num_to_corrupt)
    for ed in edges[random_corruption]:
        adj_mat_corrupted[ed[0], ed[1]] = 0
        adj_mat_corrupted[ed[1], ed[0]] = 0
        false_non_edges.append(ed.tolist())

    #Non Edge Corruption
    candidates = np.random.randint(adj_mat.shape[0], size=6*num_to_corrupt).tolist()
    # Built once, as a set, from every stored entry so that a pair is rejected whichever
    # orientation it was drawn in.
    existing = set(map(tuple, adj_mat.nonzero().tolist()))
    # Consecutive draws form the candidate pairs. Plain Python ints are used for the indices,
    # so node ids above 32767 no longer overflow an int16 tensor.
    for a, b in zip(candidates[:-1], candidates[1:]):
        if (a, b) not in existing and (b, a) not in existing:
            adj_mat_corrupted[a, b] = 1
            adj_mat_corrupted[b, a] = 1
            false_edges.append([a, b])
        if len(false_edges) == num_to_corrupt:
            break
    return adj_mat_corrupted, false_edges, false_non_edges

In [146]:
def sample_equal_number_edges_non_edges(adj_mat, false_non_edges, false_edges, small_samples):
    """Draw random edge and non-edge index pairs from a dense adjacency matrix.

    Up to `small_samples` pairs are drawn with replacement from the non-zero entries of
    `adj_mat` and, independently, from the non-zero entries of `1 - adj_mat`.
    Sampled edges listed in `false_edges` are dropped and `false_non_edges` is appended to the
    edge list; the non-edge list is treated symmetrically.

    Returns:
        (final_edges, final_non_edges): two lists of `[row, col]` pairs.
    """
    positive_pairs = adj_mat.nonzero()
    negative_pairs = (1 - adj_mat).nonzero()
    n_positive = positive_pairs.shape[0]
    n_negative = negative_pairs.shape[0]

    # The edge draw comes first, then the non-edge draw, so the global NumPy RNG stream is
    # consumed in a fixed order.
    positive_draw = np.random.randint(n_positive, size=min(n_positive, small_samples))
    negative_draw = np.random.randint(n_negative, size=min(n_negative, small_samples))

    final_edges = [pair for pair in positive_pairs[positive_draw].tolist() if pair not in false_edges]
    final_edges.extend(false_non_edges)

    final_non_edges = [pair for pair in negative_pairs[negative_draw].tolist() if pair not in false_non_edges]
    final_non_edges.extend(false_edges)

    return final_edges, final_non_edges

#edges, non_edges = sample_equal_number_edges_non_edges(adj_train, false_non_edges=[], false_edges=[], small_samples=10, N=34)


def train_sample_equal_number_edges_non_edges(adj_mat, false_non_edges, false_edges, small_samples, N = None):
    """Sample training pairs: graph edges, non-edges and (optionally) identity pairs.

    When `N` is given, the pairs `(i, i)` for `i in range(N)` are added to the edge pool and
    flagged as identity pairs. If the graph has fewer than `2*N` edge entries, the edge list is
    tiled first so that it is at least that long before the identity pairs are appended.
    The same number of pairs is then drawn (with replacement) from the edge pool and from
    the non-zero entries of `1 - adj_mat`.

    Args:
        adj_mat: dense square adjacency tensor.
        false_non_edges: pairs removed from the sampled non-edges and appended to the edges.
        false_edges: pairs removed from the sampled edges and appended to the non-edges.
        small_samples: upper bound on the number of pairs drawn from each pool.
        N: number of identity pairs to add to the pool; `None` disables identity pairs.

    Returns:
        (final_edges, final_non_edges, identity_edges, edges_type_sampled):
        two lists of `[row, col]` pairs, a `(k, 2)` tensor of sampled identity pairs and a bool
        tensor aligned with the concatenation `final_edges + final_non_edges + identity_edges`
        (True marks identity pairs).
    """
    edges = adj_mat.nonzero()

    if N is not None:
        factor = 2
        # Guard against an edgeless graph: the tiling factor would divide by zero.
        if 0 < edges.shape[0] < factor*N:
            times = math.ceil(factor*N/edges.shape[0])
            edges = torch.tile(edges, (times, 1))

    edge_type = torch.zeros(edges.shape[0], dtype=torch.bool)

    if N is not None:
        diagonal = torch.arange(N, dtype=torch.long, device=edges.device).view(N, 1)
        identity_pool = torch.cat((diagonal, diagonal), dim=1)
        edges = torch.cat((edges, identity_pool), dim=0)
        edge_type = torch.cat((edge_type, torch.ones(N, dtype=torch.bool)), dim=0)

    num_edges = edges.shape[0]
    non_edges = (1 - adj_mat).nonzero()
    num_non_edges = non_edges.shape[0]

    equal_sample_size = min(num_edges, small_samples, num_non_edges)

    if equal_sample_size > 0:
        # Edge draw first, non-edge draw second: keeps the NumPy RNG stream order unchanged.
        edge_sample_index = np.random.randint(num_edges, size=equal_sample_size)
        non_edge_sample_index = np.random.randint(num_non_edges, size=equal_sample_size)
    else:
        # One of the pools is empty; skip the draw instead of asking randint for an empty range.
        edge_sample_index = non_edge_sample_index = np.empty(0, dtype=np.int64)

    edge_sample_index = torch.as_tensor(edge_sample_index, dtype=torch.long)
    non_edge_sample_index = torch.as_tensor(non_edge_sample_index, dtype=torch.long)

    edges_sampled = edges[edge_sample_index]
    is_identity = edge_type[edge_sample_index]

    identity_edges = edges_sampled[is_identity]
    edges_sampled = edges_sampled[~is_identity]
    non_edges_sampled = non_edges[non_edge_sample_index]

    final_edges = [ed for ed in edges_sampled.tolist() if ed not in false_edges]
    final_edges += false_non_edges

    final_non_edges = [n_ed for n_ed in non_edges_sampled.tolist() if n_ed not in false_non_edges]
    final_non_edges += false_edges

    # The type mask is built from the final lists so that it stays aligned even when the
    # false-pair lists filter out or add pairs.
    edges_type_sampled = torch.cat((torch.zeros(len(final_edges), dtype=torch.bool),
                                    torch.zeros(len(final_non_edges), dtype=torch.bool),
                                    torch.ones(identity_edges.shape[0], dtype=torch.bool)), dim=0)

    return final_edges, final_non_edges, identity_edges, edges_type_sampled

# edges, non_edges, identity_edges, edges_type = train_sample_equal_number_edges_non_edges(adj_train, false_non_edges=[], false_edges=[], small_samples=10, N=34)
# edges, non_edges, identity_edges, edges_type

In [147]:
class MiniBatcher(object):
    """Yield index mini-batches covering `range(n_examples)`, reshuffled on every pass."""

    def __init__(self, batch_size, n_examples, shuffle=True):
        assert batch_size <= n_examples, "Error: batch_size is larger than n_examples"
        self.batch_size = batch_size
        self.n_examples = n_examples
        self.shuffle = shuffle
        logging.info("batch_size={}, n_examples={}".format(batch_size, n_examples))
        self._reset_order()

    def _reset_order(self):
        """Rebuild the index order (shuffled via the global NumPy RNG if enabled) and rewind."""
        order = np.arange(self.n_examples)
        if self.shuffle:
            np.random.shuffle(order)
        self.idxs = order
        self.current_start = 0

    def get_one_batch(self):
        """Generator over one full pass; each item is a LongTensor of at most `batch_size` indices."""
        self._reset_order()
        while self.current_start < self.n_examples:
            begin = self.current_start
            end = begin + self.batch_size
            self.current_start = end
            yield torch.LongTensor(self.idxs[begin:end])
            
# train_batcher = MiniBatcher(2, 10)
# for i in range(3):
#     for train_idxs in train_batcher.get_one_batch():
#         print(train_idxs)
#     print("")

In [148]:
# Module-level alias for the GCNConv layer class.
# NOTE(review): nothing in the visible part of this notebook reads GNNlayer — confirm it is still needed.
GNNlayer=GCNConv

class LinkModel(nn.Module):
    """Pairwise scorer: embeds two feature vectors with a shared layer and emits one score per pair.

    Both inputs go through the same linear layer (`MLP1`), ReLU and dropout; the pair is then
    described by the concatenation of the element-wise difference and product of the two
    embeddings, which `MLP3` maps to a single unbounded score.
    """

    def __init__(self, input_rep, num_neurons=64):
        """
        Args:
            input_rep: size of each input feature vector.
            num_neurons: width of the shared embedding layer.
        """
        super(LinkModel, self).__init__()

        self.MLP1 = nn.Linear(input_rep,num_neurons)
        self.MLP3 = nn.Linear(num_neurons*2,1)

    def _embed(self, features):
        """Shared encoder applied to each side of the pair."""
        hidden = self.MLP1(features).relu()
        return F.dropout(hidden, p=0.5, training=self.training)

    def forward(self, x, y):
        """Score every pair `(x[i], y[i])`; returns a tensor of shape `(len(x), 1)`."""
        x = self._embed(x)
        y = self._embed(y)

        xy = torch.cat((x-y, x*y), dim=1)

        return self.MLP3(xy)

    def compute_loss(self,x,y, target):
        """Mean-squared error between the pair scores and `target`.

        The model emits a single score per pair, so a cross-entropy over that one-column
        output would softmax over a single class and carry no training signal.
        The loss used here is the same MSE criterion that `train` optimises.

        Args:
            x, y: feature batches for the two endpoints of every pair.
            target: one value per pair (any shape that can be viewed as `(-1, 1)`).
        """
        pred = self.forward(x,y)
        loss = F.mse_loss(pred, target.view(-1, 1).type_as(pred))

        return loss

In [149]:
# model = LinkModel(data.num_features, num_neurons=64).to(device)
# print(model)

In [150]:
# Corrupted-pair lists that train() passes to the samplers; both are empty here, so no sampled
# pair is filtered out and no extra pair is appended.
train_false_non_edges=[]
train_false_edges = []

# Mini-batch size taken from the --batch_size argument; train(), predict() and train_link()
# read this module-level name directly.
minibatch_size = args.batch_size

In [151]:
def predict(model, g_data, adj_mat, false_non_edges=None, false_edges=None, small_samples=100):
    """Evaluate `model` on sampled edges of a graph.

    Edges are sampled with `sample_equal_number_edges_non_edges`; the target of every sampled
    pair is 1.0 when both endpoints carry the same label in `g_data.y` and 0.0 otherwise.
    A pair is predicted positive when the model score is >= 0.5.

    Args:
        model: pairwise scorer called as `model(x_left, x_right)`; switched to eval mode here.
        g_data: graph object exposing `x`, `y` and `to(device)`.
        adj_mat: dense adjacency matrix the pairs are sampled from.
        false_non_edges, false_edges: optional corrupted-pair lists forwarded to the sampler.
        small_samples: upper bound on the number of sampled edges.

    Returns:
        (accuracy, micro-F1, weighted-F1); all 0.0 if no edge could be sampled.
    """
    false_non_edges = [] if false_non_edges is None else false_non_edges
    false_edges = [] if false_edges is None else false_edges

    g_data.to(device)
    model.eval()

    # The sampled non-edges are not evaluated, but the sampler is still called with the same
    # arguments so the random stream is consumed exactly as before.
    edges, _ = sample_equal_number_edges_non_edges(adj_mat, false_non_edges=false_non_edges, false_edges=false_edges, small_samples=small_samples)

    if len(edges) == 0:
        # Nothing to score: an empty list cannot be indexed as (k, 2) pairs.
        return 0.0, 0.0, 0.0

    # Pairs and targets live on `device` so they can be indexed with the device-side batch indices.
    samples = torch.LongTensor(edges).to(device)
    target = (g_data.y[samples[:,0]]==g_data.y[samples[:,1]]).type(torch.float).to(device)

    batch_size = min(len(samples), minibatch_size) if minibatch_size > 0 else len(samples)
    batcher = MiniBatcher(batch_size, len(samples))

    pred_chunks = []
    target_chunks = []

    with torch.no_grad():
        for idxs in batcher.get_one_batch():
            idxs = idxs.to(device)
            test_edges = samples[idxs]

            out = model(g_data.x[test_edges[:,0]], g_data.x[test_edges[:,1]])

            # Threshold the raw score at 0.5 to obtain a 0/1 prediction.
            pred = torch.zeros_like(out)
            pred[out >= 0.5] = 1

            pred_chunks.append(pred.view(-1).cpu().numpy())
            target_chunks.append(target[idxs].cpu().numpy())

    # One concatenation at the end instead of growing an array batch by batch.
    preds = np.concatenate(pred_chunks)
    targets = np.concatenate(target_chunks)

    micro=f1_score(targets, preds, average='micro')
    weighted=f1_score(targets, preds, average='weighted')
    acc=accuracy_score(targets, preds)

    return acc, micro, weighted

# minibatch_size = 10
# predict(model, G_train, adj_train, small_samples=10)

In [152]:
def _same_label_mask(labels, pairs):
    """Return a float mask that is 1.0 wherever both endpoints of a pair carry the same label.

    `torch.LongTensor([])` is 1-D, so an empty sample list arrives here as a 1-D tensor;
    it is passed through as an empty float tensor instead of being indexed as `(k, 2)` pairs.
    """
    if pairs.dim() == 1:
        return pairs.type(torch.float)
    return (labels[pairs[:,0]]==labels[pairs[:,1]]).type(torch.float)


def train(model, data, G_train, adj_train, selfloop = False, log = True, epochs=1, small_samples=100):
    """Train `model` to predict whether the two endpoints of a sampled pair share a label.

    Every epoch draws a fresh set of edge / non-edge pairs from `adj_train` (plus identity
    pairs `(i, i)` when `selfloop` is True), builds 0/1 targets from `G_train.y` and optimises
    an MSE loss with Adam in mini-batches of `minibatch_size`.

    Args:
        model: pairwise scorer called as `model(x_left, x_right)`.
        data: full graph; its `num_nodes` and `x` are used for the identity pairs.
        G_train: training graph exposing `x`, `y` and `to(device)`.
        adj_train: dense adjacency matrix of the training graph.
        selfloop: include identity pairs with target 1.0.
        log: print the per-epoch loss and the scores returned by `predict`.
        epochs: number of epochs.
        small_samples: upper bound on the number of pairs sampled from each pool per epoch.

    Returns:
        The trained model (same object as `model`).
    """
    if log:
        if selfloop:
            print("SelfLoop used")
        else:
            print("Selfloop is omitted for other data")

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
    criterion = nn.MSELoss()

    train_losses=[]

    G_train.to(device)

    for epoch in range(1,epochs+1):

        if selfloop:
            edges, non_edges, identity_edges, edge_types = train_sample_equal_number_edges_non_edges(adj_train,
                                                                   false_non_edges=train_false_non_edges,
                                                                   false_edges=train_false_edges,
                                                                   small_samples=small_samples, N = data.num_nodes)
            edge_types = edge_types.to(device)
            identity_edges = identity_edges.to(device)
        else:
            edges, non_edges = sample_equal_number_edges_non_edges(adj_train,
                                                                   false_non_edges=train_false_non_edges,
                                                                   false_edges=train_false_edges,
                                                                   small_samples=small_samples)

        edges = torch.LongTensor(edges).to(device)
        non_edges = torch.LongTensor(non_edges).to(device)

        edge_mask = _same_label_mask(G_train.y, edges).to(device)
        non_edge_mask = _same_label_mask(G_train.y, non_edges).to(device)

        if selfloop:
            # Order must match the type mask returned by the sampler: edges, non-edges, identity.
            samples = torch.cat((edges, non_edges, identity_edges), dim=0).to(device)
            identity_edges_mask = torch.ones(identity_edges.shape[0], dtype=torch.float).to(device)
            target = torch.cat((edge_mask, non_edge_mask, identity_edges_mask),dim=0).to(device)

            assert edge_types.shape[0] == target.shape[0]
        else:
            samples = torch.cat((edges, non_edges), dim=0).to(device)
            target = torch.cat((edge_mask, non_edge_mask),dim=0).to(device)

        train_batcher = MiniBatcher(min(len(samples),minibatch_size), len(samples)) if minibatch_size > 0 else MiniBatcher(len(samples), len(samples))

        total_loss = total_examples = 0

        model.train()
        for train_idxs in train_batcher.get_one_batch():

            train_idxs = train_idxs.to(device)
            train_edges = samples[train_idxs]
            train_target = target[train_idxs]

            if selfloop:
                is_identity = edge_types[train_idxs]
                graph_pairs = train_edges[~is_identity]
                identity_pairs = train_edges[is_identity]

                # Identity pairs index the full graph (`data.x`); the index is moved to the
                # device of `data.x` first because that tensor is not moved by this function.
                left_ids = identity_pairs[:,0].to(data.x.device)
                right_ids = identity_pairs[:,1].to(data.x.device)

                x = torch.cat((G_train.x[graph_pairs[:,0]], data.x[left_ids].to(device)),dim=0)
                y = torch.cat((G_train.x[graph_pairs[:,1]], data.x[right_ids].to(device)),dim=0)
                # Targets are reordered the same way as the features: graph pairs, then identity pairs.
                train_target = torch.cat((train_target[~is_identity], train_target[is_identity]),dim=0)
            else:
                x = G_train.x[train_edges[:,0]]
                y = G_train.x[train_edges[:,1]]

            optimizer.zero_grad()
            out = model(x,y)

            loss = criterion(out, train_target.view(-1,1))

            loss.backward()
            optimizer.step()

            total_loss += loss.item() * len(train_idxs)
            total_examples += len(train_idxs)

        # max(..., 1) avoids a ZeroDivisionError when an epoch produced no samples.
        loss=total_loss / max(total_examples, 1)
        train_losses.append(loss)

        if log:
            print(f'Epoch {epoch:03d} Loss {loss:.4f}', end=' ')
            a,b,c = predict(model, G_train, adj_train, small_samples=minibatch_size)
            print(f'\t{a:.4f},\t{b:.4f},\t{c:.4f}')


    return model

# train(model, data, G_train, adj_train, log = True, epochs=200, small_samples=4096)

In [153]:
# predict(model, G_train, adj_train, small_samples=1024)

In [154]:
# print(predict(model, G_val, adj_validation, small_samples=1024))
# print(predict(model, G_test, adj_test, small_samples=1024))

# Train link

In [155]:
def train_link(data, selfloop = False, log=True):
    """Build the train/val/test subgraphs of `data`, train a `LinkModel` on the training part and return it.

    A dense, symmetrised adjacency matrix (self loops included via `add_self_loops`) is built
    from `data.edge_index` and restricted to the nodes selected by `data.train_mask`,
    `data.val_mask` and `data.test_mask`. With `log` enabled the model and its scores on the
    three splits are printed.
    """
    edge_index, _ = add_self_loops(data.edge_index)
    sources, targets = edge_index[0], edge_index[1]

    adj_mat = torch.zeros((data.num_nodes, data.num_nodes))
    adj_mat[sources, targets] = 1
    adj_mat[targets, sources] = 1

    def split(mask):
        # Adjacency restricted to the masked nodes, plus the matching graph object.
        sub_adj = adj_mat[mask][:, mask]
        graph = Data(edge_index=sub_adj.nonzero().t(), x=data.x[mask], y=data.y[mask])
        return graph, sub_adj

    G_train, adj_train = split(data.train_mask)
    G_val, adj_validation = split(data.val_mask)
    G_test, adj_test = split(data.test_mask)

    model = LinkModel(data.num_features, num_neurons=args.num_neurons).to(device)

    if log:
        print(model)

    # Graphs with more than 100000 nodes get a fixed 5 epochs instead of the configured number.
    epochs = 5 if data.num_nodes>100000 else args.epochs

    model  = train(model, data, G_train, adj_train, selfloop, log, epochs=epochs, small_samples=minibatch_size)

    if log:
        for graph, adjacency in ((G_train, adj_train), (G_val, adj_validation), (G_test, adj_test)):
            print(predict(model, graph, adjacency, small_samples=minibatch_size))

    return model

In [156]:
# args.epochs =150
# data, dataset = get_data('karate', log=False)
# model = train_link(data, selfloop = True, log = True)

In [157]:
# # link self attention is considered for training data only, test, validation node self attention is not covered
# data, dataset = get_data('karate', log = False)
# model = train_link(data, selfloop = True, log = True)

# x = data.x
# x_col1 = x.to(device)
# x_col2 = x.to(device)
# output = model(x_col1, x_col2)

# print(output)
# # output = output.softmax(dim=1)
# # second_column = output[:,1].cpu()        
# # print(second_column) 

## KNN Ranking

In [158]:
class LinkPred():
    """Edge weights obtained by scoring every edge with a link model trained on `data`.

    The constructor trains the model through `train_link`; `compute_weights` then returns one
    weight per edge, stored at the position given by the edge's index in `data.edge_index`.
    """

    def __init__(self, data, selfloop = False, log=True):
        """
        Args:
            data: graph exposing `num_nodes`, `num_edges`, `edge_index` and `x`.
            selfloop: forwarded to `train_link`.
            log: show progress bars and let `train_link` print its progress.
        """
        self.N = N = data.num_nodes
        self.E = E = data.num_edges
        self.data = data
        self.log = log
        self.selfloop = selfloop

        # Sparse adjacency whose stored value is the position of the edge in `edge_index`,
        # so a row lookup yields both the neighbours and their edge ids.
        self.adj = SparseTensor(
            row=data.edge_index[0], col=data.edge_index[1],
            value=torch.arange(E, device=data.edge_index.device),
            sparse_sizes=(N, N))

        self.model = train_link(data, selfloop, log)
        self.model.eval()

    @staticmethod
    def _node_blocks(num_nodes, block_size=100):
        """Split `range(num_nodes)` into consecutive lists of at most `block_size` node ids."""
        return [list(range(start, min(start + block_size, num_nodes)))
                for start in range(0, num_nodes, block_size)]

    def _scatter_weights(self, edge_weight, edge_index):
        """Place every collected weight at the position of its edge id; checks every edge was seen."""
        assert len(edge_index)==self.E

        weight=torch.zeros(len(edge_index))
        if edge_index:
            weight[edge_index]=torch.Tensor(edge_weight)

        return weight

    def lazy_greedy_weight(self,u):
        """Score all edges leaving node `u`.

        Returns:
            (S_G, S_edge): the model scores and the matching edge ids, both as lists;
            two empty lists if `u` has no outgoing edge.
        """
        row, col, edge_index = self.adj[u,:].coo()

        if len(col)==0:
            return [],[]

        x = self.data.x[u].repeat(len(col),1).to(device)
        y = self.data.x[col.tolist()].to(device)

        # Pure inference: no autograd graph is needed for the scores.
        with torch.no_grad():
            outs  = self.model(x, y)
        w = outs.cpu().view(-1)

        S_G = w.tolist()
        S_edge = edge_index.tolist()

        return S_G, S_edge

    def get_submodular_weight(self):
        """Compute all edge weights sequentially, node by node."""
        if self.log:
            pbar = tqdm(total=self.N)
            pbar.set_description(f'Nodes')

        edge_weight=[]
        edge_index=[]

        for u in range(self.N):
            weight, e_index = self.lazy_greedy_weight(u)
            edge_weight.extend(weight)
            edge_index.extend(e_index)
            if self.log:
                pbar.update(1)
        if self.log:
            pbar.close()

        return self._scatter_weights(edge_weight, edge_index)

    def process_block(self, list_u):
        """Worker unit: score the edges of every node in `list_u`.

        Returns:
            (edge_weight, edge_index, len(list_u)); the last item lets the caller advance its progress bar.
        """
        edge_weight = []
        edge_index = []

        for u in list_u:
            weight, e_index = self.lazy_greedy_weight(u)
            edge_weight.extend(weight)
            edge_index.extend(e_index)

        return edge_weight, edge_index, len(list_u)

    #multiprocessing
    def get_submodular_weight_multiproces(self):
        """Compute all edge weights with a process pool, in blocks of 100 nodes."""
        edge_weight=[]
        edge_index=[]

        N = self.N

        # Works for any N, including graphs smaller than one block.
        nodes = self._node_blocks(N)

        pool_size = NUM_PROCESSORS
        print("Pool Size: ", pool_size)

        pbar = tqdm(total=N)
        pbar.set_description(f'Nodes')

        # The context manager shuts the worker processes down once all results are collected.
        with Pool(pool_size) as pool:
            for (weight, e_index, num_el) in pool.imap_unordered(self.process_block, nodes):
                edge_weight.extend(weight)
                edge_index.extend(e_index)
                pbar.update(num_el)

        pbar.close()

        return self._scatter_weights(edge_weight, edge_index)


    def compute_weights(self):
        """Pick the sequential path inside notebooks or for graphs below 1000 nodes, else the pool."""
        if isnotebook() or self.data.num_nodes<1000:
            weight = self.get_submodular_weight()
        else:
            weight = self.get_submodular_weight_multiproces()

        return weight

In [159]:
# data, dataset = get_data('texas')
# submodular_weight = LinkPred(data)
# #submodular_weight.lazy_greedy_weight(3)
# weight = submodular_weight.compute_weights()
# weight

In [160]:
# len(weight)

In [161]:
# torch.save(weight, 'Weights/moon_weight.pt')
# torch.load('Weights/moon_weight.pt')

In [162]:
class LinkNN():
    """Rank-based edge weights: each node's edges are ordered by `data.weight` and given tiered weights.

    For every node the outgoing edges are sorted by `sign * data.weight[edge]` in ascending
    order (`sign` is -1 for `value='min'`, +1 otherwise). The first quarter of the sorted edges
    gets weight 1.0, the next quarter 0.5 and the rest 0.1.
    """

    def __init__(self, data, value='min', log=True):
        """
        Args:
            data: graph exposing `num_nodes`, `num_edges`, `edge_index` and `weight`.
            value: 'min' sorts by the negated score; any other value sorts by the score itself.
            log: show a progress bar in the sequential path.
        """
        self.N = N = data.num_nodes
        self.E = E = data.num_edges
        self.data = data
        self.value = value
        self.log = log

        self.sign = 1

        if value=='min':
            self.sign = -1

        # Sparse adjacency whose stored value is the position of the edge in `edge_index`.
        self.adj = SparseTensor(
            row=data.edge_index[0], col=data.edge_index[1],
            value=torch.arange(E, device=data.edge_index.device),
            sparse_sizes=(N, N))

    @staticmethod
    def _node_blocks(num_nodes, block_size=100):
        """Split `range(num_nodes)` into consecutive lists of at most `block_size` node ids."""
        return [list(range(start, min(start + block_size, num_nodes)))
                for start in range(0, num_nodes, block_size)]

    def _scatter_weights(self, edge_weight, edge_index):
        """Place every collected weight at the position of its edge id; checks every edge was seen."""
        assert len(edge_index)==self.E

        weight=torch.zeros(len(edge_index))
        if edge_index:
            weight[edge_index]=torch.Tensor(edge_weight)

        return weight

    def lazy_greedy_weight(self,u):
        """Assign tiered weights to the edges leaving node `u`.

        Returns:
            (S_G, S_edge): the tier weights in rank order and the edge ids in the same order.
        """
        row, col, edge_index = self.adj[u,:].coo()

        # NOTE(review): assumes data.weight holds one score per edge, indexable by edge id — confirm.
        target_class_sim = self.data.weight[edge_index]
        ind = np.argsort(self.sign*target_class_sim) # ascending in sign*score, i.e. descending in score when sign is -1

        lambda1 = 0.25 # share of the ranked edges that receives weight 1.0
        lambda2 = 0.25 # share of the ranked edges that receives weight 0.5

        l1=math.ceil(len(col)*lambda1)
        l2=min(len(col)-l1,math.ceil(len(col)*lambda2))
        l3=max(0,int(len(col)-l1-l2))

        # Remaining edges (if any) get weight 0.1.
        S_G = [1.0]*l1 + [0.5]*l2 + [0.1]*l3

        S_edge = edge_index[ind].tolist()

        return S_G, S_edge

    def get_submodular_weight(self):
        """Compute all edge weights sequentially, node by node."""
        if self.log:
            pbar = tqdm(total=self.N)
            pbar.set_description(f'Nodes')

        edge_weight=[]
        edge_index=[]

        for u in range(self.N):
            weight, e_index = self.lazy_greedy_weight(u)
            edge_weight.extend(weight)
            edge_index.extend(e_index)
            if self.log:
                pbar.update(1)
        if self.log:
            pbar.close()

        return self._scatter_weights(edge_weight, edge_index)

    def process_block(self, list_u):
        """Worker unit: weight the edges of every node in `list_u`.

        Returns:
            (edge_weight, edge_index, len(list_u)); the last item lets the caller advance its progress bar.
        """
        edge_weight = []
        edge_index = []

        for u in list_u:
            weight, e_index = self.lazy_greedy_weight(u)
            edge_weight.extend(weight)
            edge_index.extend(e_index)

        return edge_weight, edge_index, len(list_u)

    #multiprocessing
    def get_submodular_weight_multiproces(self):
        """Compute all edge weights with a process pool, in blocks of 100 nodes."""
        edge_weight=[]
        edge_index=[]

        N = self.N

        # Works for any N, including graphs smaller than one block.
        nodes = self._node_blocks(N)

        pool_size = NUM_PROCESSORS
        print("Pool Size: ", pool_size)

        pbar = tqdm(total=N)
        pbar.set_description(f'Nodes')

        # The context manager shuts the worker processes down once all results are collected.
        with Pool(pool_size) as pool:
            for (weight, e_index, num_el) in pool.imap_unordered(self.process_block, nodes):
                edge_weight.extend(weight)
                edge_index.extend(e_index)
                pbar.update(num_el)

        pbar.close()

        return self._scatter_weights(edge_weight, edge_index)


    def compute_weights(self):
        """Pick the sequential path inside notebooks or for graphs below 1000 nodes, else the pool."""
        if isnotebook() or self.data.num_nodes<1000:
            weight = self.get_submodular_weight()
        else:
            weight = self.get_submodular_weight_multiproces()

        return weight

In [163]:
class LinkSub():
    """Rank-based edge weights driven by a trained link model.

    For every node ``u`` the incident edges are ordered by a lazy greedy
    procedure over the pairwise scores the link model gives to ``u`` and its
    neighbours. Edges are then weighted by their rank only: the first quarter
    gets 1.0, the next quarter 0.5, the remainder 0.1.

    Args:
        data: graph object exposing ``num_nodes``, ``num_edges``, ``x`` and
            ``edge_index``.
        value: ``'max'`` or ``'min'``; selects the sign applied to the scores
            before they enter the min-heap (author's note: -1 selects the
            nearest ones, 1 the farthest).
        selfloop: forwarded to ``train_link``.
        log: when true, print progress information and show progress bars.

    Raises:
        NotImplementedError: if ``value`` is neither ``'max'`` nor ``'min'``.
    """

    def __init__(self, data, value='max', selfloop = False, log = True):

        # Validate first so an unsupported option fails before the model is trained.
        if value == 'max':
            sign = 1 ##-1 select the nearest ones, 1 for the farthest
        elif value == 'min':
            sign = -1
        else:
            raise NotImplementedError(
                f"value must be 'max' or 'min', got {value!r}")

        self.N = N = data.num_nodes
        self.E = E = data.num_edges
        self.data = data
        self.log = log
        self.selfloop = selfloop

        self.X = data.x.to(device)

        self.model = train_link(data, selfloop = selfloop, log= log)
        self.model.eval()

        # Sparse adjacency whose stored values are edge ids 0..E-1, so a row
        # lookup yields both the neighbours and the ids of the connecting edges.
        self.adj = SparseTensor(
            row=data.edge_index[0], col=data.edge_index[1],
            value=torch.arange(E, device=data.edge_index.device),
            sparse_sizes=(N, N))

        if self.log:
            print("value: ", value)

        self.value = value
        self.sign = sign

    @staticmethod
    def _rank_weight(rank, l1, l2):
        """Map a 1-based selection rank to its weight (1.0 / 0.5 / 0.1 tiers)."""
        if rank <= l1:
            return 1.0
        if rank <= l1 + l2:
            return 0.5
        return 0.1

    def _assemble_weight(self, edge_weight, edge_index):
        """Scatter the collected per-edge weights into a dense length-E tensor."""
        assert len(edge_index)==self.E

        weight = torch.zeros(len(edge_index))
        weight[edge_index] = torch.Tensor(edge_weight)

        return weight

    def pairwise_link(self, x):
        """Score every ordered pair of rows of ``x`` with the link model.

        Returns:
            CPU tensor of shape ``(n, n)`` where entry ``[i, j]`` is the model
            output for the pair ``(x[i], x[j])``.
        """
        n, _ = x.shape

        x_col1 = x.repeat_interleave(n, dim=0)  # x[0], x[0], ..., x[1], x[1], ...
        x_col2 = x.repeat(n,1)                  # x[0], x[1], ..., x[0], x[1], ...

        # Inference only: do not build an autograd graph for every node.
        with torch.no_grad():
            output = self.model(x_col1, x_col2).cpu()

        return output.view(n,n)

    def lazy_greedy_weight(self,u):
        """Order the edges of node ``u`` by lazy greedy selection and weight them by rank.

        Heap keys are ``self.sign * score``. A popped candidate is re-scored
        against the already selected set (minimum score to any selected vertex,
        including ``u``). It is accepted if its refreshed key is still no larger
        than the next key in the heap, otherwise it is pushed back under the
        refreshed key.

        Returns:
            ``(S_G, S_edge)``: rank-tier weights and the matching edge ids, in
            selection order. Both are empty when ``u`` has no neighbours.
        """
        _, col, edge_index = self.adj[u,:].coo()
        neighbours = col.tolist()
        vertices = [u]+neighbours

        # Position of every vertex inside the local score matrix.
        v2i = {v: i for i, v in enumerate(vertices)}

        kernel_dist = self.pairwise_link(self.X[vertices])

        gain_list = [(self.sign*kernel_dist[v2i[u], v2i[v]], v, e)
                     for v, e in zip(neighbours, edge_index.tolist())]
        #-1 selecting nearest
        #1 selecting farthest
        heapq.heapify(gain_list)

        S_G = []
        S_edge = []
        S_index = [v2i[u]]

        lambda1 = 0.25 #top 25% with probability 1
        lambda2 = 0.25 #second 25% with probability 0.5
        degree = len(neighbours)
        l1 = math.ceil(degree*lambda1)
        l2 = min(degree-l1, math.ceil(degree*lambda2))

        rank = 1 #rank weight instead gain weight

        while gain_list:
            (_, v, e) = heapq.heappop(gain_list)

            # The last remaining candidate is accepted unconditionally.
            if gain_list:
                gain_v_update = self.sign*min(kernel_dist[v2i[v], S_index])
                gain_v_second = gain_list[0][0] #top

                if not gain_v_update <= gain_v_second:
                    # Re-queue under the refreshed key. It is already in heap
                    # (sign-multiplied) space, so it must not be multiplied by
                    # sign a second time.
                    heapq.heappush(gain_list, (gain_v_update, v, e))
                    continue

            S_G.append(self._rank_weight(rank, l1, l2))
            rank += 1

            S_edge.append(e)
            S_index.append(v2i[v])

        return S_G, S_edge

    #serial
    def get_submodular_weight(self):
        """Compute the edge-weight vector node by node in the current process.

        Returns:
            1-D tensor of length ``self.E``; position ``e`` holds the weight of edge id ``e``.
        """
        N = self.N

        pbar = None
        if self.log:
            pbar = tqdm(total=N)
            pbar.set_description(f'Nodes')

        edge_weight = []
        edge_index = []

        for u in range(N):
            weight, e_index = self.lazy_greedy_weight(u)
            edge_weight.extend(weight)
            edge_index.extend(e_index)

            if pbar is not None:
                pbar.update(1)

        if pbar is not None:
            pbar.close()

        return self._assemble_weight(edge_weight, edge_index)

    def process_block(self, list_u):
        """Run ``lazy_greedy_weight`` for each node in ``list_u``.

        Returns:
            ``(edge_weight, edge_index, len(list_u))`` with the per-node results concatenated.
        """
        edge_weight = []
        edge_index = []

        for u in list_u:
            weight, e_index = self.lazy_greedy_weight(u)
            edge_weight.extend(weight)
            edge_index.extend(e_index)

        return edge_weight, edge_index, len(list_u)

    #multiprocessing
    def get_submodular_weight_multiproces(self):
        """Compute the edge-weight vector using a pool of worker processes.

        Node ids are split into contiguous blocks of ``elem_size`` that are
        processed by ``process_block`` in the workers. Results are merged in
        arrival order.

        Returns:
            1-D tensor of length ``self.E``; position ``e`` holds the weight of edge id ``e``.
        """
        edge_weight = []
        edge_index = []

        N = self.N

        elem_size = 10  # number of nodes handed to a worker per task
        # Plain slicing also works when N < elem_size, where the former
        # arange(...).reshape(0, -1) chunking raised.
        nodes = [list(range(start, min(start + elem_size, N)))
                 for start in range(0, N, elem_size)]

        pool_size = NUM_PROCESSORS
        print("Pool Size: ", pool_size)
        pool = Pool(pool_size)

        pbar = None
        try:
            if self.log:
                pbar = tqdm(total=N)
                pbar.set_description(f'Nodes')

            for (weight, e_index, num_el) in pool.imap_unordered(self.process_block, nodes):
                edge_weight.extend(weight)
                edge_index.extend(e_index)

                if pbar is not None:
                    pbar.update(num_el)

            # All results consumed: let the workers exit normally.
            pool.close()
        except BaseException:
            # Do not leave workers running if anything went wrong.
            pool.terminate()
            raise
        finally:
            pool.join()
            if pbar is not None:
                pbar.close()

        return self._assemble_weight(edge_weight, edge_index)

    def compute_weights(self):
        """Return the edge weights; serial in notebooks or for graphs under 1000 nodes."""
        if isnotebook() or self.data.num_nodes<1000:
            weight = self.get_submodular_weight()
        else:
            weight = self.get_submodular_weight_multiproces()

        return weight
    
# data, dataset = get_data('karate', log=False)
# submodular_weight = LinkSub(data, selfloop = False, log = True)
# submodular_weight.lazy_greedy_weight(1)
# #data.weight = submodular_weight.compute_weights()    

In [164]:
# data, dataset = get_data('karate', log = False)
# submodular_weight = LinkSub(data, log = True)

In [165]:
# submodular_weight.lazy_greedy_weight(0)
#data.weight = submodular_weight.compute_weights()    

In [166]:
# x = data.x
# # x = torch.Tensor([[1,2],[3,4],[5,6]])
# #print(x)
# N, F = x.shape

# # x_col1 = x.to(device)
# # x_col2 = x.to(device)

# x_col1 = x.repeat_interleave(N, dim=0).to(device)
# x_col2 = x.repeat(N,1).to(device)
# # print(x_col1, x_col2)

# output = submodular_weight.model(x_col1, x_col2)
# print(output.shape)

# output = output.softmax(dim=1)
# second_column = output[:,1].cpu()        
# print(second_column)

# # similarity_matrix = second_column.view(N,N)
# # print(similarity_matrix)
# # print(similarity_matrix.shape)

# Main

In [167]:
# from ipynb.fs.full.Dataset import pearson_coff
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity

def normalize_rows(arr):
    """Min-max scale every row of a 2-D NumPy array to the range [0, 1].

    Args:
        arr: 2-D array; each row is scaled independently.

    Returns:
        Array of the same shape with ``(arr - row_min) / (row_max - row_min)``
        per row. A constant row (``row_max == row_min``) maps to all zeros
        instead of NaN.
    """
    row_min = arr.min(axis=1, keepdims=True)
    row_max = arr.max(axis=1, keepdims=True)

    # `span` is a fresh array, so patching it in place never touches the input.
    span = row_max - row_min
    # Constant rows have zero span; divide by 1 so they become 0 rather than 0/0.
    span[span == 0] = 1

    return (arr - row_min) / span

In [168]:
def link_correlation(DATASET_NAME):
    """Print the Pearson correlation between link-model scores and label agreement.

    Trains a link model on ``DATASET_NAME``, scores every ordered node pair,
    then correlates the score of each unordered pair ``(i, j)``, ``i < j``, with
    the indicator "nodes i and j carry the same label". The node count, the
    full score matrix and the correlation are printed; nothing is returned.
    """
    data, _ = get_data(DATASET_NAME, log=False, h_score=True, split_no = 0)

    # Hyper-parameters are set on the module-level ``args``.
    args.epochs = 150
    # NOTE(review): the CLI option is spelled ``num_neurons``; this assignment
    # creates a separate ``num_neuron`` attribute. Confirm which one train_link reads.
    args.num_neuron = 32

    link_model = train_link(data, selfloop = True, log = True)

    node_features = data.x.to(device)
    num_nodes = node_features.shape[0]
    score_matrix = np.zeros((num_nodes, num_nodes))

    print(num_nodes)

    progress = tqdm(total=num_nodes)
    progress.set_description(f'Nodes')

    # Row i of the matrix holds the model output for (node i, every node).
    link_model.eval()
    with torch.no_grad():
        for row in range(num_nodes):
            repeated = node_features[row].repeat(num_nodes, 1)
            scores = link_model(repeated, node_features)
            score_matrix[row] = scores.cpu().numpy().reshape(-1)
            progress.update(1)

    progress.close()

    print(score_matrix)

    labels = data.y

    pair_scores = []
    pair_same_label = []

    progress = tqdm(total=num_nodes)
    progress.set_description(f'Nodes')
    # Upper triangle only: each unordered pair contributes exactly once.
    for i in range(num_nodes):
        for j in range(i + 1, num_nodes):
            pair_scores.append(score_matrix[i, j])
            pair_same_label.append(int(labels[i] == labels[j]))
        progress.update(1)
    progress.close()

    # Pearson's correlation coefficient; the p-value is discarded.
    correlation, _ = pearsonr(pair_scores, pair_same_label)

    print(correlation)

# link_correlation('karate')

In [169]:
# Entry point: load the dataset chosen on the command line, compute edge weights
# with LinkPred and then with LinkNN, and report the total wall-clock time.
if __name__ == '__main__':  
    
    log = True
    
    # Dataset name comes from the --dataset option parsed into `args`.
    datasetname = args.dataset
    
    data, dataset = get_data(datasetname, log=log, h_score=True)
#     data = generate_synthetic(data, d=100, h=0.25, train=0.1, random_state=1, log=log)
    
    start = time.time() 
    # First pass: LinkPred weights are stored on data.weight.
    submodular_weight = LinkPred(data, selfloop = True, log = log)
    data.weight = submodular_weight.compute_weights()    
    # Second pass: LinkNN is built from the same `data` object and its result
    # overwrites data.weight.
    # NOTE(review): whether LinkNN reads the LinkPred weights set above cannot be
    # told from this cell - confirm against the LinkNN definition.
    submodular_weight = LinkNN(data, value ='min', log = log) 
    data.weight = submodular_weight.compute_weights()    
    end = time.time()
    # The reported time covers both passes.
    print("Execution time: ", end-start)


# Disabled alternative: compute the weights with LinkSub instead.
#     start = time.time()    
#     submodular_weight = LinkSub(data, value ='max', selfloop = True, log = log)    
#     data.weight = submodular_weight.compute_weights()    
#     end = time.time()
#     print("Execution time: ", end-start)
    
# Disabled experiment: drop edges with weight < 1.0 and print the homophily of
# the remaining graph.
#     if 'weight' in data:
#         cp_data= copy.deepcopy(data)
#         G = to_networkx(cp_data, to_undirected=True, edge_attrs=['weight'])
#         to_remove = [(a,b) for a, b, attrs in G.edges(data=True) if attrs["weight"] <1.0 ]
#         G.remove_edges_from(to_remove)
#         updated_data = from_networkx(G)
        
#         print(updated_data)
        
#         updated_data = from_networkx(G, group_edge_attrs=['weight'])
#         updated_data.weight = updated_data.edge_attr.view(-1)
        
#         row, col = updated_data.edge_index
#         updated_data.edge_index = torch.stack((torch.cat((row, col),dim=0), torch.cat((col, row),dim=0)),dim=0)
#         updated_data.weight = torch.cat((updated_data.weight, updated_data.weight),dim=0)
        

#         print("Node Homophily:", homophily(updated_data.edge_index, cp_data.y, method='node'))
#         print("Edge Homophily:", homophily(updated_data.edge_index, cp_data.y, method='edge'))
#         print("Edge_insensitive Homophily:", homophily(updated_data.edge_index, cp_data.y, method='edge_insensitive'))    
        
    
    # No-op expression statement closing the block.
    None

Data directory:  /scratch/gilbreth/das90/Dataset/
Result directory: /scratch/gilbreth/das90/Dataset/RESULTS/

Dataset: KarateClub():
Number of graphs: 1
Number of features: 34
Number of classes: 4

Data(x=[34, 34], edge_index=[2, 156], y=[34], train_mask=[34], val_mask=[34], test_mask=[34])
Number of nodes: 34
Number of edges: 156
Average node degree: 4.59
Number of training nodes: 4
Training node label rate: 0.12
Has isolated nodes: False
Has self-loops: False
Is undirected: True
N  34  E  156  d  4.588235294117647 0.8020520210266113 0.7564102411270142 0.6170591711997986 -0.4756128787994385 LinkModel(
  (MLP1): Linear(in_features=34, out_features=64, bias=True)
  (MLP3): Linear(in_features=128, out_features=1, bias=True)
)
SelfLoop used
Epoch 001 Loss 0.2928 	0.6250,	0.6250,	0.4808
Epoch 002 Loss 0.3933 	0.6250,	0.6250,	0.4808
Epoch 003 Loss 0.2750 	0.6250,	0.6250,	0.4808
Epoch 004 Loss 0.3122 	0.5000,	0.5000,	0.3333
Epoch 005 Loss 0.3192 	0.5000,	0.5000,	0.3333
Epoch 006 Loss 0.3582 

Nodes: 100%|██████████| 34/34 [00:00<00:00, 2536.85it/s]
Nodes: 100%|██████████| 34/34 [00:00<00:00, 4290.46it/s]

Execution time:  0.6385443210601807





In [170]:
#torch.save(weight, 'Weights/link_weight.pt')
#torch.load('Weights/link_weight.pt')

In [171]:
# data, dataset = get_data('Cora', log=False)
# #     data = generate_synthetic(data, d=5, h=0.25, train=0.1, random_state=None, log=log)
# submodular_weight = LinkSub(data, value ='min', selfloop = True, log = True)        

In [172]:
# submodular_weight.lazy_greedy_weight(0)