In [32]:
from ipynb.fs.full.Dataset import get_data

In [33]:
from pathlib import Path
import pandas as pd
import os
import json
import numpy as np
from tqdm import tqdm
from torch_geometric.data import Data, Dataset
from torch_geometric.transforms import NormalizeFeatures
import torch_geometric.utils.homophily as homophily
import torch_geometric.utils.subgraph as subgraph

In [34]:
# Dataset names bound to the module-level `test_datsets`; functions in this
# notebook that loop over datasets read that global at call time.
test_datsets = [
    "Cornell",
    "Texas",
    "Wisconsin",
    "reed98",
    "amherst41",
    "penn94",
    "Roman-empire",
    "cornell5",
    "Squirrel",
    "johnshopkins55",
    "AmazonProducts",
    "Actor",
    "Minesweeper",
    "Questions",
    "Chameleon",
    "Tolokers",
    "Flickr",
    "Yelp",
    "Amazon-ratings",
    "genius",
    "cora",
    "CiteSeer",
    "dblp",
    "Computers",
    "pubmed",
    "Reddit",
    "cora_ml",
    "Cora",
    "Reddit2",
    "CS",
    "Photo",
    "Physics",
    "citeseer"
]
# test_datsets = ['Texas','Cornell','Wisconsin','Squirrel','Chameleon','Cora']
# test_datsets = ['Texas']
# NOTE(review): the name below is spelled `test_datasets`, so it does NOT
# replace the `test_datsets` list above, and no code visible in this notebook
# reads `test_datasets`. Confirm which list was meant to be active.
test_datasets = ['pokec','arxiv-year','snap-patents','twitch-gamer']


# Pearson coefficient

In [35]:
import torch
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import time
import math

def pearson_coff(features, labels, sim_score='cosine'):
    """Correlate pairwise feature similarity with pairwise label agreement.

    Every unordered pair of samples (i, j) with i < j contributes one
    similarity value and one indicator that is 1 when both samples carry the
    same label and 0 otherwise. The Pearson correlation between those two
    sequences is returned.

    Args:
        features: 2-D array-like with one row per sample. When ``sim_score``
            is a precomputed matrix only ``len(features)`` is used.
        labels: one label per sample (list, NumPy array or torch tensor).
        sim_score: ``'cosine'`` for cosine similarity, ``'euclidean'`` for the
            negated Euclidean distance (so larger still means more similar),
            or a precomputed square similarity matrix indexed like
            ``features``.

    Returns:
        The correlation coefficient reported by ``scipy.stats.pearsonr``.

    Raises:
        ValueError: if ``sim_score`` is a string other than ``'cosine'`` or
            ``'euclidean'``.
    """
    # Dispatch on the type first: comparing an array against a string with
    # `==` can yield an elementwise result whose truth value is ambiguous.
    if isinstance(sim_score, str):
        if sim_score == 'cosine':
            similarity_scores = cosine_similarity(features)
        elif sim_score == 'euclidean':
            similarity_scores = -1*euclidean_distances(features)
        else:
            raise ValueError(f"unknown sim_score: {sim_score!r}")
    else:
        similarity_scores = sim_score

    # Accept torch tensors as well as array-likes for the vectorised indexing.
    if hasattr(similarity_scores, "detach"):
        similarity_scores = similarity_scores.detach().cpu().numpy()
    similarity_scores = np.asarray(similarity_scores)

    if hasattr(labels, "detach"):
        labels = labels.detach().cpu().numpy()
    label_array = np.asarray(labels).reshape(-1)

    # Strict upper triangle in row-major order: (0,1), (0,2), ..., (n-2,n-1),
    # i.e. the same pair order as a nested `for i / for j in range(i + 1, n)`.
    rows, cols = np.triu_indices(len(features), k=1)
    similarity_values = similarity_scores[rows, cols]
    label_matches = (label_array[rows] == label_array[cols]).astype(int)

    # Calculate Pearson's correlation coefficient
    correlation, _ = pearsonr(similarity_values, label_matches)

    return correlation


# test_datsets = ['Texas','Cornell','Wisconsin','Squirrel','Chameleon','Cora']
# test_datsets = ['Texas']
# NOTE(review): this binds `test_datasets`, whereas node_pearson() below
# iterates the differently spelled global `test_datsets`; this list therefore
# is not the one that loop uses. Confirm which was intended.
test_datasets = ['pokec','arxiv-year','snap-patents','twitch-gamer']


def _sample_node_indices(labels, num_nodes, max_select, balance=True):
    """Draw node indices (with replacement) for a size-capped subsample.

    Args:
        labels: tensor holding one integer class label per node.
        num_nodes: number of nodes to sample from.
        max_select: target sample size.
        balance: when True, draw ``ceil(max_select / num_class)`` indices from
            every non-empty class ``0 .. max(labels)``; otherwise draw
            ``max_select`` indices uniformly from ``range(num_nodes)``.

    Returns:
        A 1-D ``LongTensor`` of node indices. Nodes whose label lies outside
        ``0 .. max(labels)`` (e.g. negative labels) are never selected in
        balanced mode.
    """
    if not balance:
        return torch.randint(num_nodes, (max_select, ))

    flat_labels = labels.reshape(-1)[:num_nodes].cpu()
    num_class = int(torch.max(flat_labels)) + 1
    per_class = int(math.ceil(max_select / num_class))

    picked = [torch.LongTensor([])]
    for class_id in range(num_class):
        # Ascending node ids of this class, found with one vectorised mask
        # instead of a Python loop over every node.
        members = torch.nonzero(flat_labels == class_id).flatten()
        if len(members) == 0:
            continue
        draw = torch.randint(len(members), (per_class, ))
        picked.append(members[draw])
    return torch.cat(picked)


def node_pearson():
    """Print the feature/label Pearson correlation for each dataset.

    Loops over the module-level ``test_datsets`` list, loads each dataset
    with ``get_data`` and, when it has more than ``max_select`` nodes,
    subsamples nodes before calling ``pearson_coff`` with the negated
    Euclidean distance as similarity. One line per dataset is printed with
    the correlation and the time spent computing it.
    """
    max_select = 5000
    balance = True

    for dataset_name in test_datsets:
        data, dataset = get_data(dataset_name, log=False, h_score = False, split_no = 0)

        if data.num_nodes>max_select:
            indices = _sample_node_indices(data.y, data.num_nodes, max_select, balance)
            features = data.x[indices]
            labels = data.y[indices]
        else:
            features = data.x
            labels = data.y

        # Time only the correlation itself, not the printing.
        start = time.time()
        correlation = pearson_coff(features, labels, sim_score='euclidean')
        elapsed = time.time() - start

        print(dataset_name,"\t",correlation, end='\t')
        print("computation time:", elapsed)
        
# node_pearson()

In [36]:
import scipy.spatial as sp


def pearson_coff_edges(data, link_batch_size=4096, sim_score='cosine', log = True):
    """Correlate endpoint feature similarity with label agreement over edges.

    For every column of ``data.edge_index`` the similarity between the two
    endpoint feature rows of ``data.x`` is paired with 1 if the endpoints have
    equal entries in ``data.y`` and 0 otherwise; the Pearson correlation of
    the two sequences is returned.

    NOTE(review): a later cell in this notebook defines another function with
    the same name and a different signature; whichever cell ran last wins.

    Args:
        data: object exposing ``edge_index`` (2 x num_edges index tensor),
            ``x`` (node features) and ``y`` (node labels).
        link_batch_size: number of edges processed per batch.
        sim_score: ``'cosine'`` or ``'euclidean'`` (negated pairwise L2
            distance, so larger means more similar).
        log: show a tqdm progress bar over the edges.

    Returns:
        The correlation coefficient reported by ``scipy.stats.pearsonr``.

    Raises:
        NotImplementedError: if ``sim_score`` is not a supported metric.
    """
    if sim_score not in ('cosine', 'euclidean'):
        raise NotImplementedError(f"unsupported sim_score: {sim_score!r}")

    similarity_values = []
    label_matches = []

    edge_ids = torch.arange(0, data.edge_index.shape[1])
    batches = torch.split(edge_ids, link_batch_size)

    # Built once instead of once per batch.
    l2_distance = torch.nn.PairwiseDistance(p=2)

    pbar = None
    if log:
        pbar = tqdm(total=len(edge_ids))
        pbar.set_description(f'Batch')

    try:
        for batch in batches:
            idx = data.edge_index[:,batch]
            x = data.x[idx[0]]
            y = data.x[idx[1]]

            if sim_score=='cosine':
                sim = torch.cosine_similarity(x,y, dim=1)
            else:
                sim = -1*l2_distance(x, y)

            # Store plain Python floats for both metrics, not 0-dim tensors.
            similarity_values.extend(sim.reshape(-1).tolist())

            same_label = (data.y[idx[0]] == data.y[idx[1]]).reshape(-1)
            label_matches.extend(same_label.type(torch.int).tolist())

            if pbar is not None:
                pbar.update(len(batch))
    finally:
        # Close the bar even if a batch fails.
        if pbar is not None:
            pbar.close()

    correlation, _ = pearsonr(similarity_values, label_matches)

    return correlation



# test_datsets = ['karate']

# for dataset_name in test_datsets:
    
#     data, dataset = get_data(dataset_name, log=False, h_score = False, split_no = 0)
#     start = time.time()
#     print(dataset_name,"\t",pearson_coff_edges(data, link_batch_size =4096*8, sim_score='euclidean', log = False), end='\t')
#     end = time.time()
#     print("computation time:", end-start)

# Link prediction homophily vs label

In [37]:
from ipynb.fs.full.PretrainedLinkFast import train_link, args, device, DIR

In [38]:
# from ipynb.fs.full.Dataset import pearson_coff
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity

def normalize_rows(arr):
    """Min-max scale every row of a 2-D array into the range [0, 1].

    Args:
        arr: 2-D NumPy array.

    Returns:
        A new array of the same shape where each row has been shifted by its
        minimum and divided by its (max - min) range. Rows whose values are
        all equal have zero range and are returned as all zeros rather than
        NaN.
    """
    row_min = arr.min(axis=1, keepdims=True)
    row_max = arr.max(axis=1, keepdims=True)
    row_range = row_max - row_min
    # `row_range` is a fresh array, so patching it in place does not touch
    # `arr`. A zero range means the numerator is zero too; dividing by 1
    # yields 0 and avoids 0/0.
    row_range[row_range == 0] = 1
    return (arr - row_min) / row_range

In [39]:
def get_model(DATASET_NAME, log = False):
    """Load a dataset and train a link model with size-dependent settings.

    The shared ``args`` object is updated in place before training: balancing
    is switched on, and the epoch count and link batch size are chosen from
    the dataset's node count (fewer epochs and larger batches as the graph
    grows). ``num_neurons`` and ``link_num_steps`` are the same for all sizes.

    Args:
        DATASET_NAME: dataset identifier forwarded to ``get_data``.
        log: forwarded to ``get_data`` as ``h_score`` and to ``train_link``
            as ``log``.

    Returns:
        Tuple ``(data, dataset, link_model)``.
    """
    data, dataset = get_data(DATASET_NAME, log=False, h_score=log, split_no = 0)

    args.balance = True

    # (exclusive node-count ceiling, epochs, link batch size)
    size_tiers = (
        (10000, 20, 4096),
        (100000, 10, 4096*2),
        (float('inf'), 5, 4096*8),
    )
    epochs, batch_size = next(
        (tier_epochs, tier_batch)
        for ceiling, tier_epochs, tier_batch in size_tiers
        if data.num_nodes < ceiling
    )

    args.epochs = epochs
    args.num_neurons = 32
    args.link_batch_size = batch_size
    args.link_num_steps = 200

    link_model = train_link(data, selfloop = True, log = log, worker = 0)

    return data, dataset, link_model

def link_correlation(DATASET_NAME, log = False):
    """Correlate link-model scores with label agreement over node pairs.

    Trains a link model through ``get_model``, optionally subsamples the
    nodes, scores every ordered pair of selected nodes with the model and
    returns the Pearson correlation between the score of each unordered pair
    (i, j), i < j, and the indicator "i and j share a label".

    Subsampling applies when the dataset has more than 5000 nodes. With
    ``args.balance`` set, ``ceil(5000 / num_class)`` nodes are drawn with
    replacement from every non-empty class ``0 .. max(y)``; nodes with a
    label outside that range (e.g. negative) are never drawn. Otherwise 5000
    nodes are drawn uniformly with replacement.

    Args:
        DATASET_NAME: dataset identifier forwarded to ``get_model``.
        log: show a progress bar while scoring and print the score matrix and
            the resulting correlation. ``get_model`` is always called with
            ``log=False``.

    Returns:
        The correlation coefficient reported by ``scipy.stats.pearsonr``.
    """
    data, dataset, link_model = get_model(DATASET_NAME, log = False)

    max_select = 5000

    if data.num_nodes>max_select:

        if args.balance:
            node_labels = data.y.reshape(-1)[:data.num_nodes].cpu()
            num_class = int(torch.max(node_labels)) + 1
            per_class = int(math.ceil(max_select/num_class))

            sampled = [torch.LongTensor([])]
            for class_id in range(num_class):
                # Ascending node ids of this class via one vectorised mask
                # rather than a Python loop over all nodes.
                members = torch.nonzero(node_labels == class_id).flatten()
                if len(members) == 0:
                    continue
                draw = torch.randint(len(members), (per_class, ))
                sampled.append(members[draw])
            indices = torch.cat(sampled)
        else:
            indices = torch.randint(data.num_nodes, (max_select, ))

        features = data.x[indices]
        labels = data.y[indices]
    else:
        features = data.x
        labels = data.y

    feature_matrix = features.to(device)
    N = feature_matrix.shape[0]
    A = np.zeros((N,N))

    pbar = None
    if log:
        pbar = tqdm(total=N)
        pbar.set_description(f'Nodes')

    link_model.eval()
    try:
        with torch.no_grad():
            for i in range(N):
                # Score node i against every selected node in one call.
                x = feature_matrix[i].repeat(N, 1)
                sim = link_model(x, feature_matrix)
                A[i] = sim.cpu().numpy().reshape(-1)
                if pbar is not None:
                    pbar.update(1)
    finally:
        # Close the bar even if scoring fails part-way.
        if pbar is not None:
            pbar.close()

    if log:
        print(A)

    # Strict upper triangle in row-major order, i.e. the same pair order as a
    # nested `for i / for j in range(i + 1, N)` loop.
    rows, cols = np.triu_indices(N, k=1)
    similarity_values = A[rows, cols]

    label_array = labels.detach().cpu().numpy().reshape(-1)
    label_matches = (label_array[rows] == label_array[cols]).astype(int)

    # Calculate Pearson's correlation coefficient
    correlation, _ = pearsonr(similarity_values, label_matches)

    if log:
        print(correlation)

    return correlation
    
#     # Plot the correlation
#     plt.scatter(similarity_values, label_matches, alpha=0.5)
#     plt.title(f'Pearson Correlation: {correlation:.2f}')
#     plt.xlabel('Feature Similarity')
#     plt.ylabel('Label Match (1 if same, 0 if different)')
#     plt.show()

# link_correlation('Reddit', log=True)

In [40]:
# Rebinds the module-level `test_datsets`. Functions that loop over that
# global (node_pearson, compute_all, compute_edge_wise) read it at call time,
# so after this cell runs they see this list instead of the earlier one.
test_datsets = [
    "karate",
]


In [41]:
def compute_all():
    """Print the link-model correlation and its runtime for each dataset.

    Iterates the module-level ``test_datsets`` list and, for every name,
    calls ``link_correlation`` with logging disabled. One line per dataset is
    printed: the name, the correlation, and the seconds spent computing it.
    """
    for dataset_name in test_datsets:
        # Time only the computation, not the printing.
        start = time.time()
        correlation = link_correlation(dataset_name, log=False)
        elapsed = time.time() - start

        print(dataset_name,"\t",correlation, end='\t')
        print("computation time:", elapsed)
        
# compute_all()

In [42]:
def pearson_coff_edges(DATASET_NAME, link_batch_size=4096, log = True):
    """Correlate link-model scores with label agreement over existing edges.

    Trains a link model through ``get_model`` and, for every column of
    ``data.edge_index``, pairs the model's score for the two endpoint feature
    rows with 1 if the endpoints have equal entries in ``data.y`` and 0
    otherwise. The Pearson correlation of the two sequences is returned.

    NOTE(review): an earlier cell defines a function with this same name that
    takes a ``data`` object and a ``sim_score``; this definition replaces it
    once this cell has run.

    Args:
        DATASET_NAME: dataset identifier forwarded to ``get_model``.
        link_batch_size: number of edges scored per model call.
        log: forwarded to ``get_model`` and used to toggle the tqdm bar.

    Returns:
        The correlation coefficient reported by ``scipy.stats.pearsonr``.
    """
    data, dataset, link_model = get_model(DATASET_NAME, log = log)

    similarity_values = []
    label_matches = []

    edge_ids = torch.arange(0, data.edge_index.shape[1])
    batches = torch.split(edge_ids, link_batch_size)

    pbar = None
    if log:
        pbar = tqdm(total=len(edge_ids))
        pbar.set_description(f'Batch')

    link_model.eval()
    try:
        with torch.no_grad():
            for batch in batches:
                idx = data.edge_index[:,batch]
                x = data.x[idx[0]].to(device)
                y = data.x[idx[1]].to(device)

                sim = link_model(x,y)
                similarity_values.extend(sim.view(-1).cpu().tolist())

                l_sim = (data.y[idx[0]] == data.y[idx[1]]).type(torch.int).tolist()
                label_matches.extend(l_sim)

                if pbar is not None:
                    pbar.update(len(batch))
    finally:
        # Close the bar even if a batch fails.
        if pbar is not None:
            pbar.close()

    correlation, _ = pearsonr(similarity_values, label_matches)

    return correlation

# Rebinds the module-level `test_datsets` that compute_edge_wise() below
# (and the other dataset loops in this notebook) read at call time.
test_datsets = ['karate']

def compute_edge_wise():
    """Print the edge-wise link-model correlation and runtime per dataset.

    Iterates the module-level ``test_datsets`` list and calls
    ``pearson_coff_edges`` with the dataset name, a link batch size of
    ``4096*8`` and logging enabled. The dataset is not loaded here:
    ``pearson_coff_edges`` receives only the name and obtains the data
    itself, so a separate load would be discarded.
    """
    for dataset_name in test_datsets:
        # Time only the computation, not the printing.
        start = time.time()
        correlation = pearson_coff_edges(dataset_name, link_batch_size =4096*8, log = True)
        elapsed = time.time() - start

        print(dataset_name,"\t",correlation, end='\t')
        print("computation time:", elapsed)
    
# compute_edge_wise()