In [2]:
import os
import sys
# Name of the environment directory that hosts the running interpreter
# (the executable path with every "/bin/python" occurrence stripped).
kernel_name = os.path.basename(sys.executable.replace("/bin/python", ""))

if kernel_name == 'py38cu11':
    # This environment needs the CUDA 11.2 sparse/BLAS shared libraries loaded
    # explicitly before the GPU-backed packages are imported.
    import ctypes

    ctypes.cdll.LoadLibrary(
        "/apps/gilbreth/cuda-toolkit/cuda-11.2.0/lib64/libcusparse.so.11"
    )
    ctypes.cdll.LoadLibrary(
        "/apps/gilbreth/cuda-toolkit/cuda-11.2.0/lib64/libcublas.so.11"
    )

In [3]:
from pathlib import Path
import pandas as pd
import os
import json
import numpy as np
from tqdm import tqdm

from torch_geometric.data import Data, Dataset
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures
from torch_geometric.datasets import Reddit, Reddit2, Flickr, Yelp, AmazonProducts, PPI,  OGB_MAG,  FakeDataset, Amazon, Coauthor

#import ipynb.fs.full.utils.MoonGraph as MoonGraph
# import utils.MoonGraph as MoonGraph
import torch_geometric.utils.homophily as homophily
import torch_geometric.utils.subgraph as subgraph

#import torch_geometric.utils.assortativity as assortativity #pytorch geometric's latest version has this

In [4]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from scipy.stats import norm, gamma, uniform, expon

In [5]:
import torch
import torch.nn.functional as F
from torch import Tensor
from torch_geometric.typing import Adj, SparseTensor
from torch_geometric.utils import coalesce, degree
from torch_geometric.utils.to_dense_adj import to_dense_adj


def assortativity(edge_index: Adj) -> float:
    """Return the degree assortativity coefficient of a graph.

    Args:
        edge_index: Either a ``SparseTensor`` adjacency or a tensor that
            unpacks into a (source, target) pair of index vectors.

    Returns:
        The Pearson correlation between the degrees found at the two ends of
        each edge, as a Python float.
    """
    if isinstance(edge_index, SparseTensor):
        src, dst, _ = edge_index.coo()
    else:
        assert isinstance(edge_index, Tensor)
        src, dst = edge_index

    dev = src.device
    deg_out = degree(src, dtype=torch.long)
    deg_in = degree(dst, dtype=torch.long)

    # Map every distinct degree value onto a contiguous bucket id.
    uniq_deg = torch.unique(torch.cat([deg_out, deg_in]))
    n_buckets = uniq_deg.size(0)
    bucket_of = src.new_zeros(uniq_deg.max().item() + 1)
    bucket_of[uniq_deg] = torch.arange(n_buckets, device=dev)

    # Degree mixing matrix: joint distribution over (source bucket, target bucket).
    bucket_pairs = torch.stack(
        [bucket_of[deg_out[src]], bucket_of[deg_in[dst]]], dim=0)
    weights = torch.ones(bucket_pairs.size(1), device=dev)
    bucket_pairs, weights = coalesce(bucket_pairs, weights)
    mixing = to_dense_adj(
        bucket_pairs, edge_attr=weights, max_num_nodes=n_buckets)[0]
    mixing /= mixing.sum()

    # Pearson correlation of the degree values under the mixing distribution.
    deg_vals = uniq_deg.float()
    marg_a = mixing.sum(0)
    marg_b = mixing.sum(1)

    var_a = (marg_a * deg_vals**2).sum() - ((marg_a * deg_vals).sum())**2
    var_b = (marg_b * deg_vals**2).sum() - ((marg_b * deg_vals).sum())**2
    value_products = torch.outer(deg_vals, deg_vals)
    independent = torch.outer(marg_a, marg_b)
    coefficient = (value_products * (mixing - independent)).sum() / (var_a * var_b).sqrt()
    return coefficient.item()

In [6]:
#https://github.com/SitaoLuan/ACM-GNN/tree/main/synthetic-experiments
#https://github.com/siddhartha047/Geom_GCN_pytorch_implementation
from sklearn.model_selection import train_test_split

def func(feature):
    """Parse a comma-separated string of integers into a list of ints."""
    return [int(token) for token in feature.split(',')]

def get_heterophily(root, DATASET_NAME='texas', train=0.6, val=0.2, test=0.2):
    """Load a tab-separated heterophily dataset and attach split masks.

    Reads ``out1_graph_edges.txt`` and ``out1_node_feature_label.txt`` from
    ``root/DATASET_NAME`` and builds a ``Data`` object with ``x``, ``y``,
    ``node_id``, ``edge_index`` and boolean train/val/test masks.

    Args:
        root: Directory holding one sub-folder per dataset.
        DATASET_NAME: Name of the sub-folder to read.
        train: Not used by the body; the training share is whatever remains
            after ``val + test`` is removed.
        val: Fraction of nodes put in the validation mask.
        test: Fraction of nodes put in the test mask.

    Returns:
        The assembled ``Data`` object.
    """
    base = root+'/'+DATASET_NAME
    edge_table = pd.read_csv(base+'/out1_graph_edges.txt', sep='\t', header=0)
    node_table = pd.read_csv(base+'/out1_node_feature_label.txt', sep='\t', header=0)

    # One row per edge in the file -> transpose into a 2 x E index tensor.
    edge_index = torch.LongTensor(edge_table.values.tolist()).T
    node_id = torch.LongTensor(node_table['node_id'].values.tolist())
    y = torch.LongTensor(node_table['label'].values.tolist())
    # The 'feature' column holds comma-separated integers; func parses each cell.
    x = torch.Tensor(node_table['feature'].apply(func).values.tolist())

    N = len(node_id)

    # Fixed seed (random_state=1) keeps the split identical between runs.
    held_out = val+test
    train_index, rest_index = train_test_split(list(range(N)), test_size=held_out, random_state=1)
    val_index, test_index = train_test_split(rest_index, test_size=test/held_out, random_state=1)

    def as_mask(positions):
        mask = torch.zeros(N, dtype=bool)
        mask[positions] = True
        return mask

    return Data(edge_index=edge_index,
                x=x, node_id=node_id,
                y=y,
                train_mask=as_mask(train_index),
                val_mask=as_mask(val_index),
                test_mask=as_mask(test_index))

#get_heterophily('/scratch/gilbreth/das90/Dataset/heterophily/','squirrel')

In [7]:
def get_film(root, DATASET_NAME='film', train=0.6, val=0.2, test=0.2):
    """Load the 'film' dataset from disk and attach split masks.

    Reads ``class_map.json``, ``feats.npy`` and ``film_edges.csv`` from the
    folder ``root + DATASET_NAME + '/'`` (no separator is inserted between
    ``root`` and ``DATASET_NAME``, so ``root`` must end with one).

    Args:
        root: Directory prefix holding the dataset folder.
        DATASET_NAME: Name of the dataset folder.
        train: Not used by the body; the training share is whatever remains
            after ``val + test`` is removed.
        val: Fraction of nodes put in the validation mask.
        test: Fraction of nodes put in the test mask.

    Returns:
        A ``Data`` object with ``x``, ``y``, ``edge_index`` and boolean
        train/val/test masks.
    """
    file = root+DATASET_NAME+'/'

    # Context manager guarantees the handle is closed even if parsing fails.
    with open(file+'class_map.json') as f:
        class_map = json.load(f)
    class_map = {int(key):int(value) for key, value in class_map.items()}

    # Labels are taken in the JSON's key order.
    # NOTE(review): presumably that order matches the row order of feats.npy -- verify.
    y = list(class_map.values())
    x = np.load(file+'feats.npy')

    edges = pd.read_csv(file+'film_edges.csv', sep=',', header=0)
    u = edges['id1'].values.tolist()
    v = edges['id2'].values.tolist()

    x = torch.Tensor(x)
    y = torch.LongTensor(y)
    edge_index = torch.LongTensor([u, v])

    N = x.shape[0]
    indexs = list(range(N))
    # Fixed seed (random_state=1) keeps the split identical between runs.
    train_index, test_index = train_test_split(indexs, test_size=val+test, random_state=1)
    val_index, test_index = train_test_split(test_index, test_size=test/(val+test), random_state=1)

    train_mask = torch.zeros(N, dtype=bool)
    train_mask[train_index] = True
    val_mask = torch.zeros(N, dtype=bool)
    val_mask[val_index] = True
    test_mask = torch.zeros(N, dtype=bool)
    test_mask[test_index] = True

    data = Data(edge_index=edge_index,
                x=x,
                y=y, train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)

    return data

#get_film('/scratch/gilbreth/das90/Dataset/heterophily/','film')

In [8]:
class HeteroDataset(Dataset):
    """Single-graph dataset wrapper around the locally stored heterophily graphs.

    The graph is loaded eagerly in ``__init__`` via ``get_film`` (for
    ``dataset_name == 'film'``) or ``get_heterophily`` (everything else) and
    kept in ``self.data``; ``get`` always returns that one object.
    """

    def __init__(self, root, dataset_name, train=0.6, val=0.2, test=0.2,
                 transform=None, pre_transform=None, pre_filter=None):
        """Load the requested graph.

        Args:
            root: Directory passed through to the loader function.
            dataset_name: Dataset folder name; ``'film'`` selects ``get_film``.
            train, val, test: Split fractions forwarded to the loader.
            transform, pre_transform, pre_filter: Forwarded to ``Dataset``.
        """
        # The base class is initialised with root=None; root is set manually below.
        super().__init__(None, transform, pre_transform, pre_filter)

        self.root = root
        self.dataset_name = dataset_name
        # Stores whatever the module-level name `degree` refers to; kept so the
        # attribute stays available to existing callers.
        self.degree = degree
        self.train = train
        self.val = val
        self.test = test

        if dataset_name == 'film':
            self.data = get_film(root, dataset_name, train, val, test)
        else:
            self.data = get_heterophily(root, dataset_name, train, val, test)

    @property
    def processed_file_names(self):
        return self.dataset_name

    @property
    def processed_dir(self):
        return self.root

    @property
    def num_node_features(self):
        return self.data.x.shape[1]

    @property
    def num_classes(self):
        # Tensor.max() is vectorised; the builtin max() iterated element by element.
        return int(self.data.y.max()) + 1

    def len(self):
        return 1

    def get(self, idx):
        # Only one graph is stored, so idx is ignored.
        return self.data
    
# dataset = HeteroDataset('/scratch/gilbreth/das90/Dataset/heterophily/','texas')
# data = dataset[0]

In [9]:
def train_val_test_mask(data, train=0.6, val=0.2, test=0.2, random_state=False):
    """Attach freshly drawn boolean train/val/test masks to ``data``.

    Args:
        data: Object exposing ``x`` (one row per node); it is modified in place.
        train: Not used by the body; the training share is whatever remains
            after ``val + test`` is removed.
        val: Fraction of nodes put in the validation mask.
        test: Fraction of nodes put in the test mask.
        random_state: Acts as a flag. A truthy value draws an unseeded split;
            a falsy value (the default) uses the fixed seed 1.

    Returns:
        The same ``data`` object with ``train_mask``, ``val_mask`` and
        ``test_mask`` set.
    """
    N = data.x.shape[0]

    # Truthy flag -> leave the splitter unseeded; otherwise pin the seed to 1.
    seed_kwargs = {} if random_state else {'random_state': 1}

    held_out = val+test
    train_index, rest_index = train_test_split(list(range(N)), test_size=held_out, **seed_kwargs)
    val_index, test_index = train_test_split(rest_index, test_size=test/held_out, **seed_kwargs)

    for attr_name, positions in (('train_mask', train_index),
                                 ('val_mask', val_index),
                                 ('test_mask', test_index)):
        mask = torch.zeros(N, dtype=bool)
        mask[positions] = True
        setattr(data, attr_name, mask)

    return data

In [13]:
def get_data(DATASET_NAME='Cora', DIR=None, params=None, train=None, random_state=False, log=True, h_score=False, split_no=0):
    """Load a named graph dataset and return its first graph with split masks.

    Args:
        DATASET_NAME: Key selecting the dataset branch below.
        DIR: Dataset root directory. When None it is chosen from the host name
            (``os.uname()``), falling back to ``'./Dataset/'``.
        params: Not used by the body.
        train: When not None, masks are re-drawn with this training fraction
            and the remainder split evenly between validation and test.
        random_state: Forwarded to ``train_val_test_mask`` (truthy means an
            unseeded split there).
        log: Print directory information and dataset/graph statistics.
        h_score: Print node/edge/edge-insensitive homophily and assortativity.
        split_no: Column to select when the stored masks are 2-D.

    Returns:
        ``(data, dataset)``. The ``'OGB_MAG'`` and ``'PPI'`` branches return
        early with only ``dataset``.

    Raises:
        Exception: If ``DATASET_NAME`` matches no branch.
    """
    if DIR is not None:
        print('Looking at: ',DIR)
    elif os.uname()[1].find('gilbreth')==0: ##if not darwin(mac/locallaptop)
        DIR='/scratch/gilbreth/das90/Dataset/'
    elif os.uname()[1].find('unimodular')==0:
        DIR='/scratch2/das90/Dataset/'
    elif os.uname()[1].find('Siddharthas')==0:
        DIR='/Users/siddharthashankardas/Purdue/Dataset/'
    else:
        DIR='./Dataset/'

    Path(DIR).mkdir(parents=True, exist_ok=True)

    RESULTS_DIR=DIR+'RESULTS/'
    Path(RESULTS_DIR).mkdir(parents=True, exist_ok=True)

    if log:
        print("Data directory: ", DIR)
        print("Result directory:", RESULTS_DIR)

    # Local imports: KarateClub and CitationFull are only brought into scope here.
    from torch_geometric.datasets import Planetoid,  KarateClub, CitationFull
    from torch_geometric.transforms import NormalizeFeatures
    from torch_geometric.datasets import Reddit, Reddit2

    if DATASET_NAME in ["Cora", "CiteSeer", "PubMed"]:
        dataset = Planetoid(root=DIR+'Planetoid', name=DATASET_NAME, transform=NormalizeFeatures())

    elif DATASET_NAME in ['cora', 'cora_ml', 'citeseer', 'dblp', 'pubmed']:
        dataset = CitationFull(root=DIR+'Citation', name=DATASET_NAME, transform=NormalizeFeatures())

    elif DATASET_NAME == "Reddit2":
        dataset = Reddit2(root=DIR+'Reddit2')

    elif DATASET_NAME == "Reddit":
        dataset = Reddit(root=DIR+'Reddit')

    elif DATASET_NAME == "AmazonProducts":
        dataset = AmazonProducts(root=DIR+'AmazonProducts')

    elif DATASET_NAME in ['Computers', 'Photo']:
        dataset = Amazon(root=DIR+'Amazon/', name = DATASET_NAME)

    elif DATASET_NAME in ['CS', 'Physics']:
        dataset = Coauthor(root=DIR+'Coauthor/', name = DATASET_NAME)

    elif DATASET_NAME == "Moon":
        # NOTE(review): the MoonGraph import is commented out in the import
        # cell, so this branch raises NameError unless that import is restored.
        dataset = MoonGraph.MoonDataset(n_samples=100, degree=5, train=0.5)
        G, data =dataset[0]

    elif DATASET_NAME == "karate":
        dataset = KarateClub()
        data = dataset[0]
        # Every non-training node serves as both validation and test node.
        data.val_mask = ~data.train_mask
        data.test_mask = data.val_mask

    elif DATASET_NAME == "Fake":
        dataset = FakeDataset(num_graphs = 1,
                              avg_num_nodes = 2000,
                              avg_degree = 10,
                              num_channels = 64,
                              edge_dim = 0,
                              num_classes = 10,
                              task = 'auto',
                              is_undirected = True,
                              transform=NormalizeFeatures())

    elif DATASET_NAME == "OGB_MAG":
        dataset = OGB_MAG(root=DIR+'OGB_MAG2', preprocess='metapath2vec')
        data = dataset[0]
        print(dataset, data, data['paper'])

        # Early return: only the dataset, not a (data, dataset) pair.
        return dataset

    elif DATASET_NAME == "Flickr":
        dataset = Flickr(root=DIR+'Flickr')

    elif DATASET_NAME == "Yelp":
        dataset = Yelp(root=DIR+'Yelp')

    elif DATASET_NAME == "PPI":
        dataset = PPI(root=DIR+'PPI')

        # Early return: only the dataset, not a (data, dataset) pair.
        return dataset

    ###heterophilic dataset
    #https://github.com/pyg-team/pytorch_geometric/blob/master/examples/linkx.py
    elif DATASET_NAME in ["penn94", "reed98", "amherst41", "cornell5", "johnshopkins55", "genius"]:
        from ipynb.fs.full.HeterophilousDataset import LINKXDataset

        dataset = LINKXDataset(DIR+'/Heterophilous/', DATASET_NAME)

    elif DATASET_NAME in ["Roman-empire", "Amazon-ratings", "Minesweeper", "Tolokers", "Questions"]:
        from ipynb.fs.full.HeterophilousDataset import HeterophilousGraphDataset

        dataset = HeterophilousGraphDataset(DIR+'/Heterophilous/', DATASET_NAME)

    elif DATASET_NAME == 'Actor':
        from ipynb.fs.full.HeterophilousDataset import Actor

        dataset = Actor(root=DIR+'/Heterophilous/Actor')

    elif DATASET_NAME in ["Cornell", "Texas", "Wisconsin"]:
        from ipynb.fs.full.HeterophilousDataset import WebKB

        dataset = WebKB(DIR+'/Heterophilous/', DATASET_NAME)

    elif DATASET_NAME in ["Chameleon", "Crocodile", "Squirrel"]:
        from ipynb.fs.full.HeterophilousDataset import WikipediaNetwork

        if DATASET_NAME == 'Crocodile':
            dataset = WikipediaNetwork(DIR+'/Heterophilous/', DATASET_NAME.lower(), geom_gcn_preprocess= False)
        else:
            dataset = WikipediaNetwork(DIR+'/Heterophilous/', DATASET_NAME.lower())

    #implemented heterophily
    elif DATASET_NAME in ['chameleon','cornell','film', 'squirrel', 'texas','wisconsin']:
        dataset = HeteroDataset(DIR+'/heterophily/', DATASET_NAME)

    else:
        raise Exception('dataset not found')

    # 'Moon' and 'karate' already produced `data` in their branches.
    if DATASET_NAME not in ['Moon', 'karate']:
        data = dataset[0]  # Get the first graph object.
        if 'train_mask' not in data:
            data = train_val_test_mask(data, train=0.6, val=0.2, test=0.2, random_state=random_state)

        elif data.train_mask.dim()>1:
            # Several predefined splits are stored column-wise; keep one of them.
            data.train_mask = data.train_mask[:,split_no]
            data.val_mask = data.val_mask[:,split_no]
            data.test_mask = data.test_mask[:,split_no]

    if train is not None:
        val = (1-train)/2.0
        data = train_val_test_mask(data, train=train, val=val, test=1-(train+val), random_state=random_state)

    if log:
        print()
        print(f'Dataset: {dataset}:')
        print('======================')
        print(f'Number of graphs: {len(dataset)}')
        print(f'Number of features: {dataset.num_features}')
        print(f'Number of classes: {dataset.num_classes}')
        print()
        print(data)
        print('===========================================================================================================')

        # Gather some statistics about the graph.
        print(f'Number of nodes: {data.num_nodes}')
        print(f'Number of edges: {data.num_edges}')
        print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
        print(f'Number of training nodes: {data.train_mask.sum()}')
        print(f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}')
        print(f'Has isolated nodes: {data.has_isolated_nodes()}')
        print(f'Has self-loops: {data.has_self_loops()}')
        print(f'Is undirected: {data.is_undirected()}')

    if len(data.y.shape) == 1:
        labels = data.y
    else:
        if log: print("Testing homophily by converting multi-label to one-label")
        labels = data.y.argmax(dim=1)

    if h_score:
        print(homophily(data.edge_index, labels, method='node'),homophily(data.edge_index, labels, method='edge'), end=' ')

        try:
            esen = homophily(data.edge_index, labels, method='edge_insensitive')
        except Exception:
            # Best effort: report -1 when this variant cannot be computed, but
            # no longer swallow KeyboardInterrupt/SystemExit like a bare except.
            esen = -1
        print(esen, end=' ')
        print(assortativity(data.edge_index))

    return data, dataset

In [11]:
# Dataset keys accepted by get_data, grouped by where the data comes from.
datasets = [
    # small / synthetic graphs
    "karate", "Moon", "Fake",
    # Planetoid citation graphs
    "Cora", "CiteSeer", "PubMed",
    # CitationFull graphs
    "cora", "cora_ml", "citeseer", "dblp", "pubmed",
    # locally stored heterophily graphs (HeteroDataset)
    "chameleon", "cornell", "film", "squirrel", "texas", "wisconsin",
    # Amazon co-purchase graphs
    "Computers", "Photo",
    # Coauthor graphs
    "CS", "Physics",
    "Flickr", "Yelp",
    # LINKX graphs
    "penn94", "reed98", "amherst41", "cornell5", "johnshopkins55", "genius",
    # HeterophilousGraphDataset graphs
    "Roman-empire", "Amazon-ratings", "Minesweeper", "Tolokers", "Questions",
    "Actor",
    # WebKB graphs
    "Cornell", "Texas", "Wisconsin",
    # WikipediaNetwork graphs ("Crocodile" intentionally left out)
    "Chameleon", "Squirrel",
    # large graphs
    "Reddit",
    "Reddit2",
    "AmazonProducts",
]

In [15]:
if __name__ == '__main__':
    # Smoke test: load one locally stored heterophily graph and print its
    # statistics together with the homophily / assortativity scores.
    #
    # Other invocations kept for reference:
    #     data, dataset = get_data('karate', log=True, h_score = True)
    #     print(sum(torch.where(data.train_mask==True)))
    #     data, dataset = get_data('Cora', train=0.2, random_state=True)
    #     print(sum(torch.where(data.train_mask==True)))
    data, dataset = get_data('wisconsin', log=True, h_score=True)

Data directory:  /scratch/gilbreth/das90/Dataset/
Result directory: /scratch/gilbreth/das90/Dataset/RESULTS/

Dataset: HeteroDataset():
Number of graphs: 1
Number of features: 1703
Number of classes: 5

Data(x=[251, 1703], edge_index=[2, 515], y=[251], node_id=[251], train_mask=[251], val_mask=[251], test_mask=[251])
Number of nodes: 251
Number of edges: 515
Average node degree: 2.05
Number of training nodes: 150
Training node label rate: 0.60
Has isolated nodes: False
Has self-loops: True
Is undirected: False
0.17191579937934875 0.19611650705337524 0.0839187353849411 -0.27229174971580505


## Homophily plots

In [12]:
# dataset = MoonGraph.MoonDataset(n_samples=1000, degree=5, train=0.5)    
# G, data =dataset[0] 
# # MoonGraph.draw_blobs_data(G,data)

In [13]:
# DATASET_NAME = 'Cora'
# data, dataset = get_data(DATASET_NAME)
# # data.y = data.y.argmax(dim=1)

In [14]:
def hp_compute(data):
    """Compute, for every node, the share of its neighbours with the same label.

    Args:
        data: Graph object exposing ``num_nodes``, ``num_edges``,
            ``edge_index`` and ``y``.

    Returns:
        A NumPy array of length ``num_nodes``; nodes without any outgoing
        entry in the adjacency keep the value 0.
    """
    N = data.num_nodes
    E = data.num_edges

    adj = SparseTensor(
        row=data.edge_index[0], col=data.edge_index[1],
        value=torch.arange(E, device=data.edge_index.device),
        sparse_sizes=(N, N))

    hp_data = np.zeros(N)

    pbar = tqdm(total=N)
    pbar.set_description(f'Nodes')

    for node in range(N):
        _, neighbours, _ = adj[node,:].coo()

        if len(neighbours) != 0:
            same_label = (data.y[neighbours] == data.y[node]).type(torch.int).sum()
            hp_data[node] = same_label.item()/len(neighbours)

        # The bar advances once per node, with or without neighbours.
        pbar.update(1)

    pbar.close()

    return hp_data

#hp_data = hp_compute(data)

In [15]:
def pd_hist(data, DATASET_NAME=''):
    """Draw a 25-bin density histogram of ``data`` with a line over the bin heights.

    Args:
        data: Values to histogram.
        DATASET_NAME: Text appended to the plot title.
    """
    # Normalised histogram; `heights` has one entry per bin, `edges` one more.
    heights, edges, _ = plt.hist(data, density=True, bins=25)

    # The line is anchored at the left edge of every bin.
    plt.plot(edges[:-1], heights)

    plt.xlabel('Value')
    plt.ylabel('Probability Density')
    plt.title('Probability Density Function'+' '+DATASET_NAME)

    plt.show()
#hp_data = [0.01, 0.1,0.6]

In [16]:
# hp_data = hp_compute(data)
# pd_hist(hp_data, DATASET_NAME)

## Gephi Graph

In [17]:
import networkx as nx
from torch_geometric.utils import to_networkx
# from ipynb.fs.full.utils.GNNutils import save_gephi_graph

In [18]:
def create_gephi_graph(data, DATASET_NAME):
    """Build (or load from cache) the undirected networkx graph and export it.

    The graph is cached as a pickle file named after the dataset inside the
    GephiGraphs directory, then handed to ``save_gephi_graph`` together with
    the node labels.

    Args:
        data: Graph object accepted by ``to_networkx``; ``data.y`` is passed on
            as the label vector.
        DATASET_NAME: Used as the cache file name and export name prefix.
    """
    # Standard-library pickle replaces nx.write_gpickle / nx.read_gpickle,
    # which were removed in networkx 3.0. The on-disk format is a plain pickle.
    import pickle

    gephi_dir = '/scratch/gilbreth/das90/Dataset/GephiGraphs/'
    G_filename = gephi_dir+DATASET_NAME

    if not os.path.exists(G_filename):
        print("Graph is not found, creating it....")
        G = to_networkx(data, to_undirected=True)
        # Make sure the cache directory exists before writing into it.
        os.makedirs(gephi_dir, exist_ok=True)
        with open(G_filename, 'wb') as handle:
            pickle.dump(G, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print("Done")
    else:
        print("Loading Saved graph...")
        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that this function wrote itself.
        with open(G_filename, 'rb') as handle:
            G = pickle.load(handle)
        print("Done")

    graph_name = gephi_dir+DATASET_NAME+'_original'
    # NOTE(review): the import of save_gephi_graph is commented out in the
    # preceding cell; restore it before calling this function.
    save_gephi_graph(G, data.y, graph_name)

    return

In [19]:
# datasets = [
#     'karate','Moon','Fake',
#     "Cora", "CiteSeer", "PubMed",'cora', 'cora_ml', 'citeseer', 'dblp', 'pubmed',
#     'chameleon','cornell','film', 'squirrel', 'texas','wisconsin',
#     'Computers', 'Photo',
#     'CS', 'Physics',
#     'Flickr','Yelp',
#     "penn94", "reed98", "amherst41", "cornell5", "johnshopkins55", "genius",
#     "Roman-empire", "Amazon-ratings", "Minesweeper", "Tolokers", "Questions",
#     "Actor", 
#     "Cornell", "Texas", "Wisconsin", 
#     "Chameleon", "Squirrel", #"Crocodile",
#     'Reddit',
#     'Reddit2',
#     'AmazonProducts' 
# ]

In [20]:
# for DATASET_NAME in datasets:
#     #DATASET_NAME = 'karate'
#     print('-'*20,DATASET_NAME,'-'*20)
#     data, dataset = get_data(DATASET_NAME)
#     if data.y.ndim >1:
#         data.y = data.y.argmax(dim=1)
#     create_gephi_graph(data,DATASET_NAME)

In [21]:
#create_gephi_graph(DATASET_NAME)

In [22]:
# x = torch.Tensor([[1,0],[1,0],[1,0],[0,1],[0,1],[0,1],[0,1]])
# y = torch.LongTensor([0,0,0, 1, 1, 1, 1])
# edge_index = torch.LongTensor([[1,2],[1,4],[1,5],[2,1],[3,6],[3,7],[4,5],[4,1],[4,6],[4,7],[5,1],[5,4],[5,6],[6,3],[6,4],[6,5],[6,7],[7,3],[7,4],[7,6]]).T
# edge_index = edge_index-1
# data = Data(x=x, y=y, edge_index = edge_index)
# draw_graph(edge_index, y, 7)

# Create synthetic graphs with varying homophily scores and node degrees from different datasets

In [23]:
# data, dataset = get_data('karate', log=True, h_score=True)

In [24]:
def balance_class(data):
    """Subsample ``data`` in place so that every class keeps the same node count.

    For each label, as many nodes as the rarest class has are drawn without
    replacement via ``np.random.choice``. Edges with both endpoints kept are
    retained and re-indexed to the new node numbering; tensors whose first
    dimension equals the old node count are re-indexed per node, and those
    whose first dimension equals the old edge count are filtered per edge.

    Args:
        data: Graph object exposing ``y``, ``edge_index``, ``num_nodes``,
            ``num_edges``, iteration over ``(key, value)`` pairs and item
            assignment.

    Returns:
        The same ``data`` object, modified.
    """
    classes, counts = torch.unique(data.y, return_counts=True)
    per_class = int(counts.min())

    picked = []
    for label in classes:
        members = (data.y == label).nonzero().view(-1).cpu().numpy()
        picked.append(np.random.choice(members, per_class, replace=False))

    device = data.edge_index.device
    node_idx = torch.from_numpy(np.concatenate(picked)).long().to(device)

    N = data.num_nodes
    E = data.num_edges

    # Keep only edges whose two endpoints both survived the subsampling.
    keep_node = torch.zeros(N, dtype=torch.bool, device=device)
    keep_node[node_idx] = True
    src, dst = data.edge_index
    edge_mask = keep_node[src] & keep_node[dst]

    # Old node id -> position in node_idx, so that edge_index agrees with the
    # re-indexed node tensors (the original code left the old ids in place).
    new_id = torch.full((N,), -1, dtype=torch.long, device=device)
    new_id[node_idx] = torch.arange(node_idx.size(0), device=device)
    edge_index = new_id[data.edge_index[:, edge_mask]]

    data.num_nodes = node_idx.size(0)
    data.edge_index = edge_index

    # Snapshot the items first because values are reassigned inside the loop.
    for key, item in list(data):
        if key in ['edge_index', 'num_nodes']:
            continue
        if not isinstance(item, torch.Tensor) or item.dim() == 0:
            continue
        if item.size(0) == N:
            data[key] = item[node_idx]
        elif item.size(0) == E:
            # The original referenced an undefined `edge_idx` here.
            data[key] = item[edge_mask]

    return data

# print(data.edge_index)
# data = balance_class(data)
# print(data.edge_index)

In [25]:
def generate_synthetic(data, d=5, h=0.8, train=0.6, random_state=None, log=True, balance = False):
    """Replace the edges of ``data`` with a synthetic set of target homophily.

    Every node ``u`` receives ``round(d*h)`` outgoing edges to nodes of its own
    class and ``round(d*(1-h))`` outgoing edges to nodes of other classes, both
    drawn without replacement via ``np.random.choice`` (capped by the number of
    available candidates). Only the direction ``u -> v`` is stored.

    Args:
        data: Graph object exposing ``y`` and ``num_nodes``; it is modified in
            place (``edge_index`` and, optionally, the split masks).
        d: Target out-degree per node.
        h: Target fraction of same-class edges.
        train: When not None, masks are re-drawn with this training fraction
            and the remainder split evenly between validation and test.
        random_state: Forwarded to ``train_val_test_mask``, where a truthy
            value means an unseeded split.
        log: Print homophily scores, assortativity, class counts and the
            training label rate on one line.
        balance: Run ``balance_class`` on ``data`` first.

    Returns:
        The modified ``data`` object.
    """
    if balance:
        data = balance_class(data)

    # Tensor.max() is vectorised; the builtin max() iterated element by element.
    num_class = int(data.y.max()) + 1
    cluster_vertices = {c: torch.where(data.y == c)[0] for c in range(num_class)}

    n = data.num_nodes

    # Every node gets the same intra-/inter-class budget.
    intra_d = np.round(np.ones(n)*(d*h)).astype(int)
    inter_d = np.round(np.ones(n)*(d*(1-h))).astype(int)

    edge_index = [[],[]]

    for c in range(num_class):
        intra_vertices = cluster_vertices[c].numpy()
        inter_vertices = torch.cat(
            [value for key, value in cluster_vertices.items() if key!=c]).numpy()

        for u in intra_vertices:
            # NOTE(review): u itself is among the intra-class candidates, so
            # self-loops can be drawn; the original carried an unfinished
            # "remove self-loop" note here -- confirm the intended behaviour.
            intra_v = np.random.choice(intra_vertices, min(len(intra_vertices),intra_d[u]), replace=False)
            inter_v = np.random.choice(inter_vertices, min(len(inter_vertices),inter_d[u]), replace=False)

            Vs = np.append(intra_v,inter_v)

            # The per-node np.unique call of the original was never used and
            # has been removed from this hot loop.
            edge_index[0].extend([int(u)] * len(Vs))
            edge_index[1].extend(Vs.tolist())

    data.edge_index = torch.LongTensor(edge_index)

    if train is not None:
        val = (1-train)/2.0
        data = train_val_test_mask(data, train=train, val=val, test=1-(train+val), random_state=random_state)

    if log:
        print(f"{homophily(data.edge_index, data.y, method='node'):0.4f}",end=' ')
        print(f"{homophily(data.edge_index, data.y, method='edge'):0.4f}",end=' ')
        print(f"{homophily(data.edge_index, data.y, method='edge_insensitive'):0.4f}",end=' ')
        print(f"{assortativity(data.edge_index):0.4f}", end=' ')

        unique_elements, counts = torch.unique(data.y, return_counts=True)
        print(unique_elements.tolist(), counts.tolist(), end=' ')

        print(f'{int(data.train_mask.sum()) / data.num_nodes:.4f}', end=' ')

    return data

# data = generate_synthetic(data, d=10, h=0.0, train=0.6, random_state=None, log=True, balance = True)
# data

# Diversity heterophily definition

In [26]:
import numpy as np
from scipy.stats import chisquare

In [27]:
# DATASET_NAME = 'chameleon'
# data, dataset = get_data(DATASET_NAME, log = False)
# data = generate_synthetic(data, d=10, h=0.0, train=0.6, random_state=1, log=True, balance = False)
# num_classes = dataset.num_classes
# print(num_classes)
# N = data.num_nodes
# E = data.num_edges

# adj = SparseTensor(row=data.edge_index[0], col=data.edge_index[1],
#     value=torch.arange(E, device=data.edge_index.device),
#     sparse_sizes=(N, N))

# # if len(col)==0:
# #     return 0

In [28]:
def given_uniformity(col_labels, num_classes, cur_y, log=True, ):
    """Chi-square test of whether a node's other-class neighbours are uniform.

    Args:
        col_labels: Labels of the node's neighbours.
        num_classes: Total number of classes.
        cur_y: The node's own label; neighbours with this label are ignored.
        log: Print intermediate values.

    Returns:
        -1 if fewer than ``num_classes - 1`` other classes occur among the
        neighbours, 0 if the counts deviate significantly (p < 0.05) from a
        uniform distribution, 1 otherwise.
    """
    # One switch instead of repeating `if log:` in front of every print.
    say = print if log else (lambda *args, **kwargs: None)

    say(col_labels)
    say(cur_y)

    unique_elements, counts = np.unique(col_labels, return_counts=True)
    say(unique_elements)
    say(counts)

    # Drop the node's own class so only the other classes remain.
    is_other = unique_elements != cur_y
    unique_elements = unique_elements[is_other]
    counts = counts[is_other]
    say(unique_elements)
    say(counts)

    if len(unique_elements)<(num_classes-1):
        say('unexpted: ')
        say(unique_elements)
        say(counts)
        return -1

    expected_frequency = sum(counts) / (num_classes-1)
    observed_frequency = np.array(counts)
    say(expected_frequency)
    say(observed_frequency)

    chi2, p_value = chisquare(observed_frequency, f_exp=expected_frequency)
    say("Chi-squared statistic:", chi2)
    say("p-value:", p_value)

    significance_level = 0.05
    if p_value < significance_level:
        say("The distribution significantly deviates from the uniform distribution.")
        return 0

    say("The distribution is similar to the uniform distribution.")
    return 1

    
# u = 0
# row, col, ed = adj[u,:].coo()   
# col_labels = data.y[col]

# print(row)
# print(col)
# print(col_labels)
# print(len(col_labels))

# cur_y = data.y[u].item()
# print(cur_y)
# given_uniformity(col_labels, num_classes, cur_y)

In [29]:
def test_uniformity(data, num_classes, log=True):
    """Count the nodes whose other-class neighbours pass ``given_uniformity``.

    Args:
        data: Graph object exposing ``num_nodes``, ``num_edges``,
            ``edge_index`` and ``y``.
        num_classes: Total number of classes.
        log: Show a progress bar.

    Returns:
        ``(count, count / num_nodes)`` where ``count`` is the number of nodes
        for which ``given_uniformity`` returned 1.
    """
    N = data.num_nodes
    E = data.num_edges

    adj = SparseTensor(row=data.edge_index[0], col=data.edge_index[1],
        value=torch.arange(E, device=data.edge_index.device),
        sparse_sizes=(N, N))

    pbar = None
    if log:
        pbar = tqdm(total=N)
        pbar.set_description(f'Nodes')

    count = 0

    for node in range(N):
        _, neighbours, _ = adj[node,:].coo()
        verdict = given_uniformity(
            data.y[neighbours], num_classes, data.y[node].item(), log = False)

        # A verdict of -1 (too few classes) contributes nothing.
        count += max(0, verdict)

        if pbar is not None:
            pbar.update(1)

    if pbar is not None:
        pbar.close()

    return count, count/N
    
# test_uniformity(data, num_classes)

In [30]:
def agg_homophily(data, matrix_tu = 'affinity'):
    """Fraction of nodes that look more like same-class than other-class neighbours.

    Features are aggregated with the row-normalised adjacency with self-loops
    (``'affinity'``) or with the identity minus that matrix (``'laplacian'``).
    The similarity matrix is the Gram matrix of the aggregated features. A node
    counts when its mean similarity to same-label neighbours is at least its
    mean similarity to different-label neighbours, or when it has no
    different-label neighbours.

    Args:
        data: Graph object exposing ``num_nodes``, ``num_edges``,
            ``edge_index``, ``x`` and ``y``.
        matrix_tu: ``'affinity'`` or ``'laplacian'``; any other value falls
            back to the affinity matrix.

    Returns:
        The number of counted nodes divided by ``num_nodes``.
    """
    N = data.num_nodes
    E = data.num_edges

    adj = SparseTensor(row=data.edge_index[0], col=data.edge_index[1],
        value=torch.arange(E, device=data.edge_index.device),
        sparse_sizes=(N, N))

    # Dense symmetric 0/1 adjacency.
    src, dst = data.edge_index
    A = torch.zeros((N,N))
    A[src, dst] = 1
    A[dst, src] = 1

    I = torch.eye(N)
    Ai = A + I
    # Row normalisation by broadcasting. The original built a dense diagonal
    # matrix and multiplied by it, and also an unused degree matrix D.
    inv_row_sum = 1/torch.sum(Ai,dim=1)
    Arw = inv_row_sum.unsqueeze(1) * Ai

    if matrix_tu == 'laplacian':
        M = I - Arw
    else:
        M = Arw

    AX = torch.mm(M,data.x)
    SAX = torch.mm(AX,AX.T)

    count = 0

    for u in range(N):
        _, col, _ = adj[u,:].coo()

        same_label = data.y[col] == data.y[u]

        # Mean of an empty selection is NaN; that case is handled below.
        left_u = torch.mean(SAX[u, col[same_label]])
        right_u = torch.mean(SAX[u, col[~same_label]])

        if torch.isnan(right_u) or left_u >= right_u:
            count += 1

    return count/N
    
    
# DATASET_NAME = 'karate'
# data, dataset = get_data(DATASET_NAME, log=False)
# data = generate_synthetic(data, d=10, h = 0.2, train=0.6, random_state=1, log=False, balance = True)
# data.x = F.one_hot(data.y).float()

# print(agg_homophily(data, 'affinity'))
# print(agg_homophily(data, 'laplacian'))

In [31]:
# DATASET_NAME = 'karate'
# data, dataset = get_data(DATASET_NAME, log=False)
# data = generate_synthetic(data, d=10, h = 0.2, train=0.6, random_state=1, log=False, balance = True)

# num_classes = dataset.num_classes
# print(num_classes)
# N = data.num_nodes
# E = data.num_edges

# adj = SparseTensor(row=data.edge_index[0], col=data.edge_index[1],
#     value=torch.arange(E, device=data.edge_index.device),
#     sparse_sizes=(N, N))

In [32]:
from scipy.stats import entropy

def scipyentropy(labels, num_class = 10):
    """Print and return the base-2 entropy of the label frequencies.

    Args:
        labels: Sequence of non-negative integer labels.
        num_class: Not used by the body.

    Returns:
        The entropy value computed by ``scipy.stats.entropy`` with ``base=2``.
    """
    # Relative frequency of every label value from 0 up to the largest one seen.
    class_probs = np.bincount(labels) / len(labels)
    print(class_probs)

    entropy_value = entropy(class_probs, base=2)
    print("Entropy:", entropy_value)

    return entropy_value

# labels = [0, 1, 2, 0, 1, 1, 3, 2, 2, 4, 4, 5, 5, 6, 6, 6, 6]
# scipyentropy(labels)

In [33]:
import math

def compute_entropy(class_probabilities, num_classes):
    """Return the base-2 entropy normalised by the maximum for ``num_classes``.

    Args:
        class_probabilities: Iterable of probabilities; zero entries are skipped.
        num_classes: Number of possible outcomes used for normalisation.

    Returns:
        ``entropy / log2(num_classes)``. When ``num_classes <= 1`` the maximum
        entropy is zero (or undefined), so 0.0 is returned instead of raising
        ZeroDivisionError.
    """
    # A distribution over at most one outcome carries no uncertainty; this also
    # covers the call from node_entropy on binary-label graphs, which passes 1.
    if num_classes <= 1:
        return 0.0

    total = 0.0
    for probability in class_probabilities:
        if probability != 0.0:
            total += -probability * math.log2(probability)

    max_entropy = -math.log2(1/num_classes)

    return total / max_entropy

# Example usage
# class_probabilities = [0.9, 0.0, 0.0, 0.0, 0.1]
# entropy_value = compute_entropy(class_probabilities, len(class_probabilities))
# print("Normalized Entropy:", entropy_value)

In [34]:
def node_entropy(col_labels, num_classes, cur_y, log=True):
    """Normalised entropy of a node's neighbour labels, own class excluded.

    Args:
        col_labels: Labels of the node's neighbours.
        num_classes: Total number of classes.
        cur_y: The node's own label; neighbours with this label are ignored.
        log: Print intermediate values.

    Returns:
        ``compute_entropy`` of the other-class label distribution, normalised
        over ``num_classes - 1`` outcomes.
    """
    # One switch instead of repeating `if log:` in front of every print.
    say = print if log else (lambda *args, **kwargs: None)

    say(col_labels)
    say(cur_y)

    unique_elements, counts = np.unique(col_labels, return_counts=True)
    say(unique_elements)
    say(counts)

    # Drop the node's own class so only the other classes remain.
    is_other = unique_elements != cur_y
    unique_elements = unique_elements[is_other]
    counts = counts[is_other]
    say(unique_elements)
    say(counts)

    prob = counts/sum(counts)
    say(prob)

    return compute_entropy(prob, num_classes-1)


# u = 0
# row, col, ed = adj[u,:].coo()   
# col_labels = data.y[col]

# print(row)
# print(col)
# print(col_labels)
# print(len(col_labels))

# cur_y = data.y[u].item()
# print(cur_y)
# node_entropy(col_labels, num_classes, cur_y, log = False)

In [35]:
def total_entropy(data, num_classes, log=True):
    """Sum ``node_entropy`` over all nodes of the graph.

    Args:
        data: Graph object exposing ``num_nodes``, ``num_edges``,
            ``edge_index`` and ``y``.
        num_classes: Total number of classes.
        log: Show a progress bar.

    Returns:
        ``(total, total / num_nodes)``.
    """
    N = data.num_nodes
    E = data.num_edges

    adj = SparseTensor(row=data.edge_index[0], col=data.edge_index[1],
        value=torch.arange(E, device=data.edge_index.device),
        sparse_sizes=(N, N))

    pbar = None
    if log:
        pbar = tqdm(total=N)
        pbar.set_description(f'Nodes')

    count = 0

    for node in range(N):
        _, neighbours, _ = adj[node,:].coo()
        count += node_entropy(
            data.y[neighbours], num_classes, data.y[node].item(), log = False)

        if pbar is not None:
            pbar.update(1)

    if pbar is not None:
        pbar.close()

    return count, count/N

# total_entropy(data, num_classes)

In [36]:
def test_hetero():
    """Sweep the homophily level on synthetic Cora graphs and print the scores.

    For h = 0, 0.05, ..., 1 a balanced synthetic graph with target degree 20 is
    generated, and the uniformity count/score and the total/mean neighbour
    entropy are printed on one line per h.
    """
    d = 20
    DATASET_NAME = 'Cora'
    homophily_levels = np.array(range(0,21))/20

    for h in homophily_levels:
        # Reload for every level because the generator rewrites the graph in place.
        data, dataset = get_data(DATASET_NAME, log=False)
        data = generate_synthetic(data, d=d, h = h, train=0.6, random_state=1, log=False, balance = True)
        num_classes = dataset.num_classes

        print('d ', d, ' h', h, end=' ')

        uniform_count, uniform_score = test_uniformity(data, num_classes, log=False)
        print(uniform_count, uniform_score, end = ' ')

        # Aggregation homophily is disabled here:
        #     Hagg_aff = agg_homophily(data, matrix_tu = 'affinity')
        #     Hagg_lap = agg_homophily(data, matrix_tu = 'laplacian')
        #     print(Hagg_aff, Hagg_lap)

        entropy_sum, entropy_mean = total_entropy(data, num_classes, log=False)
        print(entropy_sum, entropy_mean)
    
# test_hetero()