In [1]:
import os
import sys
# Derive the environment/kernel name from the interpreter path: drop the
# "/bin/python" part and keep the last remaining path component.
kernel_name = os.path.basename(sys.executable.replace("/bin/python",""))

# For the 'py38cu11' environment only, explicitly load the CUDA 11.2 cuSPARSE
# and cuBLAS shared libraries from the cluster's toolkit directory.
# NOTE(review): presumably done so that later imports resolve these libraries;
# the paths are cluster-specific absolute paths — confirm before reuse elsewhere.
if kernel_name == 'py38cu11':
    import ctypes
    ctypes.cdll.LoadLibrary("/apps/gilbreth/cuda-toolkit/cuda-11.2.0/lib64/libcusparse.so.11");
    ctypes.cdll.LoadLibrary("/apps/gilbreth/cuda-toolkit/cuda-11.2.0/lib64/libcublas.so.11");

In [2]:
from pathlib import Path
import pandas as pd
import os
import json
import numpy as np
from tqdm import tqdm
import random
import torch
import torch
import torch.nn.functional as F
from torch import Tensor
from torch_geometric.typing import Adj, SparseTensor
from torch_geometric.utils import coalesce, degree
# from torch_geometric.utils.to_dense_adj import to_dense_adj

from torch_geometric.data import Data, Dataset
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures
from torch_geometric.datasets import Reddit, Flickr, Yelp, AmazonProducts, PPI,  OGB_MAG,  FakeDataset, Amazon, Coauthor
# import torch_geometric.utils as homophily
# import torch_geometric.utils as subgraph
from torch_geometric.utils import assortativity, subgraph, homophily, to_dense_adj

In [3]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from scipy.stats import norm, gamma, uniform, expon

In [4]:
#https://github.com/SitaoLuan/ACM-GNN/tree/main/synthetic-experiments
#https://github.com/KAIDI3270/Geom_GCN_pytorch_implementation

from sklearn.model_selection import train_test_split

def func(feature):
    """Parse a comma-separated string of integers into a list of ints.

    Args:
        feature: String such as ``"0,1,0,1"``.

    Returns:
        List of ints, one per comma-separated token.

    Raises:
        ValueError: If any token is not a valid integer literal.
    """
    return [int(token) for token in feature.split(',')]

def get_heterophily(root, DATASET_NAME='texas', train=0.6, val=0.2, test=0.2):
    """Load a graph from tab-separated edge / node files and attach split masks.

    Reads ``out1_graph_edges.txt`` and ``out1_node_feature_label.txt`` from
    ``root + '/' + DATASET_NAME`` and builds a ``Data`` object with node
    features, labels, node ids and boolean train/val/test masks.

    Args:
        root: Directory containing one sub-folder per dataset.
        DATASET_NAME: Name of the sub-folder to read.
        train: Accepted for signature symmetry; not used by the split itself
            (the training set is whatever remains after ``val + test``).
        val: Fraction of nodes used for validation.
        test: Fraction of nodes used for testing.

    Returns:
        ``Data`` with ``edge_index``, ``x``, ``node_id``, ``y`` and the three masks.
    """
    dataset_dir = root + '/' + DATASET_NAME

    edge_table = pd.read_csv(dataset_dir + '/out1_graph_edges.txt', sep='\t', header=0)
    node_table = pd.read_csv(dataset_dir + '/out1_node_feature_label.txt', sep='\t', header=0)

    # One edge per row in the file -> transpose to a 2 x E tensor.
    edge_index = torch.LongTensor(edge_table.values.tolist()).T
    node_id = torch.LongTensor(node_table['node_id'].values.tolist())
    y = torch.LongTensor(node_table['label'].values.tolist())
    # The 'feature' column holds comma-separated integers; func parses each row.
    x = torch.Tensor(node_table['feature'].apply(func).values.tolist())

    N = len(node_id)
    holdout = val + test

    # Fixed seed (random_state=1) keeps the split reproducible across runs.
    train_index, rest_index = train_test_split(list(range(N)), test_size=holdout, random_state=1)
    val_index, test_index = train_test_split(rest_index, test_size=test/holdout, random_state=1)

    masks = {}
    for mask_name, selected in (('train_mask', train_index),
                                ('val_mask', val_index),
                                ('test_mask', test_index)):
        mask = torch.zeros(N, dtype=bool)
        mask[selected] = True
        masks[mask_name] = mask

    return Data(edge_index=edge_index, x=x, node_id=node_id, y=y, **masks)

#get_heterophily('/scratch/gilbreth/das90/Dataset/heterophily/','squirrel')

In [5]:
def get_film(root, DATASET_NAME='film', train=0.6, val=0.2, test=0.2):
    """Load the 'film' graph from disk and attach train/val/test masks.

    Reads ``class_map.json``, ``feats.npy`` and ``film_edges.csv`` from
    ``root + DATASET_NAME + '/'`` (note: ``root`` is concatenated directly, so
    it must already end with a path separator).

    Args:
        root: Directory prefix containing the dataset folder.
        DATASET_NAME: Name of the dataset folder.
        train: Accepted for signature symmetry; not used by the split itself
            (the training set is whatever remains after ``val + test``).
        val: Fraction of nodes used for validation.
        test: Fraction of nodes used for testing.

    Returns:
        ``Data`` with ``edge_index``, ``x``, ``y`` and boolean split masks.
    """
    file = root+DATASET_NAME+'/'

    # Context manager guarantees the handle is closed even if parsing fails.
    with open(file+'class_map.json') as f:
        class_map = json.load(f)
    class_map = {int(key):int(value) for key, value in class_map.items()}

    # NOTE(review): labels are taken in the JSON's key order; this assumes the
    # file lists node ids in the same order as the rows of feats.npy — confirm.
    y = list(class_map.values())
    x = np.load(file+'feats.npy')

    edges = pd.read_csv(file+'film_edges.csv', sep=',', header=0)
    edge_index = [edges['id1'].values.tolist(), edges['id2'].values.tolist()]

    x = torch.Tensor(x)
    y = torch.LongTensor(y)
    edge_index = torch.LongTensor(edge_index)

    N = x.shape[0]
    indexs = list(range(N))
    # Fixed seed (random_state=1) keeps the split reproducible across runs.
    train_index, test_index = train_test_split(indexs, test_size=val+test, random_state=1)
    val_index, test_index = train_test_split(test_index, test_size=test/(val+test), random_state=1)

    train_mask = torch.zeros(N, dtype=bool)
    train_mask[train_index] = True
    val_mask = torch.zeros(N, dtype=bool)
    val_mask[val_index] = True
    test_mask = torch.zeros(N, dtype=bool)
    test_mask[test_index] = True

    data = Data(edge_index=edge_index,
                x=x,
                y=y, train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)

    return data

#get_film('/scratch/gilbreth/das90/Dataset/heterophily/','film')

In [6]:
def train_val_test_mask(data, train=0.6, val=0.2, test=0.2, random_state=False):
    """Attach boolean train/val/test node masks to ``data`` and return it.

    Args:
        data: Graph object with an ``x`` attribute (dense tensor or SparseTensor).
        train: Accepted for signature symmetry; not used by the split itself
            (the training set is whatever remains after ``val + test``).
        val: Fraction of nodes used for validation.
        test: Fraction of nodes used for testing.
        random_state: Flag, not a seed. Falsy -> reproducible split with a
            fixed seed of 1; truthy -> unseeded split.

    Returns:
        The same ``data`` object, mutated in place with ``train_mask``,
        ``val_mask`` and ``test_mask``.
    """
    if isinstance(data.x, SparseTensor):
        N = data.x.size(0)
        # Sparse features: record the node count on the data object explicitly.
        data.num_nodes = N
    else:
        N = data.x.shape[0]

    # Unseeded when random_state is truthy, otherwise pinned to seed 1.
    split_kwargs = {} if random_state else {'random_state': 1}
    holdout = val + test

    train_index, rest_index = train_test_split(list(range(N)), test_size=holdout, **split_kwargs)
    val_index, test_index = train_test_split(rest_index, test_size=test/holdout, **split_kwargs)

    def _as_mask(selected):
        mask = torch.zeros(N, dtype=bool)
        mask[selected] = True
        return mask

    data.train_mask = _as_mask(train_index)
    data.val_mask = _as_mask(val_index)
    data.test_mask = _as_mask(test_index)

    return data

In [7]:
class HeteroDataset(Dataset):
    """Single-graph dataset backed by ``get_film`` / ``get_heterophily``.

    The graph is loaded eagerly in ``__init__`` and the same object is
    returned for every index.

    Args:
        root: Directory prefix passed on to the loader.
        dataset_name: ``'film'`` selects ``get_film``; any other name is
            passed to ``get_heterophily``.
        train, val, test: Split fractions forwarded to the loader.
        transform, pre_transform, pre_filter: Forwarded to the base ``Dataset``.
    """
    def __init__(self, root, dataset_name, train=0.6, val=0.2, test=0.2,
                 transform=None, pre_transform=None, pre_filter=None):
        # The base class is initialised with root=None; root is stored manually below.
        super().__init__(None, transform, pre_transform, pre_filter)

        self.root = root
        self.dataset_name=dataset_name
        # NOTE(review): this stores the module-level name `degree` as an
        # attribute; nothing in this class reads it — confirm whether it is needed.
        self.degree=degree
        self.train=train
        self.val=val
        self.test=test

        if dataset_name == 'film':
            self.data = get_film(root,dataset_name, train, val, test)
        else:
            self.data = get_heterophily(root,dataset_name, train, val, test)

    @property
    def processed_file_names(self):
        """Name reported for the processed file: the dataset name."""
        return self.dataset_name

    @property
    def processed_dir(self):
        """Directory reported for processed files: the root given at construction."""
        return self.root

    @property
    def num_node_features(self):
        """Width of the node feature matrix."""
        return self.data.x.shape[1]

    @property
    def num_classes(self):
        """Largest label value plus one."""
        # Tensor reduction instead of builtin max(), which iterates the tensor
        # element by element in Python.
        return self.data.y.max().item()+1

    def len(self):
        """The dataset always holds exactly one graph."""
        return 1

    def get(self, idx):
        """Return the single stored graph regardless of ``idx``."""
        return self.data
    
# dataset = HeteroDataset('/scratch/gilbreth/das90/Dataset/heterophily/','texas')
# data = dataset[0]

In [8]:
class LINKXpyg2(Dataset):
    """Single-graph dataset loaded from pre-saved tensors on disk.

    Loads ``x.pt``, ``edge_index.pt`` and ``y.pt`` from
    ``root + '/LINKXdataset/' + dataset_name + '/'`` and attaches split masks
    via ``train_val_test_mask``.

    Args:
        root: Directory containing the ``LINKXdataset`` folder.
        dataset_name: Sub-folder holding the three ``.pt`` files.
        train, val, test: Split fractions forwarded to ``train_val_test_mask``.
        transform, pre_transform, pre_filter: Forwarded to the base ``Dataset``.
        random_state: Forwarded to ``train_val_test_mask`` (a flag there:
            falsy selects the fixed-seed split).
    """
    def __init__(self, root, dataset_name, train=0.6, val=0.2, test=0.2,
                 transform=None, pre_transform=None, pre_filter=None, random_state=False):
        # The base class is initialised with root=None; root is stored manually below.
        super().__init__(None, transform, pre_transform, pre_filter)

        self.root = root
        self.dataset_name=dataset_name
        # NOTE(review): this stores the module-level name `degree` as an
        # attribute; nothing in this class reads it — confirm whether it is needed.
        self.degree=degree
        self.train=train
        self.val=val
        self.test=test

        FolderName = root+'/LINKXdataset/'+dataset_name+'/'

        data = Data()

        data.x = torch.load(FolderName+'x.pt')
        data.edge_index =torch.load(FolderName+'edge_index.pt')
        data.y = torch.load(FolderName+'y.pt')

        self.data = train_val_test_mask(data, train=train, val=val, test=test, random_state=random_state)

    @property
    def processed_file_names(self):
        """Name reported for the processed file: the dataset name."""
        return self.dataset_name

    @property
    def processed_dir(self):
        """Folder the tensors were loaded from (without trailing slash)."""
        return self.root+'/LINKXdataset/'+self.dataset_name

    @property
    def num_node_features(self):
        """Width of the node feature matrix."""
        return self.data.x.shape[1]

    @property
    def num_classes(self):
        """Largest label value plus one."""
        # Tensor reduction instead of builtin max(), which iterates the tensor
        # element by element in Python and is very slow on large graphs.
        return self.data.y.max().item()+1

    def len(self):
        """The dataset always holds exactly one graph."""
        return 1

    def get(self, idx):
        """Return the single stored graph regardless of ``idx``."""
        return self.data
    
# #'pokec', 'arxiv-year', 'snap-patents', 'twitch-gamer'
# dataset = LINKXpyg2('/scratch/gilbreth/das90/Dataset/','pokec', random_state=0)
# data = dataset[0]
# data

In [9]:
class OGB_MAGcustom(Dataset):
    """Thin single-graph dataset wrapper around an already-built ``data`` object.

    Args:
        root: Folder associated with the dataset; stored as ``FolderName``.
        dataset_name: Name reported by ``processed_file_names``.
        data: Graph object returned for every index.
        transform, pre_transform, pre_filter: Forwarded to the base ``Dataset``.
    """
    def __init__(self, root, dataset_name, data, transform=None, pre_transform=None, pre_filter=None):
        # The base class is initialised with root=None; the folder is kept in
        # self.FolderName instead.
        super().__init__(None, transform, pre_transform, pre_filter)

        self.dataset_name=dataset_name

        self.FolderName = root
        self.data = data

    @property
    def processed_file_names(self):
        """Name reported for the processed file: the dataset name."""
        return self.dataset_name

    @property
    def processed_dir(self):
        """Path built from the folder given at construction and the dataset name."""
        # Previously read self.root, which this class never sets to the given
        # folder (the base class receives None); use the stored FolderName.
        return os.path.join(self.FolderName, self.dataset_name)

    @property
    def num_node_features(self):
        """Width of the node feature matrix."""
        return self.data.x.shape[1]

    @property
    def num_classes(self):
        """Largest label value plus one."""
        # Tensor reduction instead of builtin max(), which iterates the tensor
        # element by element in Python.
        return self.data.y.max().item()+1

    def len(self):
        """The dataset always holds exactly one graph."""
        return 1

    def get(self, idx):
        """Return the single stored graph regardless of ``idx``."""
        return self.data
    
# #'pokec', 'arxiv-year', 'snap-patents', 'twitch-gamer'
# dataset = LINKXpyg2('/scratch/gilbreth/das90/Dataset/','pokec', random_state=0)
# data = dataset[0]
# data

In [10]:
# DIR='/scratch/gilbreth/das90/Dataset/'
# dataset = Reddit(root=DIR+'Reddit')
# data = dataset[0]    

In [11]:
def get_data(DATASET_NAME='Cora', 
             DIR=None, params=None, 
             train=None, random_state=False, log=True, h_score=False, split_no=0):
    """Load a named graph dataset and return ``(data, dataset)``.

    Args:
        DATASET_NAME: Dataset identifier; selects one of the loader branches below.
        DIR: Dataset root directory. When None it is chosen from the host name
            ('gilbreth*' / 'unimodular*' clusters) and otherwise './Dataset/'.
            It is concatenated directly with sub-folder names, so it should end
            with a path separator.
        params: Dict with keys 'h' and 'k'; only read for 'RedditSynthetic'.
        train: If not None, the masks are regenerated with this training
            fraction and the remainder split evenly between val and test.
        random_state: Forwarded to ``train_val_test_mask`` / ``LINKXpyg2``
            (a flag: falsy selects the fixed-seed split).
        log: Print directory info and dataset statistics.
        h_score: Print node/edge/edge-insensitive homophily and assortativity.
        split_no: Column to select when the dataset ships 2-D (multi-split) masks.

    Returns:
        ``(data, dataset)`` in the normal case; ``(None, None)`` when the name
        is not recognised; the bare ``dataset`` for the SNAP datasets.

    Side effects:
        Creates ``DIR`` and ``DIR + 'RESULTS/'`` if they do not exist.
    """
    import platform

    if DIR is not None:
        if log: print('Looking at: ',DIR)
    else:
        # platform.node() is used instead of os.uname(), which is unavailable
        # on non-POSIX systems.
        hostname = platform.node()
        if hostname.startswith('gilbreth'): ##if not darwin(mac/locallaptop)
            DIR='/scratch/gilbreth/das90/Dataset/'
        elif hostname.startswith('unimodular'):
            DIR='/scratch2/das90/Dataset/'
        else:
            DIR='./Dataset/'

    Path(DIR).mkdir(parents=True, exist_ok=True)

    RESULTS_DIR=DIR+'RESULTS/'
    Path(RESULTS_DIR).mkdir(parents=True, exist_ok=True)

    if log:
        print("Data directory: ", DIR)
        print("Result directory:", RESULTS_DIR)
    
    from torch_geometric.datasets import Planetoid,  KarateClub, CitationFull
    from torch_geometric.transforms import NormalizeFeatures
    from torch_geometric.datasets import Reddit, Reddit2
    
    #DATASET_NAME='Cora' #"Cora", "CiteSeer", "PubMed"

    if DATASET_NAME in ["Cora", "CiteSeer", "PubMed"]:
        dataset = Planetoid(root=DIR+'Planetoid', name=DATASET_NAME, transform=NormalizeFeatures())
    
    elif DATASET_NAME in ['cora', 'cora_ml', 'citeseer', 'dblp', 'pubmed']:
        #['cora', 'cora_ml', 'citeseer', 'dblp', 'pubmed']
        dataset = CitationFull(root=DIR+'Citation', name=DATASET_NAME, transform=NormalizeFeatures())
    
    elif DATASET_NAME == "Reddit2":
        # Local notebook implementation shadows the torch_geometric Reddit2 imported above.
        from ipynb.fs.full.RedditTwo import Reddit2
        #dataset = Reddit2(root=DIR+'Reddit2', transform=NormalizeFeatures())
        dataset = Reddit2(root=DIR+'Reddit2')
        
    elif DATASET_NAME == "Reddit":
        #dataset = Reddit(root=DIR+'Reddit', transform=NormalizeFeatures())
        dataset = Reddit(root=DIR+'Reddit')    
        
    elif DATASET_NAME in ["RedditSynthetic", "Reddit0.125","Reddit0.225","Reddit0.325","Reddit0.425","Reddit0.525",
                          "Reddit0.625","Reddit0.725","Reddit0.825","Reddit0.925"]:
        
        h = k = 0
        
        if DATASET_NAME == "RedditSynthetic":
            h = params['h']
            k = params['k']
        
        else:
            # Name layout is 'Reddit' + h (three characters, e.g. '0.1') + k (two digits).
            k = int(DATASET_NAME[-2:])
            h = float(DATASET_NAME[-5:-2])
        
        dataset, data = RedditSynthetic(DIR, h=h, k=k, log=True, recompute=False)
                
    elif DATASET_NAME in ['ego-Gplus', 'gemsec-Facebook']:
        from torch_geometric.datasets import SNAPDataset        
        dataset = SNAPDataset(root=DIR+'SNAPDataset', name=DATASET_NAME, transform=NormalizeFeatures())        
        print(dataset)
        
        # Early exit: only the dataset is returned here, not a (data, dataset) pair.
        return dataset
    
    elif DATASET_NAME in ["BlogCatalog", "PPI", "Facebook", "Twitter", "TWeibo", "MAG"]:
        from torch_geometric.datasets import AttributedGraphDataset
        
#         dataset = AttributedGraphDataset(root=DIR+'/AttributedGraphDataset', name=DATASET_NAME, transform=NormalizeFeatures())
        dataset = AttributedGraphDataset(root=DIR+'/AttributedGraphDataset', name=DATASET_NAME)
        
        print(dataset)
        print(dataset[0])        
        
    elif DATASET_NAME == "AmazonProducts":
        #dataset = AmazonProducts(root=DIR+'AmazonProducts', transform=NormalizeFeatures())
        dataset = AmazonProducts(root=DIR+'AmazonProducts')
        
    elif DATASET_NAME in ['Computers', 'Photo']:
        #dataset = Amazon(root=DIR+'AmazonProducts', transform=NormalizeFeatures())
        dataset = Amazon(root=DIR+'Amazon/', name = DATASET_NAME)        

    elif DATASET_NAME in ['CS', 'Physics']:
        dataset = Coauthor(root=DIR+'Coauthor/', name = DATASET_NAME)        
        
    elif DATASET_NAME in ['WikiCS']:
        from torch_geometric.datasets import WikiCS
        dataset = WikiCS(root=DIR+'WikiCS/', is_undirected=False)
        # NOTE(review): if dataset[0] returns a fresh copy on each access, these
        # assignments do not persist to the dataset[0] read further below — confirm.
        dataset[0].train_mask = None
        dataset[0].test_mask = None
        dataset[0].val_mask = None

    elif DATASET_NAME in ['ogbn-proteins']:
        from ogb.nodeproppred import Evaluator, PygNodePropPredDataset
        # scatter is used by the ogbn-proteins post-processing further below.
        from torch_geometric.utils import scatter

        dataset = PygNodePropPredDataset('ogbn-proteins', root=DIR+'/ogbn-proteins')        
        
    elif DATASET_NAME == "Moon":
        # NOTE(review): MoonGraph is not imported in the visible cells; this
        # branch relies on it being defined elsewhere — confirm.
        dataset = MoonGraph.MoonDataset(n_samples=100, degree=5, train=0.5)    
        G, data =dataset[0]
    
    elif DATASET_NAME == "karate":
        dataset = KarateClub()        
        data = dataset[0]
        # Every non-training node serves as both validation and test node.
        data.val_mask = ~data.train_mask
        data.test_mask = data.val_mask
    
    elif DATASET_NAME == "Fake":
        dataset = FakeDataset(num_graphs = 1, 
                              avg_num_nodes = 2000, 
                              avg_degree = 10, 
                              num_channels = 64, 
                              edge_dim = 0, 
                              num_classes = 10, 
                              task = 'auto', 
                              is_undirected = True,                               
                              transform=NormalizeFeatures())
        
    elif DATASET_NAME == "OGB_MAG":
        #dataset = OGB_MAG(root=DIR+'OGB_MAG', preprocess='metapath2vec', transform=NormalizeFeatures())
        dataset = OGB_MAG(root=DIR+'OGB_MAG3', preprocess='metapath2vec')
        
        data = dataset[0]                
        # Keep only the 'paper' nodes and the paper-cites-paper edges as a homogeneous graph.
        data = Data(x=data['paper'].x, edge_index=data['paper', 'cites', 'paper'].edge_index,
                   train_mask=data['paper'].train_mask,val_mask = data['paper'].val_mask,test_mask = data['paper'].test_mask,y= data['paper'].y,)
        
        dataset = OGB_MAGcustom(DIR+'OGB_MAG3','OGB_MAG',data)
        
    elif DATASET_NAME == "Flickr":
        dataset = Flickr(root=DIR+'Flickr')
    
    elif DATASET_NAME == "Yelp":
        dataset = Yelp(root=DIR+'Yelp')
    
    elif DATASET_NAME == "PPI":
        # NOTE(review): unreachable — "PPI" is already matched by the
        # AttributedGraphDataset branch above. Kept until the intended loader is confirmed.
        dataset = PPI(root=DIR+'PPI')
        
        return dataset
    
    ###heterophilic dataset
    #https://github.com/pyg-team/pytorch_geometric/blob/master/examples/linkx.py
    elif DATASET_NAME in ['pokec', 'arxiv-year', 'snap-patents', 'twitch-gamer','wiki']:
        dataset = LINKXpyg2(DIR, DATASET_NAME, random_state=random_state)
    
    elif DATASET_NAME in ["penn94", "reed98", "amherst41", "cornell5", "johnshopkins55", "genius"]:
        from ipynb.fs.full.HeterophilousDataset import LINKXDataset
        
        dataset = LINKXDataset(root = DIR+'/Heterophilic/', name = DATASET_NAME)    
        #transform=NormalizeFeatures())
        
    elif DATASET_NAME in ["Roman-empire", "Amazon-ratings", "Minesweeper", "Tolokers", "Questions"]:
        from ipynb.fs.full.HeterophilousDataset import HeterophilousGraphDataset
        
        dataset = HeterophilousGraphDataset(DIR+'/Heterophilic/', DATASET_NAME)    
        #transform=NormalizeFeatures())
    elif DATASET_NAME == 'Actor':
        from ipynb.fs.full.HeterophilousDataset import Actor
        
        dataset = Actor(root=DIR+'/Heterophilic/Actor')
        #transform=NormalizeFeatures())
    
    elif DATASET_NAME in ["Cornell", "Texas", "Wisconsin"]:
        from ipynb.fs.full.HeterophilousDataset import WebKB
        
        dataset = WebKB(root = DIR+'/Heterophilic/', name = DATASET_NAME)
        #transform=NormalizeFeatures())
        
    elif DATASET_NAME in ["Chameleon", "Crocodile", "Squirrel"]:
        from ipynb.fs.full.HeterophilousDataset import WikipediaNetwork
        
        if DATASET_NAME == 'Crocodile':
            dataset = WikipediaNetwork(DIR+'/Heterophilic/', DATASET_NAME.lower(), geom_gcn_preprocess= False)
        else:        
            dataset = WikipediaNetwork(DIR+'/Heterophilic/', DATASET_NAME.lower())
            #transform=NormalizeFeatures())
    
    #implemented heterophily
    elif DATASET_NAME in ['chameleon','cornell','film', 'squirrel', 'texas','wisconsin']:
        dataset = HeteroDataset(DIR+'/heterophily/', DATASET_NAME)       
    
    else: 
        # Unknown name: callers receive (None, None) rather than an exception.
        return None, None

    if DATASET_NAME in ['Moon', 'karate','OGB_MAG'] or DATASET_NAME[:7]=='Reddit0' or DATASET_NAME in ['RedditSynthetic']:
        # `data` was already produced by the loader branch above.
        pass
    
    elif DATASET_NAME=='ogbn-proteins':
        # splitted_idx = dataset.get_idx_split()
        data = dataset[0]
        data.node_species = None
        data.y = data.y.to(torch.float)

        # Initialize features of nodes by aggregating edge features.
        row, col = data.edge_index
        data.x = scatter(data.edge_attr, col, dim_size=data.num_nodes, reduce='sum')

        data = train_val_test_mask(data, train=0.6, val=0.2, test=0.2, random_state=random_state)
    
    elif DATASET_NAME == 'Reddit2':
        
        d = dataset[0][0]
        data = Data(x = d['x'], y=d['y'], 
                    edge_index = d['edge_index'],
                    train_mask=d['train_mask'],
                    val_mask=d['val_mask'],
                    test_mask = d['test_mask'])
    
    else:
        data = dataset[0]  # Get the first graph object.
        if 'train_mask' not in data:
            data = train_val_test_mask(data, train=0.6, val=0.2, test=0.2, random_state=random_state)
            
        elif data.train_mask.dim()>1 and data.val_mask.dim()>1 and data.test_mask.dim()>1:
            # Multi-split masks: pick column split_no, or fall back to a fresh
            # split when that column does not exist.
            if data.train_mask.shape[1]>split_no:            
                data.train_mask = data.train_mask[:,split_no]
                data.val_mask = data.val_mask[:,split_no]
                data.test_mask = data.test_mask[:,split_no]
            else:
                data = train_val_test_mask(data, train=0.6, val=0.2, test=0.2, random_state=random_state)
        
    if train is not None:
        # Remainder after the training fraction is halved between val and test.
        val = (1-train)/2.0
        data = train_val_test_mask(data, train=train, val=val, test=1-(train+val), random_state=random_state)
        
    if log:
        print()
        print(f'Dataset: {dataset}:')
        print('======================')
        print(f'Number of graphs: {len(dataset)}')
        print(f'Number of features: {dataset.num_features}')
        print(f'Number of classes: {dataset.num_classes}')
        print()
        print(data)
        print('===========================================================================================================')

        # Gather some statistics about the graph.
        print(f'Number of nodes: {data.num_nodes}')
        print(f'Number of edges: {data.num_edges}')
        print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
        print(f'Number of training nodes: {data.train_mask.sum()}')
        print(f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}')
        print(f'Has isolated nodes: {data.has_isolated_nodes()}')
        print(f'Has self-loops: {data.has_self_loops()}')
        print(f'Is undirected: {data.is_undirected()}')
    
    if len(data.y.shape) == 1:
        labels = data.y
    else:
        if log: print("Testing homophily by converting multi-label to one-label")
        labels = data.y.argmax(dim=1)
        data.y = labels
    
    if torch.min(data.y)<0:
        if log: print("Shifting label to non-negative")
        data.y = data.y-torch.min(data.y)
    
    if h_score:
        print("N ",data.num_nodes, " E ",data.num_edges," d ",data.num_edges / data.num_nodes, end=' ')
        
        print(homophily(data.edge_index, labels, method='node'),homophily(data.edge_index, labels, method='edge'), end=' ')
        
        try:
            esen = homophily(data.edge_index, labels, method='edge_insensitive')
        except Exception:
            # Best effort: report -1 when this measure cannot be computed, but
            # no longer swallow KeyboardInterrupt / SystemExit.
            esen = -1        
        print(esen, end=' ')            
        print(assortativity(data.edge_index), end=' ')
        
    return data, dataset

In [12]:
def RedditSynthetic(DIR, h=0.5, k=25, log=False, recompute=False):
    """Build (or load from cache) an edge-subsampled variant of the Reddit graph.

    For every node, roughly ``k`` percent of its outgoing edges are kept; of
    those, a fraction ``h`` is drawn from same-label neighbours and the rest
    from different-label neighbours (each capped by availability). The result
    is cached at ``DIR + 'RedditSynthetic/Reddit' + str(h) + str(k)``.

    Sampling uses NumPy's global random state, so results are not reproducible
    unless the caller seeds it.

    Args:
        DIR: Dataset root directory (concatenated directly, so it should end
            with a path separator).
        h: Target fraction of kept edges that connect same-label nodes.
        k: Percentage of each node's edges to keep.
        log: Print progress and summary statistics.
        recompute: Rebuild the graph even if a cached file exists.

    Returns:
        ``(dataset, data)`` — the original Reddit dataset object and the
        subsampled graph.
    """
    file_path = DIR+'RedditSynthetic/Reddit'+str(h)+str(k)    
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    
    dataset = Reddit(root=DIR+'Reddit')
    
    if os.path.isfile(file_path) and not recompute:
        data = torch.load(file_path)  
        
        if log:
            print("loaded from: ",file_path)
        
        return dataset, data
    
    data = dataset[0]
    
    if log:
        print(data)
    
    E = data.num_edges
    N = data.num_nodes
    
    # The value of each stored entry is its edge number, so slicing a row
    # yields the ids of that node's outgoing edges.
    adj = SparseTensor(
        row=data.edge_index[0], col=data.edge_index[1],
        value=torch.arange(E, device=data.edge_index.device),
        sparse_sizes=(N, N))
    
    edge_numbers = []
    
    # tqdm manages (and always closes) the progress bar; disabled when log is off.
    for i in tqdm(range(N), desc='Nodes', disable=not log):
        _, col, edge = adj[i,:].coo()      
        
        if len(col) == 0: 
            continue
        y_current = data.y[i].item()
        
        same_label = data.y[col] == y_current
        match_indexes = same_label.nonzero().view(-1).numpy()
        other_indexes = (~same_label).nonzero().view(-1).numpy()
    
        select=int(len(col)*k/100) #select k percent of nodes
        h_select = min(int(select*h),len(match_indexes)) #select h homophilic nodes
        o_select = min(select-h_select, len(other_indexes))
        
        samples1 = np.random.choice(match_indexes, h_select, replace=False)
        samples2 = np.random.choice(other_indexes, o_select, replace=False)
                            
        edge_numbers.append(edge[samples1])
        edge_numbers.append(edge[samples2])
            
    edge_numbers = torch.cat(edge_numbers)    
    edge_index = data.edge_index[:,edge_numbers]
    
    if log:
        print(f'Average node degree: {edge_index.shape[1] / N:.2f}')    
        print(homophily(edge_index, data.y, method='node'),homophily(edge_index, data.y, method='edge'), end=' ')
    
    data.edge_index = edge_index
    
    torch.save(data, file_path)

    if log:
        print("Data saved to file: ",file_path)
        
    return dataset, data

# RedditSynthetic(DIR, h=0.5, k=25, log=True, recompute=False)

# for h in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
#     RedditSynthetic(DIR, h=h, k=25, log=True, recompute=True)

In [13]:
# from torch_geometric.datasets import SNAPDataset
# DIR='/scratch/gilbreth/das90/Dataset/'

In [14]:
# Dataset names iterated by dataset_properties(); each entry is passed to get_data().
new_datasets = [
    "Texas",
    "Cora",
    "Reddit",
    "AmazonProducts",
    
    "Photo",
    "WikiCS",
    "Reddit2",
    #"Amazon2M",
    
    "Squirrel",
    "penn94",
    "ogbn-proteins",
    "pokec"
]

In [15]:
# Smoke test: load the karate-club graph with logging and homophily scores enabled.
if __name__ == '__main__':   
    data, dataset = get_data('karate', log=True, h_score = True, split_no = 0)

    # Bare None as the last expression has no effect (no cell output is displayed).
    None

Data directory:  /scratch/gilbreth/das90/Dataset/
Result directory: /scratch/gilbreth/das90/Dataset/RESULTS/

Dataset: KarateClub():
Number of graphs: 1
Number of features: 34
Number of classes: 4

Data(x=[34, 34], edge_index=[2, 156], y=[34], train_mask=[34], val_mask=[34], test_mask=[34])
Number of nodes: 34
Number of edges: 156
Average node degree: 4.59
Number of training nodes: 4
Training node label rate: 0.12
Has isolated nodes: False
Has self-loops: False
Is undirected: True
N  34  E  156  d  4.588235294117647 0.8020520210266113 0.7564102411270142 0.6170591711997986 -0.4756128787994385 

In [16]:
# data, dataset = get_data('ogbn-proteins', log=True, h_score = True, split_no = 0)

## Dataset Stats

In [17]:
# Dataset names for the statistics section; entries commented out with
# "#error" were disabled by the author.
# NOTE(review): dataset_properties() below iterates new_datasets, not this
# list — confirm which list is intended.
datasets = [
    "Cornell",
    "Texas",
    "Wisconsin",
    "reed98",
    "amherst41",
    "penn94",
    "Roman-empire",
    "cornell5",
    "Squirrel",
    "johnshopkins55",
#     "AmazonProducts", #error
    "Actor",
    "Minesweeper",
    "Questions",
    "Chameleon",
    "Tolokers",
#     "Flickr", #error
#     "Yelp", #error
    "Amazon-ratings",
    "genius",
    "cora",
    "CiteSeer",
    "dblp",
    "Computers",
    "pubmed",
    "Reddit",
    "cora_ml",
    "Cora",
#     "Reddit2", #error
    "CS",
    "Photo",
    "Physics",
    "citeseer",    
    'pokec',
    'arxiv-year',
#     'snap-patents', #error
#     'twitch-gamer', #error
#     'wiki', #error
    
]

In [18]:
def adj_homophily(data,num_classes,log=True):
    """Compute the degree-adjusted edge homophily of a graph.

    With ``h_edge`` the edge homophily ratio, ``E`` the number of edges and
    ``D_c`` the number of edges whose source node has class ``c``, returns
    ``(h_edge - S) / (1 - S)`` where ``S = sum_c D_c**2 / E**2``.

    Args:
        data: Graph with ``edge_index``, 1-D non-negative integer ``y`` and
            ``num_edges``.
        num_classes: Number of classes (minimum length of the per-class tally).
        log: Kept for backward compatibility; the computation is vectorised,
            so there is no per-node progress bar to show any more.

    Returns:
        The adjusted homophily as a floating-point scalar.
    """
    h_edge = homophily(data.edge_index, data.y, method='edge')
    E = data.num_edges

    # Per-class count of edges by the label of their source node. This is the
    # quantity the earlier per-node loop accumulated (each node contributed
    # its number of outgoing edges to its own class), computed in one pass.
    source_labels = data.y[data.edge_index[0]].cpu().numpy()
    class_degree = np.bincount(source_labels, minlength=num_classes).astype(np.float64)

    D_k = np.sum(class_degree**2)/E**2

    return (h_edge - D_k)/(1-D_k)

# data, dataset = get_data('Tolokers', log=False, h_score = False)
# adj_homophily(data, dataset.num_classes, log=False)

In [19]:
def dataset_properties():
    """Print one summary line per dataset listed in ``new_datasets``.

    Each line shows the dataset name, the train/val/test node fractions, the
    feature count, the class count, Yes/No flags for isolated nodes,
    self-loops and undirectedness, and the adjusted homophily.
    """
    for name in new_datasets:
        print(name, end='\t')
        data, dataset = get_data(name, log=False, h_score = False)

        # Collapse multi-column labels to a single class index before counting.
        if len(data.y.shape) > 1:
            data.y = data.y.argmax(dim=1)
        num_classes = torch.max(data.y).item()+1

        tr = int(data.train_mask.sum()) / data.num_nodes
        va = int(data.val_mask.sum()) / data.num_nodes
        te = int(data.test_mask.sum()) / data.num_nodes

        print(f'tr/va/te {tr:0.2f}/{va:0.2f}/{te:0.2f}', end=' ')

        feature_count = dataset.num_features
        isolated_flag = "Yes" if data.has_isolated_nodes() else "No"
        self_loop_flag = "Yes" if data.has_self_loops() else "No"
        undirected_flag = "Yes" if data.is_undirected() else "No"
        print(feature_count, " ", num_classes, " ", isolated_flag, " ",
              self_loop_flag, " ", undirected_flag, end=' ')

        print('h_adj ', adj_homophily(data, num_classes, log=False))
    
    
# dataset_properties()

# Plot Homophily Distribution

In [20]:
def hp_compute(data):
    """Compute the local homophily of every node.

    For node ``i`` the value is the fraction of its outgoing edges
    ``(i, j)`` whose endpoint ``j`` has the same label as ``i``. Nodes with
    no outgoing edges get 0.

    The computation is vectorised over edges, so no per-node progress bar is
    displayed.

    Args:
        data: Graph with ``num_nodes``, a ``2 x E`` integer ``edge_index`` and
            1-D labels ``y``.

    Returns:
        NumPy float array of length ``num_nodes``.
    """
    N = data.num_nodes
    src, dst = data.edge_index

    # 1.0 for every edge whose two endpoints share a label, else 0.0.
    same_label = (data.y[src] == data.y[dst]).cpu().numpy().astype(np.float64)
    src_np = src.cpu().numpy()

    # Per source node: number of matching edges and total number of edges.
    matches = np.bincount(src_np, weights=same_label, minlength=N)
    out_degree = np.bincount(src_np, minlength=N)

    hp_data = np.zeros(N)
    has_neighbors = out_degree > 0
    # Divide only where the degree is non-zero; isolated nodes stay at 0.
    hp_data[has_neighbors] = matches[has_neighbors] / out_degree[has_neighbors]

    return hp_data

#hp_data = hp_compute(data)

In [21]:
import matplotlib.ticker as ticker
from matplotlib.ticker import ScalarFormatter, FormatStrFormatter, StrMethodFormatter

In [22]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.font_manager as fm
# Global matplotlib text defaults for every figure created after this cell.
# NOTE(review): the family name is spelled 'DeJavu Serif' (the font is usually
# written "DejaVu Serif") — confirm it resolves to the intended font.
plt.rcParams["font.family"] = 'DeJavu Serif'
plt.rcParams["font.serif"] = ["Times New Roman"]
plt.rcParams["font.size"] = 16

In [23]:
# https://github.com/matplotlib/matplotlib/issues/5862#issuecomment-197330145
def fix_eps(fpath):
    """Rewrite an EPS file in place, stripping the stray ``\\r\\r`` before ``Hebrew``.

    Works around carriage returns in EPS files caused by the Arial font: every
    occurrence of the byte sequence CR CR "Hebrew" is replaced by "Hebrew";
    all other bytes are left untouched.

    Args:
        fpath: Path of the EPS file to patch.
    """
    with open(fpath, "rb") as source:
        original = source.read()

    # The pattern contains no newline, so a whole-file replace is equivalent
    # to patching line by line.
    patched = original.replace(b"\r\rHebrew", b"Hebrew")

    with open(fpath, "wb") as target:
        target.write(patched)
            
def pd_hist(data, DATASET_NAME=''):
    """Plot a 25-bin histogram of node homophily values and save it to disk.

    Parameters
    ----------
    data : array-like
        Values to histogram (e.g. the array returned by ``hp_compute``).
    DATASET_NAME : str
        Suffix of the output files ``Plots/homophily_<DATASET_NAME>.pdf``
        and ``Plots/homophily_<DATASET_NAME>.eps``.

    Side effects
    ------------
    Updates the global matplotlib rc font sizes, shows the figure, creates
    the output directory when it is missing, writes the PDF and EPS files and
    post-processes the EPS with ``fix_eps``.
    """
    font_size = 16

    plt.rc('font', size=font_size)          # controls default text sizes
    plt.rc('axes', titlesize=font_size)     # fontsize of the axes title
    plt.rc('axes', labelsize=font_size)     # fontsize of the x and y labels
    plt.rc('xtick', labelsize=font_size)    # fontsize of the tick labels
    plt.rc('ytick', labelsize=font_size)    # fontsize of the tick labels
    plt.rc('legend', fontsize=font_size)    # legend fontsize

    fig = plt.figure(figsize = (5, 5), dpi=150)
    ax = plt.gca()
    ax.set_aspect('auto')
    fig.canvas.draw()

    # density=False is passed, so the first return value is what the y-axis
    # label says: a number of nodes per bin.
    counts, bins, _ = plt.hist(data, density=False, bins=25)

    # Overlay the per-bin counts as a line anchored at each bin's left edge.
    plt.plot(bins[:-1], counts)

    # Add labels
    plt.xlabel('Local Node homophily')
    plt.ylabel('Number of Nodes')

    class MathTextSciFormatter(ticker.Formatter):
        """Tick formatter rendering e.g. ``1.5e+03`` as ``$1.5{\\times}10^{3}$``."""

        def __init__(self, fmt="%1.1e"):
            self.fmt = fmt

        def __call__(self, x, pos=None):
            s = self.fmt % x
            decimal_point = '.'
            positive_sign = '+'
            tup = s.split('e')
            significand = tup[0].rstrip(decimal_point)
            # Keep a leading '-' on the exponent, drop a leading '+'.
            sign = tup[1][0].replace(positive_sign, '')
            exponent = tup[1][1:].lstrip('0')
            if exponent:
                exponent = '10^{%s%s}' % (sign, exponent)
            if significand and exponent:
                s =  r'%s{\times}%s' % (significand, exponent)
            else:
                # Exponent of zero: show the significand only.
                s =  r'%s%s' % (significand, exponent)
            return "${}$".format(s)

    plt.gca().yaxis.set_major_formatter(MathTextSciFormatter("%1.1e"))

    filename="Plots/homophily_"+DATASET_NAME

    # savefig does not create missing directories; make sure the target exists.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    # Show the plot
    plt.show()
    fig.savefig(filename + '.pdf', format = 'pdf', bbox_inches='tight')
    fig.savefig(filename + '.eps', format = 'eps', bbox_inches='tight', dpi = fig.dpi)
    fix_eps(filename + '.eps')

    return

# hp_data = [0.01, 0.1,0.6]
# pd_hist(hp_data, "test")

In [24]:
# # datasets = ['Cora', 'karate']
# for DATASET_NAME in datasets:
#     data, dataset = get_data(DATASET_NAME, log=False, h_score = False)
#     hp_data = hp_compute(data)
#     pd_hist(hp_data, DATASET_NAME)

## Generate Synthetic

In [25]:
def balance_class(data):
    """Down-sample ``data`` so that every class keeps the same number of nodes.

    Each class present in ``data.y`` is reduced to the size of the smallest
    class by sampling nodes without replacement (global ``np.random`` state).
    Only edges whose two endpoints are both kept survive, and their endpoints
    are relabelled to ``0 .. num_kept - 1`` so that ``edge_index`` stays
    consistent with the re-indexed node tensors.

    Parameters
    ----------
    data : graph object exposing ``y``, ``edge_index``, ``num_nodes``,
        ``num_edges``, iteration over ``(key, value)`` pairs and item
        assignment.  It is modified in place.

    Returns
    -------
    The same ``data`` object.  Tensors whose first dimension equals the old
    node count are indexed by the kept nodes; tensors whose first dimension
    equals the old edge count are filtered with the kept-edge mask.
    """
    # Sizes of the ORIGINAL graph: used to recognise node-/edge-level tensors.
    N = data.num_nodes
    E = data.num_edges
    device = data.y.device

    labels = data.y.cpu()  # numpy sampling needs host memory
    unique_elements, counts = torch.unique(labels, return_counts=True)
    if unique_elements.numel() == 0:
        # No labelled nodes: nothing to balance.
        return data
    mincount = int(counts.min())

    subset = []
    for i in unique_elements:
        indexes = (labels == i).nonzero().view(-1).numpy()
        samples = np.random.choice(indexes, mincount, replace=False)
        subset.extend(samples.tolist())

    node_idx = torch.tensor(subset, dtype=torch.long, device=device)

    # Keep an edge only when both of its endpoints were sampled.
    keep = torch.zeros(N, dtype=torch.bool, device=device)
    keep[node_idx] = True
    edge_mask = keep[data.edge_index[0]] & keep[data.edge_index[1]]

    # Map old node id -> position in ``node_idx`` (the new node id).
    relabel = torch.full((N,), -1, dtype=torch.long, device=device)
    relabel[node_idx] = torch.arange(node_idx.size(0), device=device)
    edge_index = relabel[data.edge_index[:, edge_mask]]

    # Snapshot the attributes before overwriting anything.
    entries = [(key, item) for key, item in data]

    data.num_nodes = node_idx.size(0)
    data.edge_index = edge_index

    for key, item in entries:
        if key in ['edge_index', 'num_nodes']:
            continue
        if not isinstance(item, torch.Tensor) or item.dim() == 0:
            continue
        if item.size(0) == N:
            data[key] = item[node_idx]
        elif item.size(0) == E:
            data[key] = item[edge_mask]

    return data

# print(data.edge_index)
# data = balance_class(data)
# print(data.edge_index)

In [36]:
def generate_synthetic(data, d=5, h=0.8, train=0.6, random_state=None, log=True, balance = False):
    """Replace the edges of ``data`` with a synthetic graph of target homophily.

    Every node ``u`` receives ``round(d*h)`` outgoing edges to nodes of its
    own class (never to itself) and ``round(d*(1-h))`` outgoing edges to
    nodes of other classes, each set sampled without replacement with the
    global ``np.random`` state.  When a pool is smaller than the requested
    count, the whole pool is used.  Only the direction ``u -> v`` is added.

    Parameters
    ----------
    data : graph object exposing ``y`` (integer class labels); modified in
        place (``edge_index`` is overwritten).
    d : number
        Target out-degree per node.
    h : float
        Target fraction of same-class edges.
    train : float or None
        Training fraction; when not ``None`` the masks are rebuilt via
        ``train_val_test_mask`` with the remainder split evenly between
        validation and test.
    random_state :
        Forwarded to ``train_val_test_mask``.
    log : bool
        Print degree / homophily / assortativity / class statistics.
    balance : bool
        Run ``balance_class`` on ``data`` first.

    Returns
    -------
    The updated ``data`` object.
    """
    if balance:
        data = balance_class(data)

    labels = data.y.cpu()  # numpy sampling needs host memory
    num_class = int(labels.max()) + 1
    cluster_vertices = {
        c: torch.where(labels == c)[0].numpy() for c in range(num_class)
    }

    # Same-class / other-class out-degree, identical for every node.
    intra_d = int(np.round(d * h))
    inter_d = int(np.round(d * (1 - h)))

    def _draw(pool, k):
        # Sample min(k, len(pool)) distinct entries; skip the numpy call
        # entirely when there is nothing to draw.
        k = min(len(pool), k)
        if k <= 0:
            return np.empty(0, dtype=np.int64)
        return np.random.choice(pool, k, replace=False)

    sources, targets = [], []

    for c in range(num_class):
        intra_vertices = cluster_vertices[c]
        others = [value for key, value in cluster_vertices.items() if key != c]
        # With a single class there is no other class to draw from.
        inter_vertices = np.concatenate(others) if others else np.empty(0, dtype=np.int64)

        for u in intra_vertices:
            # Exclude u itself so that no self-loop is generated.
            intra_v = _draw(intra_vertices[intra_vertices != u], intra_d)
            inter_v = _draw(inter_vertices, inter_d)

            Vs = np.concatenate([intra_v, inter_v])
            sources.append(np.full(len(Vs), u, dtype=np.int64))
            targets.append(Vs)

    if sources:
        edges = np.stack([np.concatenate(sources), np.concatenate(targets)])
        edge_index = torch.from_numpy(edges).long()
    else:
        edge_index = torch.empty((2, 0), dtype=torch.long)
    data.edge_index = edge_index.to(data.y.device)

    if train is not None:
        val = (1-train)/2.0
        data = train_val_test_mask(data, train=train, val=val, test=1-(train+val), random_state=random_state)

    if log:
        print(f"d={data.num_edges/data.num_nodes:0.4f}",end=' ')
        print(f"Hn={homophily(data.edge_index, data.y, method='node'):0.4f}",end=' ')
        print(f"He={homophily(data.edge_index, data.y, method='edge'):0.4f}",end=' ')
        print(f"Hin={homophily(data.edge_index, data.y, method='edge_insensitive'):0.4f}",end=' ')
        print(f"a={assortativity(data.edge_index):0.4f}", end=' ')

        unique_elements, counts = torch.unique(data.y, return_counts=True)
        print("c=",unique_elements.tolist(), counts.tolist(), end=' ')

        # The mask may be absent when train=None and the input had no split.
        train_mask = getattr(data, 'train_mask', None)
        if train_mask is not None:
            print(f'#c={int(train_mask.sum()) / data.num_nodes:.4f}', end=' ')

    return data


# data, dataset = get_data('Cora', log=False, h_score = True, split_no = 0)
# data = generate_synthetic(data, d=10, h=0.0, train=0.2, random_state=None, log=True, balance = True)
# data

N  2708  E  10556  d  3.8980797636632203 0.825157880783081 0.8099659085273743 0.7657181620597839 -0.06587088108062744 d=10.0000 Hn=0.0000 He=0.0000 Hin=0.0000 a=inf c= [0, 1, 2, 3, 4, 5, 6] [180, 180, 180, 180, 180, 180, 180] #c=0.2000 

Data(x=[1260, 1433], edge_index=[2, 12600], y=[1260], train_mask=[1260], val_mask=[1260], test_mask=[1260], num_nodes=1260)

## Small Synthetic

In [27]:
# x = torch.Tensor([[1,0],[1,0],[1,0],[0,1],[0,1],[0,1],[0,1]])
# y = torch.LongTensor([0,0,0, 1, 1, 1, 1])
# edge_index = torch.LongTensor([[1,2],[1,4],[1,5],[2,1],[3,6],[3,7],[4,5],[4,1],[4,6],[4,7],[5,1],[5,4],[5,6],[6,3],[6,4],[6,5],[6,7],[7,3],[7,4],[7,6]]).T
# edge_index = edge_index-1
# data = Data(x=x, y=y, edge_index = edge_index)
# draw_graph(edge_index, y, 7)