## Implementing our new load_data function to work within the new Pyro implementation

In [1]:
import pickle as pkl
import numpy as np
import scipy.sparse as sp
import torch
import networkx as nx
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score
import matplotlib.pyplot as plt

In [2]:
### Auxiliary function
def parse_index_file(filename):
    """Read a text file holding one integer index per line.

    Args:
        filename: Path of the index file to read.

    Returns:
        A list of ints, in the order they appear in the file. Lines that are
        empty or contain only whitespace are skipped.

    Raises:
        ValueError: If a non-blank line cannot be parsed as an integer.
    """
    # The context manager closes the handle deterministically instead of
    # leaving it to the garbage collector.
    with open(filename) as index_file:
        # int() tolerates surrounding whitespace, so only fully blank lines
        # need to be filtered out.
        return [int(line) for line in index_file if line.strip()]

def old_load_data(dataset):
    """Load a citation dataset from the pickled ``ind.<dataset>.*`` files.

    Reads the ``x``, ``tx``, ``allx`` and ``graph`` pickles and the
    ``test.index`` file from ``../data/paperData/``, stacks ``allx`` and
    ``tx`` into one feature matrix, and moves the test rows to the positions
    given by the index file.

    Args:
        dataset: Dataset name as used in the file names (e.g. ``'cora'``).
            ``'citeseer'`` gets extra handling for test indices that are
            missing from ``tx``.

    Returns:
        A tuple ``(adj, features)``: ``adj`` is the result of
        ``nx.adjacency_matrix`` on the pickled graph, and ``features`` is a
        dense ``torch.FloatTensor`` of the stacked feature rows.
    """
    # load the data: x, tx, allx, graph
    names = ['x', 'tx', 'allx', 'graph']
    objects = []
    for name in names:
        with open("../data/paperData/ind.{}.{}".format(dataset, name), 'rb') as rf:
            # encoding='latin1' is the public way to get the pickle
            # compatibility the private _Unpickler was used for
            # (presumably the files were written under Python 2).
            # NOTE: unpickling can execute arbitrary code; only load trusted files.
            objects.append(pkl.load(rf, encoding='latin1'))
    x, tx, allx, graph = objects

    test_idx_reorder = parse_index_file(
        "../data/paperData/ind.{}.test.index".format(dataset))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(
            min(test_idx_reorder), max(test_idx_reorder) + 1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        tx = tx_extended
        print(tx.shape)
        print(allx.shape)

    features = sp.vstack((allx, tx)).tolil()
    # Test rows are stored in index-file order; write each one to the row
    # named by its index.
    features[test_idx_reorder, :] = features[test_idx_range, :]

    features = torch.FloatTensor(np.array(features.todense()))
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    return adj, features

# Load Cora with the pickle-based loader; these outputs serve as the reference
# that the new loader is compared against.
oldadj, oldfeatures = old_load_data('cora')

In [3]:
# Inspect the reference outputs: shape and type of each, plus the feature
# tensor itself.
print('Old adj     : ', oldadj.shape, type(oldadj))
print('Old features: ', oldfeatures, oldfeatures.shape, type(oldfeatures))

Old adj     :  (2708, 2708) <class 'scipy.sparse.csr.csr_matrix'>
Old features:  tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]) torch.Size([2708, 1433]) <class 'torch.Tensor'>


In [4]:
from torch_geometric.data import DataLoader
from torch_geometric.datasets import MNISTSuperpixels, Planetoid
import torch_geometric.transforms as T
from collections import defaultdict

In [5]:
def new_load_data(dataset):
    """Load a Planetoid citation dataset through torch_geometric.

    Args:
        dataset: Dataset name, matched case-insensitively against
            ``'cora'``, ``'citeseer'`` and ``'pubmed'``.

    Returns:
        A tuple ``(adj, features)``: ``adj`` is the result of
        ``nx.adjacency_matrix`` on a graph with one node per feature row, in
        row order, and ``features`` is a float32 copy of the dataset's ``x``
        tensor.

    Raises:
        ValueError: If ``dataset`` is not one of the supported names.
    """
    planetoids = ['CORA', 'CITESEER', 'PUBMED']

    if dataset.upper() not in planetoids:
        # Without this guard the function fell through and failed later on an
        # unbound `df`, which hid the real cause.
        raise ValueError(
            "Unsupported dataset {!r}; expected one of {}".format(
                dataset, planetoids))

    path = '../data/geometric/' + dataset.upper()
    train_loader = Planetoid(path, dataset)
    df = train_loader[0]
    print(df['x'])

    # Keep the feature rows in their stored order. Splitting by test_mask and
    # re-stacking only preserves the order when the test nodes form a trailing
    # block; otherwise feature rows stop lining up with adjacency rows.
    # NOTE(review): assumes df['x'] is already indexed by node id (including
    # rows for isolated CiteSeer nodes) - confirm against the Planetoid docs.
    features = df['x'].detach().clone().to(torch.float32)

    # Seed every node id so nodes that never appear as an edge source keep
    # their row/column, and the node order is 0..N-1 regardless of edge order.
    num_nodes = features.shape[0]
    neighbours = {node: [] for node in range(num_nodes)}
    for src, dst in df['edge_index'].t().tolist():
        neighbours[src].append(dst)

    adj = nx.adjacency_matrix(nx.from_dict_of_lists(neighbours))

    return adj, features


In [6]:
# Load Cora again, this time through the torch_geometric-based loader.
adj, features = new_load_data('cora')

# Print the same summary as for the old loader so the two can be compared.
print('New adj     : ', adj.shape, type(adj))
print('New features: ', features, features.shape, type(features))

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
New adj     :  (2708, 2708) <class 'scipy.sparse.csr.csr_matrix'>
New features:  tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]) torch.Size([2708, 1433]) <class 'torch.Tensor'>


In [7]:
# Check that the new loader reproduces the old loader's output exactly:
# the feature comparison is printed, the comparison of the densified
# adjacency matrices is the cell's result.
print(np.array_equal(np.array(oldfeatures), np.array(features)))
np.array_equal(np.array(adj.todense()), np.array(oldadj.todense()))

True


True

In [8]:
### Test mask_test_edges function, possibly implement with masking instead

In [9]:
# Rebuild a NetworkX graph from the sparse adjacency matrix.
# NOTE(review): from_scipy_sparse_matrix was removed in NetworkX 3.0 in favour
# of from_scipy_sparse_array - confirm which NetworkX version is pinned.
G = nx.from_scipy_sparse_matrix(adj)