## Implementing our new load_data function to work within the new Pyro implementation

In [1]:
import pickle as pkl
import numpy as np
import scipy.sparse as sp
import torch
import networkx as nx
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score
import matplotlib.pyplot as plt

In [2]:
### Auxiliary function
def parse_index_file(filename):
    """Read a text file that holds one integer per line.

    Args:
        filename: Path of the index file to read.

    Returns:
        list[int]: The parsed integers, in the order they appear in the file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a non-blank line cannot be parsed as an integer.
    """
    index = []
    # Use a context manager so the handle is closed deterministically instead
    # of being left for the garbage collector.
    with open(filename) as index_file:
        for line in index_file:
            entry = line.strip()
            # Skip blank lines (e.g. a trailing empty line) rather than
            # failing on int('').
            if entry:
                index.append(int(entry))
    return index

def old_load_data(dataset):
    """Load a citation dataset from the pickled ``ind.<dataset>.*`` files.

    Reads ``ind.<dataset>.x``, ``.tx``, ``.allx`` and ``.graph`` plus
    ``ind.<dataset>.test.index`` from ``../data/paperData/``.

    Args:
        dataset: Dataset name used in the file names (``'cora'`` is what this
            notebook loads; ``'citeseer'`` gets extra padding, see below).

    Returns:
        tuple: ``(adj, features)``. ``adj`` is the result of
        ``nx.adjacency_matrix`` on the graph built from the pickled
        dict-of-lists. ``features`` is a dense ``torch.FloatTensor`` made from
        ``allx`` stacked on top of ``tx``, with the test rows moved to the
        positions listed in the test index file.
    """
    # load the data: x, tx, allx, graph
    names = ['x', 'tx', 'allx', 'graph']
    objects = []
    for name in names:
        # NOTE(review): unpickling can execute arbitrary code; only load
        # files from a trusted source.
        with open("../data/paperData/ind.{}.{}".format(dataset, name), 'rb') as rf:
            # Fixed pickle compatibility: decode byte strings as latin1 via
            # the public pickle API (presumably the files were written by
            # Python 2 -- TODO confirm).
            objects.append(pkl.load(rf, encoding='latin1'))
    x, tx, allx, graph = tuple(objects)

    test_idx_reorder = parse_index_file(
        "../data/paperData/ind.{}.test.index".format(dataset))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(
            min(test_idx_reorder), max(test_idx_reorder) + 1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        # The padded matrix must replace tx; otherwise the padding is thrown
        # away and the stacked feature matrix is too short for the
        # reordering below.
        tx = tx_extended

    features = sp.vstack((allx, tx)).tolil()
    # Put each test row at the index given by the (unsorted) test index file.
    features[test_idx_reorder, :] = features[test_idx_range, :]

    features = torch.FloatTensor(np.array(features.todense()))
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    return adj, features

# Load Cora with the pickle-based loader; these values serve as the reference
# that the new loader's output is compared against later in the notebook.
oldadj, oldfeatures = old_load_data('cora')

In [3]:
# Show shape and type of the reference adjacency, and the value, shape and
# type of the reference feature tensor.
print('Old adj     : ', oldadj.shape, type(oldadj))
print('Old features: ', oldfeatures, oldfeatures.shape, type(oldfeatures))

Old adj     :  (2708, 2708) <class 'scipy.sparse.csr.csr_matrix'>
Old features:  tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]) torch.Size([2708, 1433]) <class 'torch.Tensor'>


In [4]:
from torch_geometric.data import DataLoader
from torch_geometric.datasets import MNISTSuperpixels, Planetoid
import torch_geometric.transforms as T
from collections import defaultdict

In [5]:
def new_load_data(dataset):
    """Load a citation dataset through torch_geometric's ``Planetoid`` loader.

    Only ``'cora'`` is supported; the data is read from (or downloaded to)
    ``../data/geometric/CORA``.

    Args:
        dataset: Dataset name. Must be ``'cora'``.

    Returns:
        tuple: ``(adj, features)``. ``adj`` is the ``nx.adjacency_matrix`` of
        the undirected graph built from ``edge_index``, with rows/columns in
        node-index order ``0 .. num_nodes - 1``. ``features`` is a float32
        ``torch.Tensor`` copy of the node feature matrix in the same node
        order.

    Raises:
        ValueError: If ``dataset`` is not ``'cora'``. Previously the function
            fell through and returned ``None``, which only surfaced later as
            an unpacking error at the call site.
    """
    if dataset != 'cora':
        raise ValueError(
            "new_load_data only supports 'cora', got {!r}".format(dataset))

    path = '../data/geometric/CORA'
    # The loaded collection holds a single graph; keep it under its own name
    # instead of overwriting the `dataset` argument.
    data = Planetoid(path, dataset)[0]

    # Take the features in the dataset's own node order. Re-stacking them as
    # "non-test rows, then test rows" only matches the adjacency's node order
    # when the test nodes are the trailing ones; using x directly is correct
    # regardless of where the test mask lies.
    features = data['x'].clone().to(torch.float32)
    num_nodes = features.shape[0]

    # Group targets by source node: {source: [target, ...]}.
    neighbours = defaultdict(list)
    for source, target in data['edge_index'].t().tolist():
        neighbours[source].append(target)

    graph = nx.from_dict_of_lists(neighbours)
    # Nodes without any edge never show up in edge_index; add them so the
    # adjacency has one row per feature row.
    graph.add_nodes_from(range(num_nodes))
    # Pin the row/column order to the node indices rather than relying on
    # the order in which nodes happened to be inserted into the graph.
    adj = nx.adjacency_matrix(graph, nodelist=list(range(num_nodes)))

    return adj, features


In [6]:
# Load Cora through the torch_geometric-based loader.
adj, features = new_load_data('cora')

# Print in the same layout as the earlier "Old adj / Old features" cell so the
# two outputs can be compared by eye.
print('New adj     : ', adj.shape, type(adj))
print('New features: ', features, features.shape, type(features))

New adj     :  (2708, 2708) <class 'scipy.sparse.csr.csr_matrix'>
New features:  tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]) torch.Size([2708, 1433]) <class 'torch.Tensor'>


In [7]:
# # Testing  test_idx_range = np.sort(test_idx_reorder)
# path = '../data/geometric/CORA'
# train_loader = Planetoid(path, 'cora')

# dataset = train_loader[0]

# print(np.nonzero(dataset['test_mask']))

In [8]:
### TODO: test the mask_test_edges function; possibly reimplement it using masking instead

In [9]:
# print(np.array(features))
# Element-wise equality check between the old and new feature tensors.
print(np.array_equal(np.array(oldfeatures), np.array(features)))

True


In [10]:
# Densify both adjacency matrices and compare them element-wise.
np.array_equal(np.array(adj.todense()), np.array(oldadj.todense()))

True

In [13]:
# Print the new adjacency matrix's stored (row, col) value entries.
print(adj)

  (0, 633)	1
  (0, 1862)	1
  (0, 2582)	1
  (1, 2)	1
  (1, 652)	1
  (1, 654)	1
  (2, 1)	1
  (2, 332)	1
  (2, 1454)	1
  (2, 1666)	1
  (2, 1986)	1
  (3, 2544)	1
  (4, 1016)	1
  (4, 1256)	1
  (4, 1761)	1
  (4, 2175)	1
  (4, 2176)	1
  (5, 1629)	1
  (5, 1659)	1
  (5, 2546)	1
  (6, 373)	1
  (6, 1042)	1
  (6, 1416)	1
  (6, 1602)	1
  (7, 208)	1
  :	:
  (2694, 431)	1
  (2694, 2695)	1
  (2695, 431)	1
  (2695, 2694)	1
  (2696, 2615)	1
  (2697, 986)	1
  (2698, 1400)	1
  (2698, 1573)	1
  (2699, 2630)	1
  (2700, 1151)	1
  (2701, 44)	1
  (2701, 2624)	1
  (2702, 186)	1
  (2702, 1536)	1
  (2703, 1298)	1
  (2704, 641)	1
  (2705, 287)	1
  (2706, 165)	1
  (2706, 169)	1
  (2706, 1473)	1
  (2706, 2707)	1
  (2707, 165)	1
  (2707, 598)	1
  (2707, 1473)	1
  (2707, 2706)	1


In [14]:
# Print the old adjacency matrix's stored (row, col) value entries.
print(oldadj)

  (0, 633)	1
  (0, 1862)	1
  (0, 2582)	1
  (1, 2)	1
  (1, 652)	1
  (1, 654)	1
  (2, 1)	1
  (2, 332)	1
  (2, 1454)	1
  (2, 1666)	1
  (2, 1986)	1
  (3, 2544)	1
  (4, 1016)	1
  (4, 1256)	1
  (4, 1761)	1
  (4, 2175)	1
  (4, 2176)	1
  (5, 1629)	1
  (5, 1659)	1
  (5, 2546)	1
  (6, 373)	1
  (6, 1042)	1
  (6, 1416)	1
  (6, 1602)	1
  (7, 208)	1
  :	:
  (2694, 431)	1
  (2694, 2695)	1
  (2695, 431)	1
  (2695, 2694)	1
  (2696, 2615)	1
  (2697, 986)	1
  (2698, 1400)	1
  (2698, 1573)	1
  (2699, 2630)	1
  (2700, 1151)	1
  (2701, 44)	1
  (2701, 2624)	1
  (2702, 186)	1
  (2702, 1536)	1
  (2703, 1298)	1
  (2704, 641)	1
  (2705, 287)	1
  (2706, 165)	1
  (2706, 169)	1
  (2706, 1473)	1
  (2706, 2707)	1
  (2707, 165)	1
  (2707, 598)	1
  (2707, 1473)	1
  (2707, 2706)	1
