# Objective

Investigate the graph data loader, in order to better understand the structure of the data. I will do this by copying and pasting the code in `GraphRNN/main.py` and running line by line in this notebook. This will allow me to get a hands-on look at the data structures.

# main

The code from `main.py`:

In [1]:
from train import *

In [2]:
# Build the run configuration and expose the selected GPU index to CUDA.
args = Args()
os.environ['CUDA_VISIBLE_DEVICES'] = str(args.cuda)
print('CUDA', args.cuda)
print('File name prefix',args.fname)
# Make sure every output directory exists. exist_ok=True makes this
# idempotent on re-runs and avoids the check-then-create race of
# a separate os.path.isdir() test.
for save_path in (
    args.model_save_path,
    args.graph_save_path,
    args.figure_save_path,
    args.timing_save_path,
    args.figure_prediction_save_path,
    args.nll_save_path,
):
    os.makedirs(save_path, exist_ok=True)


CUDA 1
File name prefix GraphRNN_RNN_grid_4_128_


In [5]:
# Timestamp string used to give this run its own tensorboard directory.
time = strftime("%Y-%m-%d %H:%M:%S", gmtime())
# logging.basicConfig(filename='logs/train' + time + '.log', level=logging.DEBUG)
# When requested, remove any existing tensorboard directory before
# configuring the logger for the new run.
if args.clean_tensorboard and os.path.isdir("tensorboard"):
    shutil.rmtree("tensorboard")
configure("tensorboard/run"+time, flush_secs=5)

In [6]:
# Build the list of graphs for the configured run; everything below
# (split, statistics, dataset) is derived from this list.
graphs = create_graphs.create(args)

In [10]:
# split datasets
# Fix the seed so the in-place shuffle below is reproducible.
# NOTE(review): shuffle mutates `graphs`, so re-running this cell on an
# already-shuffled list yields a different ordering than a fresh run.
random.seed(123)
shuffle(graphs)
graphs_len = len(graphs)
train_end = int(0.8 * graphs_len)
validate_end = int(0.2 * graphs_len)
graphs_train = graphs[:train_end]
graphs_test = graphs[train_end:]
# NOTE(review): the validation slice is taken from the front of the list,
# so it is a subset of graphs_train rather than a held-out split.
graphs_validate = graphs[:validate_end]

In [11]:
# Mean number of nodes per graph in the validation split.
graph_validate_len = sum(g.number_of_nodes() for g in graphs_validate) / len(graphs_validate)
print('graph_validate_len', graph_validate_len)

# Mean number of nodes per graph in the test split.
graph_test_len = sum(g.number_of_nodes() for g in graphs_test) / len(graphs_test)
print('graph_test_len', graph_test_len)

graph_validate_len 199.5
graph_test_len 215.0


In [15]:
# Collect per-graph sizes once, then derive the extremes from them.
node_counts = [g.number_of_nodes() for g in graphs]
edge_counts = [g.number_of_edges() for g in graphs]
args.max_num_node = max(node_counts)
max_num_edge = max(edge_counts)
min_num_edge = min(edge_counts)

# args.max_num_node = 2000
# show graphs statistics
print('total graph num: {}, training set: {}'.format(len(graphs),len(graphs_train)))
print('max number node: {}'.format(args.max_num_node))
print('max/min number edge: {}; {}'.format(max_num_edge,min_num_edge))
print('max previous node: {}'.format(args.max_prev_node))

total graph num: 100, training set: 80
max number node: 361
max/min number edge: 684; 180
max previous node: 40


In [16]:
# save ground truth graphs
## Both files receive the full (unsplit) graph list, so the train and
## test sets must be sliced manually after loading.
train_graphs_file = args.graph_save_path + args.fname_train + '0.dat'
test_graphs_file = args.graph_save_path + args.fname_test + '0.dat'
save_graph_list(graphs, train_graphs_file)
save_graph_list(graphs, test_graphs_file)
print('train and test graphs saved at: ', test_graphs_file)

train and test graphs saved at:  ./graphs/GraphRNN_RNN_grid_4_128_test_0.dat


In [17]:
### dataset initialization
# Select the sequence sampler for this run. 'nobfs' (from args.note) and
# 'barabasi_noise' (from args.graph_type) pick specialised samplers;
# anything else uses the default sampler.
use_nobfs = 'nobfs' in args.note
if use_nobfs:
    print('nobfs')
    dataset = Graph_sequence_sampler_pytorch_nobfs(graphs_train, max_num_node=args.max_num_node)
    args.max_prev_node = args.max_num_node-1
if 'barabasi_noise' in args.graph_type:
    print('barabasi_noise')
    dataset = Graph_sequence_sampler_pytorch_canonical(graphs_train,max_prev_node=args.max_prev_node)
    args.max_prev_node = args.max_num_node - 1
elif not use_nobfs:
    # Default sampler. Guarded so it no longer overwrites the nobfs dataset:
    # the fallback used to be attached only to the 'barabasi_noise' test and
    # therefore also ran (and replaced `dataset`) for plain 'nobfs' runs.
    dataset = Graph_sequence_sampler_pytorch(graphs_train,max_prev_node=args.max_prev_node,max_num_node=args.max_num_node)
# Draw batch_size * batch_ratio samples, with replacement and equal weight
# for every dataset entry.
sample_strategy = torch.utils.data.sampler.WeightedRandomSampler([1.0 / len(dataset) for _ in range(len(dataset))],
                                                                 num_samples=args.batch_size*args.batch_ratio, replacement=True)
dataset_loader = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size, num_workers=args.num_workers,
                                           sampler=sample_strategy)

In [21]:
# Pull a single batch from the loader for inspection. Despite the name,
# this batch comes from dataset_loader, which is built on graphs_train.
test_graph = next(iter(dataset_loader))

In [24]:
# Shape of the 'x' tensor in the sampled batch.
test_graph['x'].shape

torch.Size([32, 361, 40])

In [25]:
# Shape of the 'y' tensor in the sampled batch.
test_graph['y'].shape

torch.Size([32, 361, 40])

In [27]:
# Per-sample 'len' values of the sampled batch (one entry per batch element).
test_graph['len']

tensor([169, 221, 306, 170, 110, 208, 130, 234, 221, 120, 132, 204, 176, 247,
        154, 238, 228, 306, 228, 204, 209, 304, 170, 195, 224, 266, 228, 208,
        168, 168, 204, 240])

In [28]:
# Configured batch size, for comparison with dim 0 of the batch tensors.
args.batch_size

32

The structure of the data is a batch of 32 graphs (dim 0), each padded to 361 sequence steps (dim 1), which matches `max number node: 361`. Each step holds one node's adjacency vector to the nodes in the BFS frontier, with size `max previous node: 40` (dim 2).

The x and y tensors should be the same sequence offset by one step, i.e. x from step 1 onward should equal y up to the second-to-last step:

In [34]:
# 'x' for batch element 0 with its first step dropped.
test_graph['x'][0, 1:, :]

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)

In [33]:
# 'y' for batch element 0 with its last step dropped.
test_graph['y'][0, 0:-1, :]

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)

In [37]:
# Exact equality check: x shifted forward by one step vs. y without its
# last step, for batch element 0.
torch.equal(test_graph['x'][0, 1:, :], test_graph['y'][0, 0:-1, :])

True

So, the representation for each node is an adjacency vector of length 40. What do the SOS and EOS tokens look like?

In [39]:
# Sum of the entries in step 0 of 'x' for batch element 0 (candidate SOS row).
test_graph['x'][0, 0, :].sum()

tensor(40., dtype=torch.float64)

In [46]:
# Sum of the 'y' row at index len - 1 for batch element 0, i.e. the last
# step inside that sample's recorded length (candidate EOS row).
test_graph['y'][0, test_graph['len'][0] - 1, :].sum()

tensor(0., dtype=torch.float64)

Looks like SOS is all ones and EOS is all zeros. How many nonzero rows are there in x and y for the first graph in the batch?

In [50]:
# Count the rows of 'x' (batch element 0) whose entries sum to a nonzero value.
sum(test_graph['x'][0, :, :].sum(axis = 1) != 0)

tensor(169)

In [51]:
# Count the rows of 'y' (batch element 0) whose entries sum to a nonzero value.
sum(test_graph['y'][0, :, :].sum(axis = 1) != 0)

tensor(168)

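As expected: x has 169 nonzero rows (the all-ones SOS row plus 168 adjacency rows) and y has 168, which is consistent with `len` being 169 for this graph.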

Are all the SOS tokens vectors of ones?

In [54]:
# Row sums of step 0 of 'x' for every element in the batch; a value of 40
# (the vector length) means that row is all ones.
test_graph['x'][:, 0, :].sum(axis = 1)

tensor([40., 40., 40., 40., 40., 40., 40., 40., 40., 40., 40., 40., 40., 40.,
        40., 40., 40., 40., 40., 40., 40., 40., 40., 40., 40., 40., 40., 40.,
        40., 40., 40., 40.], dtype=torch.float64)

Looks like it.

# Train

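Train the model (or at least, try to). On this machine the attempt fails: the models are moved to the GPU with `.cuda()`, and no NVIDIA driver is installed.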

In [58]:
### model initialization
## Graph RNN VAE model
# lstm = LSTM_plain(input_size=args.max_prev_node, embedding_size=args.embedding_size_lstm,
#                   hidden_size=args.hidden_size, num_layers=args.num_layers).cuda()

# Build the graph-level RNN (`rnn`) and the edge-level output module
# (`output`) for the model variant named in args.note. Every model is moved
# to the GPU with .cuda(), so this cell requires a working CUDA setup.
if 'GraphRNN_VAE_conditional' in args.note:
    rnn = GRU_plain(input_size=args.max_prev_node, embedding_size=args.embedding_size_rnn,
                    hidden_size=args.hidden_size_rnn, num_layers=args.num_layers, has_input=True,
                    has_output=False).cuda()
    output = MLP_VAE_conditional_plain(h_size=args.hidden_size_rnn, embedding_size=args.embedding_size_output, y_size=args.max_prev_node).cuda()
elif 'GraphRNN_MLP' in args.note:
    rnn = GRU_plain(input_size=args.max_prev_node, embedding_size=args.embedding_size_rnn,
                    hidden_size=args.hidden_size_rnn, num_layers=args.num_layers, has_input=True,
                    has_output=False).cuda()
    output = MLP_plain(h_size=args.hidden_size_rnn, embedding_size=args.embedding_size_output, y_size=args.max_prev_node).cuda()
elif 'GraphRNN_RNN' in args.note:
    rnn = GRU_plain(input_size=args.max_prev_node, embedding_size=args.embedding_size_rnn,
                    hidden_size=args.hidden_size_rnn, num_layers=args.num_layers, has_input=True,
                    has_output=True, output_size=args.hidden_size_rnn_output).cuda()
    output = GRU_plain(input_size=1, embedding_size=args.embedding_size_rnn_output,
                       hidden_size=args.hidden_size_rnn_output, num_layers=args.num_layers, has_input=True,
                       has_output=True, output_size=1).cuda()
else:
    # Fail loudly instead of reaching train() with `rnn`/`output` undefined
    # (or, in a notebook, left over from an earlier run of this cell).
    raise ValueError('Unrecognized model type in args.note: {!r}'.format(args.note))

### start training
train(args, dataset_loader, rnn, output)

  nn.init.xavier_uniform(param,gain=nn.init.calculate_gain('sigmoid'))
  nn.init.constant(param, 0.25)
  m.weight.data = init.xavier_uniform(m.weight.data, gain=nn.init.calculate_gain('relu'))


RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx