In [1]:
import os
import tqdm
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

In [2]:
# Select path to raw dataset that needs to be converted
data_dir = 'datasets/poisson/polygon'

A = np.load(os.path.join(data_dir, 'A.npy'), allow_pickle=True)
B = np.load(os.path.join(data_dir, 'B.npy'), allow_pickle=True)
X = np.load(os.path.join(data_dir, 'X.npy'), allow_pickle=True)

# Convert from sparse to dense numpy

In [3]:
dataset = {
    'A' : None,
    'B' : None,
    'X' : None
}

# Convert A
max_edges = 0
for i in tqdm.tqdm(range(A.shape[0])):
    a_val = A[i].data
    a_row = A[i].tocoo().row
    a_col = A[i].tocoo().col
    a_sample = np.c_[a_row, a_col, a_val]
    
    # Get maximum amount of edges in dataset:
    
    max_edges = max(max_edges, np.shape(a_sample)[0])
    
    
for i in tqdm.tqdm(range(A.shape[0])):
    a_val = A[i].data
    a_row = A[i].tocoo().row
    a_col = A[i].tocoo().col
    a_sample = np.c_[a_row, a_col, a_val]
    
    n_edges = np.shape(a_sample)[0]
    n_edges_missing = max_edges - n_edges
    d_in_A = np.shape(a_sample)[1]
    
    if n_edges_missing > 0:
        a_sample = np.r_[a_sample, np.zeros([n_edges_missing, d_in_A])]
    
    if dataset['A'] is None:
        dataset['A'] = [np.expand_dims(a_sample, 0)]
    else:
        dataset['A'].append(np.expand_dims(a_sample, 0))
dataset['A'] = np.vstack(dataset['A'])

# Convert B
dataset['B'] = np.expand_dims(B, -1)

# Convert X
dataset['X'] = np.expand_dims(X, -1)



100%|██████████| 10000/10000 [00:15<00:00, 662.53it/s]
100%|██████████| 10000/10000 [00:15<00:00, 656.87it/s]


In [4]:
dataset['A'].shape

(10000, 3044, 3)

# Split train/val/test

In [5]:
# Create Train Val and Test datasets:
n_samples = np.shape(dataset['A'])[0]

# Select proportions
# [%train, %val, %test]
proportions = {
    'train': 60, 
    'val' : 20, 
    'test' : 20
}
proportions['sum'] = int(proportions['train']+proportions['val']+proportions['test'])
    
n_train = proportions['train']*n_samples // proportions['sum']
n_val = proportions['val']*n_samples // proportions['sum']
n_test = n_samples - n_val - n_train

# Split
dataset_split = {}
for key in dataset:
    dataset_split[key+'_train'] = dataset[key][:n_train]
    dataset_split[key+'_val'] = dataset[key][n_train:n_train+n_val]
    dataset_split[key+'_test'] = dataset[key][n_train+n_val:n_train+n_val+n_test]
    
# Save
for key in dataset_split:
    np.save(os.path.join(data_dir, key+'.npy'), dataset_split[key])


# Convert to tfrecords

In [6]:

__author__ = "Sangwoong Yoon"

def np_to_tfrecords(A, B, X, file_path_prefix, verbose=True):
    """
    author : "Sangwoong Yoon"
    """
    def _dtype_feature(ndarray):
        """match appropriate tf.train.Feature class with dtype of ndarray. """
        assert isinstance(ndarray, np.ndarray)
        dtype_ = ndarray.dtype
        if dtype_ == np.float64 or dtype_ == np.float32:
            return lambda array: tf.train.Feature(float_list=tf.train.FloatList(value=array))
        elif dtype_ == np.int64:
            return lambda array: tf.train.Feature(int64_list=tf.train.Int64List(value=array))
        else:  
            raise ValueError("The input should be numpy ndarray. \
                               Instaed got {}".format(ndarray.dtype))
            
    assert isinstance(A, np.ndarray)
    assert len(A.shape) == 2
    
    assert isinstance(B, np.ndarray)
    assert len(B.shape) == 2
    
    assert isinstance(X, np.ndarray)
    assert len(X.shape) == 2
    
    # load appropriate tf.train.Feature class depending on dtype
    dtype_feature_a = _dtype_feature(A)
    dtype_feature_b = _dtype_feature(B)
    dtype_feature_x = _dtype_feature(X)      
        
    # Generate tfrecord writer
    result_tf_file = file_path_prefix + '.tfrecords'
    writer = tf.python_io.TFRecordWriter(result_tf_file)
    if verbose:
        print("Serializing {:d} examples into {}".format(X.shape[0], result_tf_file))
        
    # iterate over each sample,
    # and serialize it as ProtoBuf.
    for idx in tqdm.tqdm(range(A.shape[0])):
        a = A[idx]
        b = B[idx]
        x = X[idx]
        
        d_feature = {}
        d_feature['A'] = dtype_feature_a(a)
        d_feature['B'] = dtype_feature_b(b)
        d_feature['X'] = dtype_feature_x(x)
        
            
        features = tf.train.Features(feature=d_feature)
        example = tf.train.Example(features=features)
        serialized = example.SerializeToString()
        writer.write(serialized)
    
    if verbose:
        print("Writing {} done!".format(result_tf_file))

     

In [7]:
for mode in ['train', 'val', 'test']:

    A = np.load(os.path.join(data_dir, 'A_'+mode+'.npy'), allow_pickle=True)
    B = np.load(os.path.join(data_dir, 'B_'+mode+'.npy'), allow_pickle=True)
    X = np.load(os.path.join(data_dir, 'X_'+mode+'.npy'), allow_pickle=True)

    n_samples = np.array(np.shape(A)[0])
    n_edges = np.array(np.shape(A)[1])
    d_in_A = np.array(np.shape(A)[2])
    
    n_nodes = np.array(np.shape(B)[1])
    d_in_B = np.array(np.shape(B)[2])

    A = np.reshape(A, [n_samples, -1])
    B = np.reshape(B, [n_samples, -1])
    X = np.reshape(X, [n_samples, -1])

    np_to_tfrecords(A, B, X, os.path.join(data_dir, mode), 
        verbose=True)

  0%|          | 8/6000 [00:00<01:18, 76.23it/s]

Serializing 6000 examples into datasets/poisson/polygon/train.tfrecords


100%|██████████| 6000/6000 [01:05<00:00, 91.85it/s] 


Writing datasets/poisson/polygon/train.tfrecords done!


  0%|          | 7/2000 [00:00<00:28, 69.92it/s]

Serializing 2000 examples into datasets/poisson/polygon/val.tfrecords


100%|██████████| 2000/2000 [00:19<00:00, 102.10it/s]
  0%|          | 0/2000 [00:00<?, ?it/s]

Writing datasets/poisson/polygon/val.tfrecords done!
Serializing 2000 examples into datasets/poisson/polygon/test.tfrecords


100%|██████████| 2000/2000 [00:17<00:00, 115.37it/s]

Writing datasets/poisson/polygon/test.tfrecords done!



