# Data generation

In [1]:
%load_ext autoreload
%autoreload 2

import sys; sys.path.append(2*'../')

import numpy as np
import torch
from torch import nn

In [2]:
from rl4co.envs import TSPEnv, CVRPEnv

# Build a TSP environment with its default arguments.
# NOTE(review): `env` is not referenced again in the visible cells of this notebook.
env = TSPEnv()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Seed NumPy's global RNG so the dataset generated below is reproducible; 4321 is the
# seed named in the legacy file 'tsp100_validation_seed4321.pkl' it is compared against.
np.random.seed(4321)


def generate_tsp_data(dataset_size, tsp_size):
    """Sample a batch of random TSP instances.

    Coordinates are drawn from ``np.random.uniform`` (NumPy's global RNG, default
    range [0, 1)) and then cast to float32.

    Args:
        dataset_size: Number of instances to generate.
        tsp_size: Number of nodes per instance.

    Returns:
        dict with a single key ``'locs'`` holding a float32 array of shape
        ``(dataset_size, tsp_size, 2)``.
    """
    coords = np.random.uniform(size=(dataset_size, tsp_size, 2))
    return dict(locs=coords.astype(np.float32))


# Draw 10,000 random TSP instances with 100 nodes each.
x_new = generate_tsp_data(dataset_size=10000, tsp_size=100)

# Store every entry of the dict as a named array inside a single .npz archive.
np.savez('tsp_100_10000.npz', **x_new)


In [4]:
# Legacy validation set stored as a pickle.
# SECURITY: allow_pickle=True unpickles the file, which can execute arbitrary code;
# only load pickles that come from a trusted source.
x_old = np.load('data/tsp/tsp100_validation_seed4321.pkl', allow_pickle=True)

# Re-open the archive written by np.savez above; `x` is read again by later cells.
x = np.load('tsp_100_10000.npz')

# Check that the regenerated coordinates match the legacy data within allclose's default tolerances.
print(np.allclose(x_old, x['locs']))

True


In [5]:
# Time loading the legacy .pkl dataset through np.load with pickling enabled.
%timeit np.load('data/tsp/tsp100_validation_seed4321.pkl', allow_pickle=True)

123 ms ± 374 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [6]:
# Time np.load on the .npz archive.
# NOTE(review): np.load on an .npz is documented to load arrays lazily, so this likely
# measures only opening the archive, not reading `locs` — confirm by timing
# np.load('tsp_100_10000.npz')['locs'] for a like-for-like comparison with the pickle.
%timeit np.load('tsp_100_10000.npz')

12.9 µs ± 47.8 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [7]:
# print file size in a nice format
import os
import sys

def sizeof_fmt(num, suffix='B'):
    """Format a byte count as a human-readable string with binary (1024-based) prefixes.

    Args:
        num: Size to format (e.g. the result of ``os.path.getsize``).
        suffix: Unit appended after the prefix; defaults to ``'B'``.

    Returns:
        String such as ``'21.0MiB'``: the value scaled down by 1024 until its
        magnitude is below 1024, with one decimal place. Values too large for
        the ``'Zi'`` prefix are reported in ``'Yi'``.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    value = num
    idx = 0
    while idx < len(prefixes):
        if abs(value) < 1024.0:
            return '%3.1f%s%s' % (value, prefixes[idx], suffix)
        # Not small enough for this prefix yet: step up to the next one.
        value /= 1024.0
        idx += 1
    return '%.1f%s%s' % (value, 'Yi', suffix)

# Compare the on-disk size of the legacy pickle with the new .npz archive.
print(sizeof_fmt(os.path.getsize('data/tsp/tsp100_validation_seed4321.pkl')))
print(sizeof_fmt(os.path.getsize('tsp_100_10000.npz')))

21.0MiB
7.6MiB


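Opening the `npz` file is far faster than unpickling, and the file is also much smaller on disk (7.6MiB vs. 21.0MiB)! Note that `np.load` on an `.npz` archive is lazy: each array is only read from disk when it is first accessed, so the timing above measures opening the archive rather than reading `locs`.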

## Load data to TensorDict


In [15]:
# `x_dict` was never defined at notebook scope, so this cell raised NameError on a
# fresh kernel. Build it from the archive `x` opened earlier, then show the first key.
x_dict = dict(x)
list(x_dict.keys())[0]

'locs'

In [18]:
# Tensordict from data
from tensordict.tensordict import TensorDict

# Materialise the lazily-loaded archive once; the same dict is used both to infer the
# batch size and to build the TensorDict, so the arrays are not read from disk twice.
x_ = dict(x)
# Batch size is the leading dimension of the first array in the archive.
batch_size = x_[list(x_.keys())[0]].shape[0]

td = TensorDict(x_, batch_size=batch_size)
td

TensorDict(
    fields={
        locs: Tensor(shape=torch.Size([10000, 100, 2]), device=cpu, dtype=torch.float32, is_shared=False)},
    batch_size=torch.Size([10000]),
    device=None,
    is_shared=False)

In [22]:
def load_npz_to_tensordict(filename):
    """Load a npz file directly into a TensorDict.

    Every array stored in the archive becomes one entry of the returned
    TensorDict. The batch size is the leading dimension of the first array
    in the archive.

    Args:
        filename: Path to an ``.npz`` archive, passed to ``np.load``.

    Returns:
        TensorDict built from the archive's arrays, with ``batch_size`` set to
        ``shape[0]`` of the first array.

    Raises:
        IndexError: If the archive contains no arrays.
    """
    # np.load returns an NpzFile that keeps the underlying file open; the context
    # manager closes it as soon as all arrays have been copied into memory.
    with np.load(filename) as npz_file:
        x_dict = dict(npz_file)
    batch_size = x_dict[list(x_dict.keys())[0]].shape[0]
    return TensorDict(x_dict, batch_size=batch_size)

# Try the helper on the TSP archive written earlier; this rebinds `td` from the previous cell.
td = load_npz_to_tensordict('tsp_100_10000.npz')

In [35]:
# Reset NumPy's global RNG so the legacy-format generator below starts from seed 4321.
np.random.seed(4321)

def generate_vrp_data(dataset_size, vrp_size):
    """Sample random VRP instances in the legacy list-of-tuples layout.

    All values come from NumPy's global RNG, drawn in the order depot, node
    locations, demands.

    Args:
        dataset_size: Number of instances to generate.
        vrp_size: Number of customer nodes per instance; must be one of
            10, 20, 50 or 100 (the sizes with a known vehicle capacity).

    Returns:
        list of ``dataset_size`` tuples ``(depot, locs, demand, capacity)``:
        depot coordinates of shape ``(2,)``, node coordinates of shape
        ``(vrp_size, 2)``, integer demands in 1..9 of shape ``(vrp_size,)``,
        and the capacity for this problem size.

    Raises:
        KeyError: If ``vrp_size`` has no entry in the capacity table.
    """
    capacity_by_size = {10: 20., 20: 30., 50: 40., 100: 50.}

    # Keep this draw order: it determines which random numbers each field receives.
    depots = np.random.uniform(size=(dataset_size, 2))  # Depot location
    nodes = np.random.uniform(size=(dataset_size, vrp_size, 2))  # Node locations
    demands = np.random.randint(1, 10, size=(dataset_size, vrp_size))  # Uniform integer 1 ... 9
    capacities = np.full(dataset_size, capacity_by_size[vrp_size])  # Same for whole dataset

    # Zipping along the first axis yields one tuple per instance.
    return list(zip(depots, nodes, demands, capacities))

# Legacy-format reference data: 10,000 instances with 100 customer nodes each.
x_old = generate_vrp_data(10000, 100)

In [36]:
# Reseed with the same value as the legacy-format cell so both generators draw the same random stream.
np.random.seed(4321)

def generate_vrp_data(dataset_size, vrp_size):
    """Sample random VRP instances as a dict of float32 arrays.

    All values come from NumPy's global RNG, drawn in the order depot, node
    locations, demands, and each array is cast to float32.

    Args:
        dataset_size: Number of instances to generate.
        vrp_size: Number of customer nodes per instance; must be one of
            10, 20, 50 or 100 (the sizes with a known vehicle capacity).

    Returns:
        dict with keys ``'depot'`` (shape ``(dataset_size, 2)``), ``'locs'``
        (shape ``(dataset_size, vrp_size, 2)``), ``'demand'`` (integer values
        1..9 stored as float32, shape ``(dataset_size, vrp_size)``) and
        ``'capacity'`` (shape ``(dataset_size,)``, same value for every instance).

    Raises:
        KeyError: If ``vrp_size`` has no entry in the capacity table.
    """
    capacity_by_size = {10: 20., 20: 30., 50: 40., 100: 50.}

    # Keep this draw order: it determines which random numbers each field receives.
    depot = np.random.uniform(size=(dataset_size, 2)).astype(np.float32)
    locs = np.random.uniform(size=(dataset_size, vrp_size, 2)).astype(np.float32)
    demand = np.random.randint(1, 10, size=(dataset_size, vrp_size)).astype(np.float32)
    capacity = np.full(dataset_size, capacity_by_size[vrp_size]).astype(np.float32)

    return {
        'depot': depot,  # Depot location
        'locs': locs,  # Node locations
        'demand': demand,  # Demand, uniform integer 1 ... 9
        'capacity': capacity,  # Capacity, same for whole dataset
    }

# Dict-format data from the same seed; this rebinds `x_new`, which previously held the TSP data.
x_new = generate_vrp_data(10000, 100)

In [37]:
# Compare the first depot from both layouts. Both cells seed with 4321 and draw the
# depot first, so the values should agree up to the float32 cast in the dict version.
print(x_old[0][0], x_new['depot'][0])

[0.07080288 0.81506401] [0.07080287 0.815064  ]


In [59]:
# NOTE(review): the two paths use different roots ('../../data/...' vs 'data/...');
# confirm both resolve from the notebook's working directory.
x_new_new = np.load('../../data/vrp/vrp100_val_seed4321.npz')
# SECURITY: allow_pickle=True unpickles the file, which can execute arbitrary code;
# only load pickles from a trusted source. This rebinds `x_old` to the on-disk legacy data.
x_old = np.load('data/vrp/vrp100_validation_seed4321.pkl', allow_pickle=True)

In [60]:
# First field (the depot) of the first instance in the legacy pickle.
x_old[0][0]

[0.07080287595563761, 0.8150640110845127]

In [61]:
# Depot of the first instance in the .npz file, for comparison with the legacy value above.
x_new_new['depot'][0]


array([0.07080287, 0.815064  ], dtype=float32)