# Compare the loading time of different data-reading methods

In [1]:
import numpy as np
import pandas as pd
import pptk
import datetime
import torch

## Read the data into NumPy ndarrays

In [4]:
# Time how long it takes to load the whole text file with NumPy and slice
# out the per-point arrays.
file = "./data/arch/Train/1_TR_cloister.txt"
time3 = datetime.datetime.now()
data = np.loadtxt(file)
# Columns 0-2 are kept as the points (presumably x, y, z -- TODO confirm);
# column 6 is kept as the per-point label.
scene_points = data[:, :3].astype(np.float32)
segment_label = data[:, 6].astype(np.int64)
time4 = datetime.datetime.now()
print(scene_points.shape, segment_label.shape, sep="\n")
print(time4 - time3)

(15740229, 3)
(15740229,)
0:03:34.670069


In [19]:
# Draw 1024 random point indices (with replacement) and gather those rows.
# This cell relies on `segment_label` and `scene_points` already being
# defined in the running session.
import numpy as np
choice = np.random.choice(len(segment_label), 1024, replace=True)
point_set = scene_points[choice, :]

NameError: name 'segment_label' is not defined

## Read the file with NumPy, then convert the arrays to torch tensors

In [3]:
# Time the same load as the NumPy-only cell, but additionally wrap the
# sliced arrays as torch tensors.
file = "./data/arch/Train/1_TR_cloister.txt"
time1 = datetime.datetime.now()
data = np.loadtxt(file)
points_np = data[:, :3].astype(np.float32)
# NOTE(review): labels are int32 here but int64 in the NumPy-only cell --
# confirm which dtype the downstream code expects.
labels_np = data[:, 6].astype(np.int32)
# torch.from_numpy shares memory with the source array (no copy).
scene_points = torch.from_numpy(points_np)
segment_label = torch.from_numpy(labels_np)
time2 = datetime.datetime.now()
print(scene_points.shape, segment_label.shape, sep="\n")
print(time2 - time1)

torch.Size([15740229, 3])
torch.Size([15740229])
0:03:52.921357


## Test Block - random sample

In [1]:
# Display a bare range object (it is lazy; no list of 2048 ints is built).
range(2048)

range(0, 2048)

In [29]:
def sample_data(data, num_sample):
    """Resample ``data`` along its first axis to exactly ``num_sample`` rows.

    ``data`` is N x ... (any trailing dimensions); the result is
    num_sample x ... with the same trailing dimensions.

    - N == num_sample: ``data`` is returned as-is.
    - N >  num_sample: ``num_sample`` *distinct* rows are kept at random.
    - N <  num_sample: every row is kept and ``num_sample - N`` randomly
      chosen rows are duplicated and appended after the originals.

    Returns:
        A pair ``(sampled, indices)``. ``indices`` holds the positions in
        ``data`` of the rows in ``sampled`` (a ``range``, an ndarray or a
        list depending on the branch), so it can be used to index a
        parallel label array.

    Raises:
        ValueError: if ``data`` has no rows but ``num_sample`` is non-zero.
    """
    N = data.shape[0]
    if N == num_sample:
        return data, range(N)
    if N == 0:
        raise ValueError("cannot sample %d rows from empty data" % num_sample)
    if N > num_sample:
        # replace=False: there are enough rows, so never return the same
        # row twice (the default replace=True would allow duplicates).
        sample = np.random.choice(N, num_sample, replace=False)
        return data[sample, ...], sample
    # Not enough rows: pad with randomly chosen duplicates. Sampling with
    # replacement is required here because num_sample - N may exceed N.
    sample = np.random.choice(N, num_sample - N)
    dup_data = data[sample, ...]
    return np.concatenate([data, dup_data], 0), list(range(N)) + list(sample)

In [9]:
def sample_data_label(data, label, num_sample):
    """Resample ``data`` to ``num_sample`` rows and pick the matching labels.

    The row indices chosen by ``sample_data`` are applied to ``label`` so
    that data rows and labels stay aligned.

    Returns:
        A pair ``(sampled_data, sampled_label)``.
    """
    sampled, picked = sample_data(data, num_sample)
    return sampled, label[picked]

In [5]:
# Synthetic block of 2080 rows x 3 columns, uniform random in [0, 1).
# 2080 is larger than the 2048 target used below.
import numpy as np
data = np.random.random((2080,3))
data

array([[0.35296084, 0.72715114, 0.40660786],
       [0.79695103, 0.35178304, 0.1483732 ],
       [0.02184964, 0.89369391, 0.21384761],
       ...,
       [0.72761692, 0.63170081, 0.70812537],
       [0.58366183, 0.69593253, 0.49909007],
       [0.46816175, 0.91908899, 0.4233715 ]])

In [11]:
# One random integer label in [0, 10) per row of `data`, shape (2080, 1).
label = np.random.randint(0,10,size=[2080,1])
label

array([[2],
       [1],
       [0],
       ...,
       [3],
       [5],
       [7]])

In [13]:
# data has 2080 rows > 2048, so this exercises the down-sampling branch.
block_data_sampled, block_label_sampled = sample_data_label(data, label, 2048)

In [18]:
# Both arrays should now have 2048 rows.
print(block_data_sampled.shape)
print(block_label_sampled.shape)

(2048, 3)
(2048, 1)


In [19]:
# Second synthetic block with only 2000 rows, i.e. fewer than the 2048 target.
data2 = np.random.random((2000,3))
data2

array([[0.364098  , 0.30908428, 0.63440443],
       [0.30548676, 0.40795002, 0.67965822],
       [0.52483172, 0.76652738, 0.86369489],
       ...,
       [0.16839643, 0.47371034, 0.72023846],
       [0.05214054, 0.68377316, 0.61206864],
       [0.47624035, 0.64036852, 0.56606007]])

In [20]:
# One random integer label in [0, 10) per row of `data2`, shape (2000, 1).
label2 = np.random.randint(0,10,size=[2000,1])

In [30]:
# data2 has 2000 rows < 2048, so this exercises the duplicate-padding branch.
block_data2_sampled, block_label2_sampled = sample_data_label(data2, label2, 2048)

In [31]:
# Both arrays should now have 2048 rows.
print(block_data2_sampled.shape)
print(block_label2_sampled.shape)

(2048, 3)
(2048, 1)


In [40]:
# Repeat the sampled block 93 times along a new leading axis to fake a
# file containing 93 blocks: (2048, 3) -> (93, 2048, 3).
current_data = np.tile(block_data2_sampled, (93,1,1))
current_data.shape

(93, 2048, 3)

In [46]:
# Split the blocks in `current_data` into consecutive batches of
# `batch_size` blocks each.
file_size = current_data.shape[0]
batch_size = 8
# Floor division: a trailing partial batch (file_size % batch_size blocks)
# is dropped.
num_batches = file_size // batch_size
all_data = []

print(num_batches)
for batch_idx in range(num_batches):
    # Derive the slice bounds from batch_size rather than a literal so the
    # cell stays correct when batch_size is changed.
    start_idx = batch_idx * batch_size
    end_idx = start_idx + batch_size
    all_data.append(current_data[start_idx:end_idx, :, :])
all_data = np.array(all_data)
print(all_data.shape)

11
(11, 8, 2048, 3)


## Test block - generate batches for an HDF5 file

In [53]:
# Pre-allocate fixed-size buffers that collect samples before they are
# written out as one HDF5 batch.
NUM_POINT = 2048
H5_BATCH_SIZE = 1000
data_dim = [NUM_POINT, 3]
label_dim = [NUM_POINT]

# Prepend the batch axis to the per-sample dimensions.
batch_data_dim = [H5_BATCH_SIZE, *data_dim]
batch_label_dim = [H5_BATCH_SIZE, *label_dim]
h5_batch_data = np.zeros(batch_data_dim, dtype=np.float32)
h5_batch_label = np.zeros(batch_label_dim, dtype=np.uint8)
buffer_size = 0  # state: number of samples currently held in the buffer
h5_index = 0  # state: index of the next h5 file to save

print(
    [H5_BATCH_SIZE],
    batch_data_dim,
    h5_batch_data.shape,
    h5_batch_label.shape,
    sep="\n",
)

[1000]
[1000, 2048, 3]
(1000, 2048, 3)
(1000, 2048)


In [54]:
# Copy every block of `current_data` into the next free slots of the
# batch buffer and advance the fill counter.
data_size = len(current_data)
buffer_end = buffer_size + data_size
h5_batch_data[buffer_size:buffer_end, ...] = current_data
buffer_size = buffer_end

print(h5_batch_data.shape)

(1000, 2048, 3)


In [55]:
# Inspect the buffer: slots that have not been filled yet keep the zeros
# from np.zeros.
h5_batch_data

array([[[0.36409798, 0.3090843 , 0.6344044 ],
        [0.30548677, 0.40795   , 0.67965823],
        [0.5248317 , 0.76652735, 0.8636949 ],
        ...,
        [0.20645209, 0.01392364, 0.14218394],
        [0.22136463, 0.86750555, 0.24099368],
        [0.46171084, 0.4232767 , 0.58267117]],

       [[0.36409798, 0.3090843 , 0.6344044 ],
        [0.30548677, 0.40795   , 0.67965823],
        [0.5248317 , 0.76652735, 0.8636949 ],
        ...,
        [0.20645209, 0.01392364, 0.14218394],
        [0.22136463, 0.86750555, 0.24099368],
        [0.46171084, 0.4232767 , 0.58267117]],

       [[0.36409798, 0.3090843 , 0.6344044 ],
        [0.30548677, 0.40795   , 0.67965823],
        [0.5248317 , 0.76652735, 0.8636949 ],
        ...,
        [0.20645209, 0.01392364, 0.14218394],
        [0.22136463, 0.86750555, 0.24099368],
        [0.46171084, 0.4232767 , 0.58267117]],

       ...,

       [[0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        ],
        [0.        , 0