In [24]:
import os
import numpy as np

from PIL import Image
from glob import glob
from tqdm import tqdm

In [None]:
# Convert every colour JPEG under <img_dir>/<split>/<class>/ into two .npy files
# in <npy_dir>/<split>/:
#   <stem>.image.npy  - the decoded pixel array (as returned by np.asarray)
#   <stem>.label.npy  - a 1-element integer array holding the class index
img_dir = "/gpfs/jsh/volume/datasets/imagenet-sample"
npy_dir = "/gpfs/jsh/volume/datasets/imagenet-sample-numpy"
splits = ["train", "val"]
count = 0  # number of images written, accumulated over all splits

# Without recursive=True a "**" component behaves exactly like "*", so the
# original pattern matched one directory level; spell that out explicitly.
# Sorting makes the processing order reproducible between runs.
img_paths_by_split = {tv: sorted(glob(f"{img_dir}/{tv}/*/*.JPEG")) for tv in splits}

# One class -> index table shared by every split, built from the sorted class
# directory names. Previously the label was len(labels) at the time the file was
# visited, which (a) depended on directory traversal order, (b) could differ
# between train and val, and (c) was off by one (first class was 1, not 0).
class_names = sorted(
    {
        os.path.basename(os.path.dirname(path))
        for paths in img_paths_by_split.values()
        for path in paths
    }
)
class_to_index = {name: index for index, name in enumerate(class_names)}

for tv in splits:
    os.makedirs(f"{npy_dir}/{tv}", exist_ok=True)

    # Passing the list itself (not enumerate(...)) lets tqdm show the total.
    for img_path in tqdm(img_paths_by_split[tv], desc=tv):
        # Image: close the file handle as soon as the pixels are copied out.
        with Image.open(img_path) as img:
            img_array = np.asarray(img)
        # No HWC -> CHW transpose here: passing output_layout="CHW" to
        # fn.crop_mirror_normalize is enough.
        if img_array.ndim == 2:  # skip grayscale images
            continue
        stem = os.path.splitext(os.path.basename(img_path))[0]
        np.save(f"{npy_dir}/{tv}/{stem}.image.npy", img_array)
        count += 1
        # Label: the parent directory name is the class id.
        label = os.path.basename(os.path.dirname(img_path))
        np.save(f"{npy_dir}/{tv}/{stem}.label.npy", np.array([class_to_index[label]]))

count

In [23]:
# Sanity check: load every saved training label and show its path and shape.
train_label_paths = glob(f"{npy_dir}/train/*.label.npy")
for label_path in train_label_paths:
    label_array = np.load(label_path)
    print(label_path, label_array.shape)

/gpfs/jsh/volume/datasets/imagenet-sample-numpy/train/n01443537_13924.label.npy (1,)
/gpfs/jsh/volume/datasets/imagenet-sample-numpy/train/n01443537_13121.label.npy (1,)
/gpfs/jsh/volume/datasets/imagenet-sample-numpy/train/n01443537_2817.label.npy (1,)
/gpfs/jsh/volume/datasets/imagenet-sample-numpy/train/n01443537_23440.label.npy (1,)
/gpfs/jsh/volume/datasets/imagenet-sample-numpy/train/n01484850_19861.label.npy (1,)
/gpfs/jsh/volume/datasets/imagenet-sample-numpy/train/n01443537_15846.label.npy (1,)
/gpfs/jsh/volume/datasets/imagenet-sample-numpy/train/n01443537_12814.label.npy (1,)
/gpfs/jsh/volume/datasets/imagenet-sample-numpy/train/n01484850_8302.label.npy (1,)
/gpfs/jsh/volume/datasets/imagenet-sample-numpy/train/n01484850_14162.label.npy (1,)
/gpfs/jsh/volume/datasets/imagenet-sample-numpy/train/n01484850_1120.label.npy (1,)
/gpfs/jsh/volume/datasets/imagenet-sample-numpy/train/n01443537_13692.label.npy (1,)
/gpfs/jsh/volume/datasets/imagenet-sample-numpy/train/n01443537_2336

In [1]:
import time
from nvidia.dali import pipeline_def, fn

In [2]:
@pipeline_def(batch_size=1, num_threads=3, device_id=0)
def create_pipe(device, tv, data_root="/gpfs/jsh/volume/datasets/imagenet-sample-numpy"):
    """Define a DALI pipeline that reads image and label .npy files for one split.

    Args:
        device: Forwarded as ``device`` to both ``fn.readers.numpy`` calls
            (this notebook passes "gpu" or "cpu").
        tv: Split sub-directory under ``data_root`` ("train" or "val" here).
        data_root: Directory containing the per-split folders. Defaults to the
            GPFS copy of the dataset, so existing two-argument calls are unchanged.

    Returns:
        A tuple ``(images, labels)`` of the two reader outputs.
    """
    file_root = f"{data_root}/{tv}"
    images = fn.readers.numpy(
        device=device,
        file_root=file_root,
        file_filter="*.image.npy",
        name="ImageReader",
    )
    # NOTE(review): the two readers are independent of each other; image/label
    # pairing relies on both enumerating their files in the same order - confirm.
    labels = fn.readers.numpy(
        device=device,
        file_root=file_root,
        file_filter="*.label.npy",
        name="LabelReader",
    )
    return images, labels


def run_pipe(device, tv, idx):
    """Build the pipeline for one (device, split) pair and time a full pass over it.

    Args:
        device: Forwarded to ``create_pipe`` and echoed in the report line.
        tv: Dataset split forwarded to ``create_pipe``.
        idx: Zero-based repetition index; reported as ``idx + 1``.

    Prints one "elapsed time [...]" line with the elapsed seconds. Returns None.
    """
    pipe = create_pipe(device, tv)
    pipe.build()
    # Ask for the image reader's size by name instead of taking whichever
    # reader happens to come first in the epoch_size() dictionary.
    data_size = pipe.epoch_size("ImageReader")
    # perf_counter() is monotonic, so the measurement cannot be skewed by
    # wall-clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    for _ in range(data_size):
        # One run() per sample (create_pipe is declared with batch_size=1).
        # Outputs are discarded on purpose: only the read time is measured.
        # To also copy a batch to host memory, use out[i].as_array() on CPU
        # or out[i].as_cpu().as_array() on GPU.
        pipe.run()
    elapsed = time.perf_counter() - start_time
    print(f"elapsed time [{device}-{tv}-{idx+1}]: {elapsed}")


# Benchmark every device/split combination once.
# Call order is unchanged: gpu-train, gpu-val, cpu-train, cpu-val.
for idx in range(1):
    for device in ("gpu", "cpu"):
        for tv in ("train", "val"):
            run_pipe(device, tv, idx)

elapsed time [gpu-train-1]: 10.803786516189575
elapsed time [gpu-val-1]: 0.39374303817749023
elapsed time [cpu-train-1]: 9.387659311294556
elapsed time [cpu-val-1]: 0.37669920921325684


In [3]:
@pipeline_def(batch_size=1, num_threads=3, device_id=0)
def create_pipe(device, tv, data_root="/ontap/jsh/volume/datasets/imagenet-sample-numpy"):
    """Define a DALI pipeline that reads image and label .npy files for one split.

    Args:
        device: Forwarded as ``device`` to both ``fn.readers.numpy`` calls
            (this notebook passes "gpu" or "cpu").
        tv: Split sub-directory under ``data_root`` ("train" or "val" here).
        data_root: Directory containing the per-split folders. Defaults to the
            ONTAP copy of the dataset, so existing two-argument calls are unchanged.

    Returns:
        A tuple ``(images, labels)`` of the two reader outputs.
    """
    file_root = f"{data_root}/{tv}"
    images = fn.readers.numpy(
        device=device,
        file_root=file_root,
        file_filter="*.image.npy",
        name="ImageReader",
    )
    # NOTE(review): the two readers are independent of each other; image/label
    # pairing relies on both enumerating their files in the same order - confirm.
    labels = fn.readers.numpy(
        device=device,
        file_root=file_root,
        file_filter="*.label.npy",
        name="LabelReader",
    )
    return images, labels


def run_pipe(device, tv, idx):
    """Build the pipeline for one (device, split) pair and time a full pass over it.

    Args:
        device: Forwarded to ``create_pipe`` and echoed in the report line.
        tv: Dataset split forwarded to ``create_pipe``.
        idx: Zero-based repetition index; reported as ``idx + 1``.

    Prints one "elapsed time [...]" line with the elapsed seconds. Returns None.
    """
    pipe = create_pipe(device, tv)
    pipe.build()
    # Ask for the image reader's size by name instead of taking whichever
    # reader happens to come first in the epoch_size() dictionary.
    data_size = pipe.epoch_size("ImageReader")
    # perf_counter() is monotonic, so the measurement cannot be skewed by
    # wall-clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    for _ in range(data_size):
        # One run() per sample (create_pipe is declared with batch_size=1).
        # Outputs are discarded on purpose: only the read time is measured.
        # To also copy a batch to host memory, use out[i].as_array() on CPU
        # or out[i].as_cpu().as_array() on GPU.
        pipe.run()
    elapsed = time.perf_counter() - start_time
    print(f"elapsed time [{device}-{tv}-{idx+1}]: {elapsed}")


# Benchmark every device/split combination once.
# Call order is unchanged: gpu-train, gpu-val, cpu-train, cpu-val.
for idx in range(1):
    for device in ("gpu", "cpu"):
        for tv in ("train", "val"):
            run_pipe(device, tv, idx)

elapsed time [gpu-train-1]: 10.365849256515503
elapsed time [gpu-val-1]: 0.7734766006469727
elapsed time [cpu-train-1]: 3.832213878631592
elapsed time [cpu-val-1]: 0.15192127227783203
