In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import dareblopy as db
from IPython.display import Image, display
import PIL.Image

import os
import shutil
import time


# Debug switch read by the dataset helpers below: when True they print what
# they would do instead of copying files.
IS_DEBUG = False

# Dataset folders, relative to the notebook's working directory.
S_BASE_PATH = "data/imagenet_images"
T_BASE_PATH = "data/imagenet_red"

# Source / target roots anchored at the current working directory.
from_path, to_path = (
    os.path.join(os.getcwd(), rel_path) for rel_path in (S_BASE_PATH, T_BASE_PATH)
)


In [3]:

def processImagesByRatio(ratio: float, src_path : str, tar_path : str, postfix : str):
    """Copy a fraction of every class folder into one flat, index-named folder.

    Each sub-directory of ``src_path/postfix`` is treated as one class. From every
    class the first ``int(ratio * n)`` files (in sorted filename order, so the
    selection is reproducible) are copied to ``tar_path/postfix`` and renamed to a
    running index (``<index>.<ext>``). For every selected file the class name is
    written as one line to ``tar_path/index-<postfix>.txt``; line ``k`` therefore
    names the class of image ``k``.

    When the module-level ``IS_DEBUG`` flag is set, the copy is only printed,
    not performed (the index file is still written).

    Args:
        ratio: Fraction of each class to copy; the resulting count is clamped to
            ``[0, number of files in the class]``.
        src_path: Root directory that contains the ``postfix`` sub-directory.
        tar_path: Root directory for the output; created if missing.
        postfix: Name of the split sub-directory (e.g. ``"val"`` or ``"train"``).

    Returns:
        The path of the written index file, or ``None`` when
        ``src_path/postfix`` does not exist (nothing is created in that case).
    """
    class_root = os.path.join(src_path, postfix)
    if not os.path.isdir(class_root):
        print(f"No source path found for {class_root}")
        return

    # The target root must exist *before* the index file is opened inside it.
    os.makedirs(tar_path, exist_ok=True)
    index_path = os.path.join(tar_path, "index-" + postfix + ".txt")

    out_dir = os.path.join(tar_path, postfix)
    if not os.path.isdir(out_dir):
        print(f"creating target path at {out_dir}")
        os.mkdir(out_dir)

    # Only directories are classes; stray files next to them are ignored.
    classes = sorted(
        name for name in os.listdir(class_root)
        if os.path.isdir(os.path.join(class_root, name))
    )
    print(f"val classes: {len(classes)}")

    index = 0
    # The context manager closes the index file on every exit path.
    with open(index_path, 'w') as index_file:
        for class_name in classes:
            class_dir = os.path.join(class_root, class_name)
            image_names = sorted(os.listdir(class_dir))
            total_amount = len(image_names)
            cp_amount = max(0, min(total_amount, int(ratio * total_amount)))

            print(f"Copy {cp_amount}/{total_amount} to {out_dir}")

            for image_name in image_names[:cp_amount]:
                name_parts = image_name.split(".")
                if len(name_parts) < 2:
                    # Without an extension the target name would be ambiguous.
                    print(f"Skipping {image_name}: no file extension")
                    continue

                # The extension is the second dot-separated part, cut to three
                # characters to drop URL-encoded leftovers after the real type.
                file_type = name_parts[1][:3]
                img_name = f"{index}.{file_type}"

                cp_from = os.path.join(class_dir, image_name)
                cp_to = os.path.join(out_dir, img_name)
                index_file.write(class_name + "\n")
                if IS_DEBUG:
                    print(f"Copy: \n --{cp_from} \n"
                            + f" ->{cp_to}")
                else:
                    shutil.copyfile(cp_from, cp_to)

                index = index + 1
    return index_path

def getClassToIndexMapping(path: str):
    """Read an index file and return its lines as a list of class names.

    Position ``k`` of the returned list is line ``k`` of the file with the
    newline removed, i.e. the class name recorded for image ``k``.

    Args:
        path: Path of the index text file.

    Returns:
        List of class names, one entry per line of the file.

    Raises:
        FileNotFoundError: If ``path`` is not an existing file.
    """
    print(path)
    if not os.path.isfile(path):
        raise FileNotFoundError(f"index file not found: {path}")
    # `with` guarantees the handle is closed even if reading fails midway.
    with open(path, 'r') as index_file:
        return [line.replace("\n", "") for line in index_file]


In [4]:
from skimage import io
import numpy as np
import time
import zipfile
from PIL import Image

def transformTrainImage(img: Image.Image) -> torch.Tensor:
    """Apply the training preprocessing to one image and return a tensor.

    Pipeline: ``Resize(256)`` -> ``ToTensor()`` -> ``Normalize`` with the
    per-channel mean/std given below. No crop is applied, so the spatial size of
    the result depends on the input's aspect ratio.

    Args:
        img: Input image; the three-channel ``Normalize`` requires a
            3-channel image.

    Returns:
        The normalized image tensor produced by the pipeline (not a PIL image).
    """
    trans = transforms.Compose([
        transforms.Resize(256),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                            std=[0.229, 0.224, 0.225])
    ])
    return trans(img)

def transformValImage(img: Image.Image) -> torch.Tensor:
    """Apply the validation preprocessing to one image and return a tensor.

    Pipeline: ``Resize(256)`` -> ``CenterCrop(224)`` -> ``ToTensor()`` ->
    ``Normalize`` with the per-channel mean/std given below.

    Args:
        img: Input image; the three-channel ``Normalize`` requires a
            3-channel image.

    Returns:
        The normalized, center-cropped image tensor (not a PIL image).
    """
    trans = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                            std=[0.229, 0.224, 0.225])
    ])
    return trans(img)

def transformAllImages(path, set_type: str) -> None:
    """Preprocess every image in ``path`` and overwrite it in place.

    Each file in ``path`` is opened, converted to RGB if it is in any other
    mode, run through ``transformValImage`` or ``transformTrainImage``
    (selected by ``set_type``), converted back to a PIL image and saved under
    its original path.

    Args:
        path: Directory containing only image files.
        set_type: ``"val"`` or ``"train"``. Any other value makes the function
            return without touching the files.

    Raises:
        NotADirectoryError: If ``path`` is ``None`` or not an existing directory.
    """
    if path is None or not os.path.isdir(path):
        raise NotADirectoryError(f"image directory not found: {path}")

    if set_type == "val":
        transform = transformValImage
    elif set_type == "train":
        transform = transformTrainImage
    else:
        # Unknown split names are ignored rather than rejected.
        return

    to_pil = transforms.ToPILImage()

    for img_name in os.listdir(path):
        img_path = os.path.join(path, img_name)
        with PIL.Image.open(img_path) as src_img:
            # Normalize() expects exactly three channels, so grayscale, palette,
            # RGBA, CMYK ... inputs are all brought to RGB first.
            rgb_img = src_img if src_img.mode == "RGB" else src_img.convert("RGB")
            # NOTE(review): the transform ends in Normalize(), so the tensor holds
            # values outside [0, 1]; ToPILImage presumably cannot represent those
            # faithfully in an 8-bit image. Confirm that storing normalized
            # pixels on disk is intended.
            out_img = to_pil(transform(rgb_img))
        # Write only after the source handle is closed; the file is replaced.
        out_img.save(img_path)

def generateDatasetZipArchive(base_path: str, files_path: str, prefix: str, set_type: str) -> None:
    """Pack every entry of ``files_path`` into ``<base_path>/<prefix><set_type>.zip``.

    Entries are stored under their bare file name (no directory component) and
    without compression. An archive already present at the target location is
    deleted before the new one is written.
    """
    # Snapshot the directory listing before anything is removed or created.
    members = os.listdir(files_path)
    archive_path = os.path.join(base_path, "".join((prefix, set_type, '.zip')))

    # Start from a clean slate if an older archive is lying around.
    if os.path.isfile(archive_path):
        os.remove(archive_path)

    with zipfile.ZipFile(archive_path, mode='w') as archive:
        for member in members:
            archive.write(os.path.join(files_path, member), arcname=member)

def generateNewImageDataset(from_base: str, to_base: str, set_type: str, ratio: float = 1/8) -> None:
    """Rebuild the reduced dataset for one split from scratch.

    Steps: remove the previous output for ``set_type`` (image folder and index
    file) below ``to_base``, copy ``ratio`` of every class from ``from_base``,
    preprocess the copied images in place, and pack them into
    ``<to_base>/index-<set_type>.zip``.

    Args:
        from_base: Root of the full dataset (contains the ``set_type`` folder).
        to_base: Root of the reduced dataset to (re)create.
        set_type: ``'val'`` or ``'train'``.
        ratio: Fraction of each class to keep; defaults to one eighth.

    Raises:
        ValueError: If ``set_type`` is neither ``'val'`` nor ``'train'``.
    """
    if set_type not in ('val', 'train'):
        raise ValueError(f'{set_type} is not supported')

    # Everything is derived from the `to_base` argument, not from a global path.
    target_path = os.path.join(to_base, set_type)
    index_path = os.path.join(to_base, f"index-{set_type}.txt")

    # Clean up leftovers of an earlier run; each piece may or may not exist.
    if os.path.isdir(target_path):
        if IS_DEBUG:
            print(f"Removing existing directory: {target_path}")
        shutil.rmtree(target_path)
    if os.path.isfile(index_path):
        os.remove(index_path)

    processImagesByRatio(ratio, from_base, to_base, set_type)
    transformAllImages(target_path, set_type)
    generateDatasetZipArchive(to_base, target_path, 'index-', set_type)



In [50]:
class ZippedDataset(torch.utils.data.Dataset):
    """Dataset that serves images from a zip archive plus an index file.

    The index file (read with ``getClassToIndexMapping``) has one class name per
    line; line ``k`` is the class of the archive member ``k.jpg``. Class names are
    mapped to integer labels by their position in the sorted list of distinct
    names.

    Attributes:
        img_class_mapping: Class name per image, in index order.
        archive: Archive handle returned by ``db.open_zip_archive``.
        class_to_label: Sorted distinct class names; a class's label is its
            position in this list.
        transforms: Callable applied to each decoded image.
    """
    # Class-level placeholders; every instance rebinds them in __init__.
    img_class_mapping = []
    archive = None
    class_to_label = []

    def __init__(self, arch_path: str, index_path: str, transforms=transforms.ToTensor()):
        """Open the archive and build the label table.

        Args:
            arch_path: Path of the zip archive with the images.
            index_path: Path of the index file listing one class per image.
            transforms: Callable applied to every image; must not be ``None``.

        Raises:
            ValueError: If ``transforms`` is ``None``.
        """
        super(ZippedDataset, self).__init__()
        if transforms is None:
            raise ValueError("Transforms must be set at least to ToTensor() at the end")
        # load index
        self.img_class_mapping = getClassToIndexMapping(index_path)
        self.archive = db.open_zip_archive(arch_path)
        self.transforms = transforms

        self.class_to_label = sorted(set(self.img_class_mapping))

        # Resolve every image's integer label once, so __getitem__ does not
        # search the class list for every sample.
        label_of_class = {name: label for label, name in enumerate(self.class_to_label)}
        self._labels = [label_of_class[name] for name in self.img_class_mapping]

    def __len__(self) -> int:
        """Return the number of images listed in the index file."""
        return len(self.img_class_mapping)

    def __getitem__(self, index: int):
        """Return ``(transformed_image, label)`` for image ``index``.

        Raises:
            IndexError: If ``index`` is past the end of the index file.
        """
        # Looking up the label first makes an out-of-range index fail with
        # IndexError before the archive is touched.
        label = self._labels[index]
        # NOTE(review): members are always requested as '<index>.jpg'; images
        # stored under another extension would not be found — confirm the
        # archive only contains .jpg members.
        img_data = self.archive.read_jpg_as_numpy(f'{index}.jpg')
        img_data = self.transforms(img_data)
        return (img_data, label)


    

In [71]:
import gc
#val_idx_filename = os.path.join(T_BASE_PATH, "index-val.txt")
#train_idx_filename = os.path.join(T_BASE_PATH, "index-train.txt")

#val_mapping = getClassToIndexMapping(val_idx_filename)
#train_mapping = getClassToIndexMapping(train_idx_filename)

#print(len(val_mapping))
#print(len(set(val_mapping)))
#print(len(train_mapping))
#print(len(set(train_mapping)))

#val_arch_path = os.path.join(to_path, "imagenet_red_val.zip")

#print(val_arch_path)
#val_archive = db.open_zip_archive(val_arch_path)


#filename = processImagesByRatio(1/8, from_path, to_path, "val")
#filename = processImagesByRatio(1/8, from_path, to_path, "train")

#generateNewImageDataset(from_path, to_path, 'train')

def benchmarkDataloader(loader, max_batches: int = 1000, verbose: bool = True) -> float:
    """Measure how long a data loader needs to deliver its batches.

    Iterates over at most ``max_batches`` batches of ``loader`` and sums the
    wall-clock time spent waiting for each batch.

    Args:
        loader: Iterable yielding ``(img, label)`` batches where ``img.shape[0]``
            is the batch size.
        max_batches: Upper bound on the number of batches to time.
        verbose: Print one line per batch when True.

    Returns:
        Average load time per sample in seconds over the samples that were
        actually loaded (0.0 if nothing was loaded).
    """
    total_time = 0.0
    samples_seen = 0
    start = time.time()
    # zip() with range() stops before fetching a batch beyond the limit.
    for i, (img, label) in zip(range(max_batches), loader):
        stop = time.time() - start
        if verbose:
            print(f"{i}-{len(loader)}: {img.shape} - {label} - load time: {stop} sec")
        total_time += stop
        samples_seen += img.shape[0]
        # Restart the clock so the time spent printing is not counted.
        start = time.time()
    return total_time / samples_seen if samples_seen else 0.0


train_dataset = ZippedDataset(os.path.join(to_path, 'index-train.zip'), os.path.join(to_path, 'index-train.txt'))
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Maximum number of batches timed per benchmark run.
MAX = 1000

val_dataset = ZippedDataset(os.path.join(to_path, 'index-val.zip'), os.path.join(to_path, 'index-val.txt'))
loader = DataLoader(val_dataset, batch_size=8, shuffle=True)

avg = benchmarkDataloader(loader, MAX)
print(f"Avg load time {avg} sec")

# Module-level timestamp, kept so code that reads `start` after this cell
# still finds a reference point.
start = time.time()

/home/alex/Documents/pythonProjects/anytimeDnn/data/imagenet_red/index-train.txt
/home/alex/Documents/pythonProjects/anytimeDnn/data/imagenet_red/index-val.txt
0-60: torch.Size([8, 3, 224, 224]) - tensor([36, 15, 16, 27, 24,  3, 20,  2]) - load time: 0.02040719985961914 sec
1-60: torch.Size([8, 3, 224, 224]) - tensor([ 1,  5, 31, 19, 38, 26,  4, 22]) - load time: 0.017916440963745117 sec
2-60: torch.Size([8, 3, 224, 224]) - tensor([10,  9, 37, 30,  1, 23, 37, 30]) - load time: 0.014299154281616211 sec
3-60: torch.Size([8, 3, 224, 224]) - tensor([10, 23, 18, 33, 21, 18, 12, 19]) - load time: 0.011503934860229492 sec
4-60: torch.Size([8, 3, 224, 224]) - tensor([14,  1, 12, 24,  5, 37, 37, 36]) - load time: 0.012126922607421875 sec
5-60: torch.Size([8, 3, 224, 224]) - tensor([36, 20, 11, 31, 37,  5, 21,  0]) - load time: 0.011279582977294922 sec
6-60: torch.Size([8, 3, 224, 224]) - tensor([16,  1, 11, 13, 35, 14, 10, 29]) - load time: 0.01688218116760254 sec
7-60: torch.Size([8, 3, 224, 2

In [69]:
gc.collect()

# Time the train loader: sum the wait for each batch and report the average
# per loaded sample.
avg = 0.0
samples_seen = 0
# Start the clock in this cell so the first measurement does not include the
# time elapsed since some earlier cell last set `start`.
start = time.time()
# zip() with range() stops after MAX batches without fetching an extra one.
for i, (img, label) in zip(range(MAX), train_loader):
    stop = time.time() - start
    #print(f"{i}-{len(train_loader)}: {img.shape} - {label} - load time: {stop} sec")
    avg += stop
    samples_seen += img.shape[0]
    start = time.time()
# Divide by the samples actually loaded, which stays correct when the loop
# stops at MAX before the dataset is exhausted.
print(f"avg load time: {avg / max(samples_seen, 1)} sec")

 0.01531982421875 sec
384-552: torch.Size([8, 3, 224, 224]) - tensor([32,  8,  4,  8, 39, 22, 30, 28]) - load time: 0.016425609588623047 sec
385-552: torch.Size([8, 3, 224, 224]) - tensor([32,  9, 34,  5, 19,  5, 25,  3]) - load time: 0.016783714294433594 sec
386-552: torch.Size([8, 3, 224, 224]) - tensor([29,  3, 10, 23, 15,  7, 20, 30]) - load time: 0.04177689552307129 sec
387-552: torch.Size([8, 3, 224, 224]) - tensor([33, 38, 18, 12, 10, 28, 17, 21]) - load time: 0.015385627746582031 sec
388-552: torch.Size([8, 3, 224, 224]) - tensor([28, 19, 21, 33,  4, 27, 39, 29]) - load time: 0.0161740779876709 sec
389-552: torch.Size([8, 3, 224, 224]) - tensor([ 0,  7, 13,  7, 32,  5, 18,  6]) - load time: 0.013994932174682617 sec
390-552: torch.Size([8, 3, 224, 224]) - tensor([16, 24, 17, 25, 12, 34, 38, 38]) - load time: 0.015782833099365234 sec
391-552: torch.Size([8, 3, 224, 224]) - tensor([39, 17, 24, 23,  4,  3, 26, 14]) - load time: 0.014388322830200195 sec
392-552: torch.Size([8, 3, 22