In [19]:
import pandas as pd
import numpy as np
import h5py
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [20]:
# Convert the Iris CSV into an HDF5 file with two datasets:
#   'features' -> float32, shape (num_lines, num_features)
#   'labels'   -> int32,   shape (num_lines,)
# The CSV is streamed in blocks of `chunksize` rows so the whole file never
# has to be held in memory at once.
csv_path = 'iris.data'
num_lines = 150
num_features = 4
class_dict = {'Iris-setosa': 0,
              'Iris-versicolor': 1,
              'Iris-virginica': 2}
chunksize = 10

# NOTE: mode 'w' truncates iris.h5. h5py raises OSError ("unable to truncate a
# file which is already open") if another handle to the file is still open in
# this process, so close any such handle before re-running this cell.
with h5py.File('iris.h5', 'w') as h5f:
    dset1 = h5f.create_dataset('features',
                               shape=(num_lines, num_features),
                               compression=None,
                               dtype='float32')
    dset2 = h5f.create_dataset('labels',
                               shape=(num_lines,),
                               compression=None,
                               dtype='int32')

    for i in range(0, num_lines, chunksize):
        # Never request rows beyond num_lines (matters when chunksize does
        # not divide num_lines evenly).
        rows_wanted = min(chunksize, num_lines - i)
        df = pd.read_csv(csv_path, header=None,
                         nrows=rows_wanted, skiprows=i)

        # The class name sits in the column right after the feature columns.
        labels = df[num_features].map(class_dict)
        if labels.isna().any():
            # .map() yields NaN for names missing from class_dict; fail loudly
            # instead of writing a meaningless integer.
            raise ValueError(
                'Unknown class label in rows %d-%d of %s'
                % (i, i + len(df) - 1, csv_path))

        features = df.iloc[:, :num_features].values
        stop = i + len(df)
        dset1[i:stop, :] = features
        # Write the per-row labels. The previous code assigned labels[0],
        # which broadcast the first label of the chunk to every row in it.
        dset2[i:stop] = labels.values.astype('int32')

OSError: Unable to create file (unable to truncate a file which is already open)

In [21]:
# Sanity check: reopen the file read-only and print each dataset's shape.
with h5py.File('iris.h5', mode='r') as h5f:
    for dset_name in ('features', 'labels'):
        print(h5f[dset_name].shape)

(150, 4)
(150,)


In [14]:
# Spot check: read back one stored row and its class label.
with h5py.File('iris.h5', mode='r') as h5f:
    entry_features = h5f['features'][99]
    entry_label = h5f['labels'][99]
    print('Features of entry no. 99:', entry_features)
    print('Class label of entry no. 99:', entry_label)

Features of entry no. 99: [5.7 2.8 4.1 1.3]
Class label of entry no. 99: 1


In [15]:
class Hdf5Dataset(Dataset):
    """Map-style dataset yielding ``(features, label)`` pairs from an HDF5 file.

    The file must contain a ``'features'`` dataset and a ``'labels'`` dataset
    indexed along their first axis.

    The file handle is opened lazily on the first ``__getitem__`` call rather
    than in ``__init__``. That way each DataLoader worker process opens its
    own handle instead of inheriting one from the parent, and merely
    constructing the dataset does not keep the file open (an open handle
    prevents the file from being re-created with mode ``'w'``).

    Args:
        h5_path: Path of the HDF5 file to read.
        transform: Optional callable applied to the features of each item.

    Attributes:
        h5f: The open ``h5py.File`` handle, or ``None`` while closed.
        num_entries: Number of items, taken from the ``'labels'`` dataset.
    """

    def __init__(self, h5_path, transform=None):
        self.h5_path = h5_path
        self.transform = transform
        self.h5f = None  # opened on demand, see _ensure_open()
        # Only the length is needed up front; release the file right away.
        with h5py.File(h5_path, 'r') as h5f:
            self.num_entries = h5f['labels'].shape[0]

    def _ensure_open(self):
        """Return the file handle, opening it read-only on first use."""
        if self.h5f is None:
            self.h5f = h5py.File(self.h5_path, 'r')
        return self.h5f

    def __getitem__(self, index):
        """Return ``(features, label)`` for ``index``; transform hits features only."""
        h5f = self._ensure_open()
        features = h5f['features'][index]
        label = h5f['labels'][index]
        if self.transform is not None:
            features = self.transform(features)
        return features, label

    def __len__(self):
        """Return the number of entries in the ``'labels'`` dataset."""
        return self.num_entries

    def close(self):
        """Close the file handle if open; the dataset reopens it on next access."""
        if self.h5f is not None:
            self.h5f.close()
            self.h5f = None

    def __getstate__(self):
        """Drop the open handle when pickled (e.g. for spawned workers)."""
        state = self.__dict__.copy()
        state['h5f'] = None
        return state

In [16]:
# Wrap the HDF5 file in a dataset and serve it in shuffled mini-batches of 16,
# loaded by 4 worker processes.
train_dataset = Hdf5Dataset(h5_path='iris.h5', transform=None)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=16,
    shuffle=True,
    num_workers=4,
)

In [17]:
# Iterate over the loader for a few epochs, reporting each batch and moving it
# to the compute device (first GPU when available, otherwise the CPU).
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

torch.manual_seed(0)
num_epochs = 5

for epoch in range(num_epochs):
    for batch_idx, (x, y) in enumerate(train_loader):
        # One line per batch: epoch number (1-based), batch index, batch size.
        print('Epoch:', f' {epoch + 1}',
              ' | Batch index:', f' {batch_idx}',
              ' | Batch size:', f' {y.shape[0]}',
              sep='')
        x = x.to(device)
        y = y.to(device)

Epoch: 1 | Batch index: 0 | Batch size: 16
Epoch: 1 | Batch index: 1 | Batch size: 16
Epoch: 1 | Batch index: 2 | Batch size: 16
Epoch: 1 | Batch index: 3 | Batch size: 16
Epoch: 1 | Batch index: 4 | Batch size: 16
Epoch: 1 | Batch index: 5 | Batch size: 16
Epoch: 1 | Batch index: 6 | Batch size: 16
Epoch: 1 | Batch index: 7 | Batch size: 16
Epoch: 1 | Batch index: 8 | Batch size: 16
Epoch: 1 | Batch index: 9 | Batch size: 6
Epoch: 2 | Batch index: 0 | Batch size: 16
Epoch: 2 | Batch index: 1 | Batch size: 16
Epoch: 2 | Batch index: 2 | Batch size: 16
Epoch: 2 | Batch index: 3 | Batch size: 16
Epoch: 2 | Batch index: 4 | Batch size: 16
Epoch: 2 | Batch index: 5 | Batch size: 16
Epoch: 2 | Batch index: 6 | Batch size: 16
Epoch: 2 | Batch index: 7 | Batch size: 16
Epoch: 2 | Batch index: 8 | Batch size: 16
Epoch: 2 | Batch index: 9 | Batch size: 6
Epoch: 3 | Batch index: 0 | Batch size: 16
Epoch: 3 | Batch index: 1 | Batch size: 16
Epoch: 3 | Batch index: 2 | Batch size: 16
Epoch: 3 | Ba