In [1]:
"""
Script defining EvilMouDataSet Class and loaders to be used along with VAE model.
"""
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import h5py
from pathlib import Path

In [2]:
class EvilMouDataSet(Dataset):
    """
    Sliding-window dataset over camera frames, to be used with VAE model.

    Every ``frame*.h5`` file found (recursively) under ``data_dir`` is opened,
    its ``'cam1'`` dataset is read fully into memory, and all of them are
    concatenated along the last axis. One sample is ``window`` consecutive
    slices along that last axis, min/max-normalised with the global min and max.

    This is NOT efficient at all in terms of mem usage: the whole dset lives in RAM.
    You might wish to keep only a list of .h5 files in the dset and, in the
    __getitem__ method, read just the file and the slices you need.

    Args:
        data_dir: directory searched recursively for ``frame*.h5`` files.
        transform: optional callable applied to the sample dict
            ``{'frame': array}`` before it is returned.
        window: number of consecutive frames per sample (default 31, the value
            that used to be hard-coded).

    Raises:
        FileNotFoundError: if no ``frame*.h5`` file exists under ``data_dir``.
        ValueError: if ``window`` is smaller than 1.
    """

    def __init__(self, data_dir, transform=None, window=31):
        if window < 1:
            raise ValueError('window must be >= 1, got {}'.format(window))

        # Collect all file names for files containing different frames.
        # rglob order depends on the filesystem, so sort to get a reproducible
        # concatenation order.
        # NOTE(review): the sort is lexicographic on the full path - confirm it
        # matches the recording order of your files (e.g. frame10 vs frame2).
        frame_files = sorted(str(path) for path in Path(data_dir).rglob('frame*.h5'))
        if not frame_files:
            raise FileNotFoundError(
                "no 'frame*.h5' files found under {!r}".format(str(data_dir)))

        # Read every file read-only and close it right away; '[:]' copies the
        # dataset into a numpy array, so nothing refers to the file afterwards.
        chunks = []
        for file_name in frame_files:
            with h5py.File(file_name, 'r') as h5_file:
                chunks.append(h5_file['cam1'][:])

        # Concatenate once over the last axis instead of re-copying the growing
        # array for every file. For the original data this was expected to give
        # an array with all frames in dset, shape (160, 120, 89900).
        if len(chunks) == 1:
            all_data = chunks[0]
        else:
            all_data = np.concatenate(chunks, axis=2)
        del chunks  # drop the per-file copies before keeping the full array

        self.df = all_data
        # Global statistics; computed on the array directly (no flatten() copy).
        self.max = all_data.max()
        self.min = all_data.min()
        self.mean = all_data.mean()
        self.transform = transform
        self.window = window

    def __len__(self):
        """
        Returns number of samples in dset (0 if there are not enough frames).
        """
        # NOTE(review): the window starting at (n_frames - window) is also a
        # full window but is not exposed; the count is kept as in the original
        # implementation so the dataset length does not change.
        return max(0, int(self.df.shape[2]) - self.window)

    def __getitem__(self, idx):
        """
        Returns a single sample from dset.

        The sample is a dict ``{'frame': array}`` holding ``self.window``
        consecutive frames starting at ``idx``, scaled with the global
        min/max of the whole dset, and passed through ``self.transform``
        when one was given.
        """
        time_start = idx
        # window frames at a time; the default 31 might be too much for your data!
        time_end = time_start + self.window
        frame = self.df[:, :, time_start:time_end]
        # min/max norm (global); undefined if the dset is constant (max == min)
        scld_frame = np.true_divide((frame - self.min), (self.max - self.min))
        sample = {'frame': scld_frame}
        if self.transform:
            sample = self.transform(sample)
        return sample

class ToTensor:
    """
    Callable transform: turns the numpy array stored under 'frame' into a
    float tensor which can be directly fed to model.
    """
    def __call__(self, sample):
        """
        Returns a new dict holding only 'frame', converted to a float tensor.
        """
        as_tensor = torch.from_numpy(sample['frame'])
        return {'frame': as_tensor.float()}

def setup_data_loaders(batch_size=64, shuffle=(True, False), data_dir=''):
    """
    Builds one EvilMouDataSet from data_dir and wraps it in two DataLoaders.

    There is no train/test split for now: both loaders iterate over the very
    same dataset and differ only in their shuffle flag (shuffle[0] for 'train',
    shuffle[1] for 'test'). Such a split is not as useful for model eval here
    as it is in supervised settings.

    Returns a dict with keys 'train', 'test' (the loaders) and 'dset' (the dataset).
    """
    dataset = EvilMouDataSet(data_dir=data_dir, transform=ToTensor())
    loaders = {
        split: DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle[position],
            num_workers=0,
        )
        for position, split in enumerate(('train', 'test'))
    }
    loaders['dset'] = dataset
    return loaders
