In [None]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that loads image/label batches listed in a DataFrame.

    The DataFrame must have an ``image`` column (path to an image file) and a
    ``label`` column (path to a CSV file). ``gen_df`` builds such a frame from
    a directory containing an ``images`` folder and a ``labels`` folder, where
    image ``images/<name>.<ext>`` is paired with ``labels/<name>.csv``.
    """

    def __init__(self, df, batch_size=32, shuffle=True):
        """Store the sample table and perform the initial shuffle.

        Args:
            df: DataFrame with ``image`` and ``label`` path columns.
            batch_size: Number of samples per batch.
            shuffle: If true, rows are reshuffled now and after every epoch.
        """
        # Keras expects Sequence subclasses to initialise the base class.
        super().__init__()
        self.batch_size = batch_size
        self.df = df
        self.indices = range(len(df))
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches per epoch.

        Floor division is used, so a trailing partial batch is not served.
        """
        return len(self.df) // self.batch_size

    def __getitem__(self, index):
        """Return the ``(X, y)`` arrays for batch number ``index``."""
        start = index * self.batch_size
        indices = self.indices[start:start + self.batch_size]

        return self.get_data(indices)

    def on_epoch_end(self):
        """Reshuffle the rows of the sample table when shuffling is enabled."""
        if self.shuffle:
            # Positional indices stay fixed; the rows behind them are permuted.
            self.df = self.df.sample(frac=1).reset_index(drop=True)

    def get_index(self, idx):
        """Load one sample by positional row index.

        Returns:
            Tuple ``(X, y)``: the image as an array divided by 255.0, and the
            label CSV's values with the first column dropped, flattened to 1-D.
        """
        # Read from this generator's own (possibly shuffled) frame, not a global.
        row = self.df.iloc[idx]
        # Image.open is lazy; convert inside the context manager so the pixel
        # data is read before the underlying file handle is closed.
        with Image.open(row.image) as img:
            X = np.array(img) / 255.0
        y = pd.read_csv(row.label).values[:, 1:].flatten()

        return X, y

    def get_data(self, indices):
        """Assemble a batch from the given positional row indices.

        Returns:
            Tuple ``(X, y)`` with ``X`` of shape ``(len(indices), HEIGHT,
            WIDTH, 3)`` and ``y`` of shape ``(len(indices), 36)``.
        """
        X = np.empty((len(indices), HEIGHT, WIDTH, 3))
        y = np.empty((len(indices), 36))

        for i, index in enumerate(indices):
            X[i], y[i] = self.get_index(index)

        return X, y

    @staticmethod
    def gen_df(directory):
        """Build the image/label path table for ``directory``.

        Every file in ``<directory>/images`` is paired with
        ``<directory>/labels/<stem>.csv``, where ``<stem>`` is the image file
        name without its extension. Label files are not checked for existence.
        """
        # Sort so the resulting frame does not depend on filesystem order.
        images = sorted(glob(os.path.join(directory, 'images/*')))
        labels = []
        for image in images:
            num = os.path.splitext(os.path.basename(image))[0]
            labels.append(os.path.join(directory, f'labels/{num}.csv'))
        return pd.DataFrame({'image': images, 'label': labels})

    @staticmethod
    def splits(df, train_size=0.5):
        """Split ``df`` with ``train_test_split`` and wrap each part.

        Returns:
            Tuple ``(train_generator, test_generator)``, both built with the
            default batch size and shuffling.
        """
        train, test = train_test_split(df, train_size=train_size)
        return DataGenerator(train), DataGenerator(test)