### Developing the DataGenerator

In [59]:
import os
import numpy as np
import pandas as pd
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.utils import Sequence

In [126]:
# Modified from https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly
class DataGenerator(Sequence):
    '''Generates .npy files for Conv2D'''
    def __init__(self, data_dir, batch_size=32, dim=(128,640), 
                 n_channels=1, shuffle=True):
        '''
        Parameters
        ----------
        data_dir : str
            Path to data split (training, validation, or test)
        batch_size : int
            Number of files to return at a time
        dim : tuple
            Dimension of arrays to read in
        n_channels : int
            Number of color channels for image array
        shuffle : bool
            Shuffle indices between epochs
        '''
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.dim = dim
        self.n_channels = n_channels
        self.shuffle = shuffle
        
        self.label_dict = self.__get_label_dict()
        self.files = self.__get_files()
        self.n_classes = len(self.label_dict)   # Number of sub dirs
        self.on_epoch_end()                     # populates self.indexes

    def __len__(self):
        '''Denotes the number of batches per epoch'''
        return int(np.floor(len(self.files) / self.batch_size))

    def __getitem__(self, index):
        '''Generate one batch of data'''
        # Generate indexes of the batch
        idxs = self.indexes[index*self.batch_size : (index+1)*self.batch_size]

        # Find list of IDs
        file_list = self.files[idxs]

        # Generate data
        X, y = self.__data_generation(file_list)

        return X, y

    def on_epoch_end(self):
        '''Updates indexes after each epoch'''
        self.indexes = np.arange(len(self.files))
        if self.shuffle == True:
            np.random.shuffle(self.indexes) # Shuffles in place
            
    def __get_files(self):
        '''Get all files from subdirectories of data_dir'''
        subdirs = sorted(os.listdir(self.data_dir))
        all_files = []

        for subdir in subdirs:
            full_dir = os.path.join(self.data_dir, subdir)
            files = os.listdir(full_dir)
            for file in files:
                all_files.append(os.path.join(subdir, file))

        return np.array(all_files)

    def __get_label_dict(self):
        '''
        Create dict of labels from sub directories
        {Genre : int}
        '''
        subdirs = sorted(os.listdir(self.data_dir))
        labels = np.arange(len(subdirs))
        return {k:v for k,v in zip(subdirs, labels)}
    
    def __data_generation(self, file_list):
        '''
        Generates data containing batch_size samples
        
        Parameters
        ----------
        file_list : list or np.array
            List of files to retrieve/process/load
        
        Returns
        -------
        X : (n_samples, *dim, n_channels)
        
        '''  
        X = np.empty((self.batch_size, *self.dim, self.n_channels))
        y = np.empty((self.batch_size), dtype=int)

        for i, file in enumerate(file_list):
            npy = np.load(os.path.join(self.data_dir, file))
            target = file.split('/')[0]
            label = label_dict[target]
            X[i,] = npy[:,:,None]   # Create extra dim for channel
            y[i,] = label

        return X, to_categorical(y, num_classes=self.n_classes, dtype='int')

In [127]:
datagen = DataGenerator('data/test', batch_size=32, dim=(128,640), n_channels=1, shuffle=True)

In [140]:
len(datagen)

9

In [132]:
i = datagen.__getitem__(1)

In [139]:
i[0].shape

(32, 128, 640, 1)

In [137]:
i[1][:4]

array([[1, 0, 0],
       [1, 0, 0],
       [0, 1, 0],
       [1, 0, 0]])

In [138]:
datagen.label_dict

{'Hip-Hop': 0, 'Instrumental': 1, 'Rock': 2}

## Scrap

In [121]:
subdirs = sorted(os.listdir(filepath))
labels = np.arange(len(subdirs))

In [124]:
{k:v for k,v in zip(subdirs, labels)}

{'Hip-Hop': 0, 'Instrumental': 1, 'Rock': 2}

In [72]:
label_dict = {
    'Rock': 0,
    'Instrumental': 1,    
    'Hip-Hop': 2,
    'Folk': 3,
    'International': 4,            
    'Electronic': 5,
    'Experimental': 6,   
    'Pop': 7 
}

In [73]:
len(label_dict)

8

In [97]:
# Create list of filenames in a directory

filepath = 'data/test'
def __get_files(filepath):
    subdirs = os.listdir(filepath)
    all_files = []

    for subdir in subdirs:
        full_dir = os.path.join(filepath, subdir)
        files = os.listdir(full_dir)
        for file in files:
            all_files.append(os.path.join(subdir, file))

    return np.array(all_files)

In [98]:
all_files = __get_files(filepath)

In [101]:
idxs = [0,33,99]
all_files[idxs]

array(['Hip-Hop/132117.npy', 'Hip-Hop/110771.npy', 'Hip-Hop/140626.npy'],
      dtype='<U23')

In [105]:
batch_size = 5

npy = np.load(os.path.join(filepath, all_files[0]))
shape = npy.shape

channels = 1
n_classes = 2

In [112]:
def __data_generation(all_files):
    X = np.empty((batch_size, *shape, channels))
    y = np.empty((batch_size), dtype=int)
    
    for i, file in enumerate(all_files[:batch_size]):
        npy = np.load(os.path.join(filepath, file))
        target = file.split('/')[0]
        label = label_dict[target]
        X[i,] = npy[:,:,None]
        y[i,] = label
    
    return X, y
# to_categorical(y, num_classes=n_classes, dtype='int')

In [113]:
X, y = __data_generation(all_files)

In [114]:
y

array([2, 2, 2, 2, 2])

In [119]:
to_categorical(y, num_classes=5)

array([[0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.]], dtype=float32)

In [108]:
X = np.empty((batch_size, *shape, channels))

In [109]:
all_files[0].split('/')[0]

'Hip-Hop'