## Imports

In [1]:
import os

import librosa
import numpy as np
from sklearn.preprocessing import StandardScaler

## Load Data

In [2]:
# Genre folders, in label order: the position in this list is the class id.
all_genres = [
    'blues', 'classical', 'country', 'disco', 'hiphop',
    'jazz', 'metal', 'pop', 'reggae', 'rock',
]
num_songs = 100      # tracks read per genre
sr = 22_050          # sample rate in Hz
Y_LIMIT = 660_000    # every waveform is truncated to this many samples

In [3]:
class DataLoader():
    '''
    Extracts, caches (as CSV) and splits MFCC / chroma features.

    Features are computed once with the save_* methods, which write CSVs under
    DATA_DIR, and are read back with the load_* methods. Each feature family
    (C = 13 MFCC coefficients or C = 12 chroma bins) exists in three views:

      * full track          -- N x C x 1290
      * fixed crops         -- N*SEG x C x 129, SEG consecutive non-overlapping segments
      * padded random crops -- N*SEG x C x 1290, SEG random 1200-frame windows,
                               zero-padded on the right back to 1290 frames

    Cropped arrays keep the order of the uncropped ones: rows
    i*SEG ... i*SEG+SEG-1 of a cropped array all come from track i.
    '''

    N_SAMPLES = 1000          # 10 genres x 100 tracks
    SONGS_PER_GENRE = 100     # tracks are stored genre by genre, so label = index // 100
    N_FRAMES = 1290           # frames per track (13 * 1290 = 16770 values per MFCC row)
    N_MFCC = 13               # MFCC coefficients per frame
    N_CHROMA = 12             # chroma bins per frame
    HOP_LENGTH = 512          # hop between analysis frames, in samples
    DATA_DIR = 'data'         # directory holding the cached CSV files

    def __init__(self):
        '''
        Initializes the DataLoader.

        Forces determinism by calling np.random.seed(1), so every instance draws
        the same random crop offsets and the same train/test partition.

        self.SEG is the number of fixed-length segments, and the number of random crops to take
        self.SEG_LENGTH is the length of each fixed-length segment
        self.RANDOM_SEG_LENGTH is the length of each random crop
        self.PAD is the number of zero frames appended to each random crop
        self.RANDOM_STARTS[i][j] is the start frame of the j-th random crop of track i

        self.train_idxs is a sorted array of 750 training indices
        self.test_idxs is a sorted array of the remaining 250 testing indices

        self.train_crop_idxs is an array of 750*self.SEG crop indices, corresponding to train_idxs
        self.test_crop_idxs  is an array of 250*self.SEG crop indices, corresponding to test_idxs

        eg. train_idxs      = [1, 2, 4, ...]
            train_crop_idxs = [10 ... 19, 20 ... 29, 40 ... 49, ...]
        '''
        # NOTE: this reseeds NumPy's *global* RNG. The two draws below must keep
        # their order (crop offsets first, then the split), otherwise the split
        # no longer matches CSVs written by an earlier run.
        np.random.seed(1)
        self.X_mfcc = None
        self.X_mfcc_random_crop = None
        self.X_mfcc_fixed_crop = None

        self.X_chroma = None
        self.X_chroma_random_crop = None
        self.X_chroma_fixed_crop = None

        self.Y = None
        self.Y_crop = None

        self.SEG = 10   # Must evenly divide N_FRAMES (1290)
        self.SEG_LENGTH = self.N_FRAMES // self.SEG
        self.RANDOM_SEG_LENGTH = 1200
        self.PAD = self.N_FRAMES - self.RANDOM_SEG_LENGTH
        # randint's `high` is exclusive, so start frames lie in 0 .. PAD-1.
        self.RANDOM_STARTS = np.random.randint(low=0, high=self.PAD, size=(self.N_SAMPLES, self.SEG))

        all_idxs = np.arange(self.N_SAMPLES)
        self.train_idxs = np.sort(np.random.choice(all_idxs, size=750, replace=False))
        self.test_idxs = np.setdiff1d(all_idxs, self.train_idxs)

        # Track i owns crop rows i*SEG .. i*SEG+SEG-1.
        seg_offsets = np.arange(self.SEG)
        self.train_crop_idxs = (self.train_idxs[:, None] * self.SEG + seg_offsets).ravel()
        self.test_crop_idxs = (self.test_idxs[:, None] * self.SEG + seg_offsets).ravel()

    # ------------------------------------------------------------------
    # Private helpers shared by the MFCC and chroma code paths
    # ------------------------------------------------------------------

    def _csv_path(self, filename):
        '''Returns the path of `filename` inside DATA_DIR.'''
        return os.path.join(self.DATA_DIR, filename)

    def _save_csv(self, filename, array):
        '''Writes `array` to DATA_DIR/filename with one row per sample (trailing axes flattened).'''
        # Create the directory up front so a long extraction run cannot be lost
        # to a missing output folder.
        os.makedirs(self.DATA_DIR, exist_ok=True)
        np.savetxt(self._csv_path(filename), array.reshape(array.shape[0], -1))

    def _load_csv(self, filename):
        '''Reads DATA_DIR/filename back into a 2D array.'''
        return np.loadtxt(self._csv_path(filename))

    def _extract_features(self, extractor, genres, songs):
        '''
        Loads every track, truncates it to Y_LIMIT samples and applies `extractor`.

        `extractor(y, sr)` must return a 2D (coefficients x frames) array; it is
        flattened into one row per track. Returns the standardized
        (tracks x coefficients*frames) matrix.
        '''
        rows = []
        for g in genres:
            for s_idx in range(songs):
                # Five-digit zero-padded track number, e.g. blues.00007.wav.
                y, sr = librosa.load(f'genres/{g}/{g}.{s_idx:05d}.wav')
                y = y[:Y_LIMIT]
                rows.append(extractor(y, sr).flatten())
        # NOTE(review): the scaler is fitted on all tracks, before train_test_split,
        # so test-set statistics influence the training features.
        return StandardScaler().fit_transform(np.vstack(rows))

    def _as_tensor(self, X, n_coeffs):
        '''Views features as (tracks, n_coeffs, N_FRAMES) whether X is flat (2D) or already a tensor.'''
        return X.reshape(X.shape[0], n_coeffs, self.N_FRAMES)

    def _random_crops(self, X, n_coeffs):
        '''
        Takes SEG random windows of RANDOM_SEG_LENGTH frames from every track.

        Each window is placed at the start of an all-zero (n_coeffs x N_FRAMES)
        slot, i.e. zero-padded on the right. Returns a
        (tracks*SEG x n_coeffs x N_FRAMES) array.
        '''
        frames = self._as_tensor(X, n_coeffs)
        n_tracks = frames.shape[0]
        crops = np.zeros((n_tracks * self.SEG, n_coeffs, self.N_FRAMES), dtype=frames.dtype)
        for i in range(n_tracks):
            for j in range(self.SEG):
                start = self.RANDOM_STARTS[i][j]
                crops[i * self.SEG + j, :, :self.RANDOM_SEG_LENGTH] = \
                    frames[i, :, start:start + self.RANDOM_SEG_LENGTH]
        return crops

    def _fixed_crops(self, X, n_coeffs):
        '''
        Cuts every track into SEG consecutive segments of SEG_LENGTH frames.

        Returns a (tracks*SEG x n_coeffs x SEG_LENGTH) array.
        '''
        frames = self._as_tensor(X, n_coeffs)
        n_tracks = frames.shape[0]
        usable = self.SEG * self.SEG_LENGTH   # drops a remainder if SEG does not divide N_FRAMES
        segments = frames[:, :, :usable].reshape(n_tracks, n_coeffs, self.SEG, self.SEG_LENGTH)
        # (track, coeff, seg, frame) -> (track, seg, coeff, frame), then merge track and seg.
        return segments.transpose(0, 2, 1, 3).reshape(n_tracks * self.SEG, n_coeffs, self.SEG_LENGTH)

    # ------------------------------------------------------------------
    # Saving
    # ------------------------------------------------------------------

    def save_mfcc(self, genres=all_genres, songs=num_songs):
        '''
        Saves MFCC Coefficients.
        Produces a 1000 x 16770 array (for the default genres/songs).
        '''
        assert(self.X_mfcc is None)
        # Audio is passed by keyword: newer librosa releases reject a positional `y`.
        self.X_mfcc = self._extract_features(
            lambda y, sr: librosa.feature.mfcc(y=y, sr=sr, hop_length=self.HOP_LENGTH, n_mfcc=self.N_MFCC),
            genres, songs)
        self._save_csv('X_mfcc.csv', self.X_mfcc)

    def save_mfcc_random_crop(self):
        '''
        Saves 10 random crops of MFCC for every track.

        Produces a 10000 x 13 x 1200 array, padded with zeros to 10000 x 13 x 1290.
        Reshapes into 10000 x 16770 for the CSV.
        Accepts self.X_mfcc in either the flat or the tensor layout.
        '''
        assert(self.X_mfcc_random_crop is None and self.X_mfcc is not None)
        self.X_mfcc_random_crop = self._random_crops(self.X_mfcc, self.N_MFCC)
        self._save_csv('X_mfcc_random_crop.csv', self.X_mfcc_random_crop)

    def save_mfcc_fixed_crop(self):
        '''
        Saves 10 even segments of MFCC for every track.

        Produces a 10000 x 13 x 129 array of MFCC coefficients for the segments.
        Reshapes into 10000 x 1677 for the CSV.
        Accepts self.X_mfcc in either the flat or the tensor layout.
        '''
        assert(self.X_mfcc_fixed_crop is None and self.X_mfcc is not None)
        self.X_mfcc_fixed_crop = self._fixed_crops(self.X_mfcc, self.N_MFCC)
        self._save_csv('X_mfcc_fixed_crop.csv', self.X_mfcc_fixed_crop)

    def save_chroma(self, genres=all_genres, songs=num_songs):
        '''
        Saves Chromas.
        Produces a 1000 x 15480 array (for the default genres/songs).
        '''
        assert(self.X_chroma is None)
        # Audio is passed by keyword: newer librosa releases reject a positional `y`.
        self.X_chroma = self._extract_features(
            lambda y, sr: librosa.feature.chroma_cqt(y=y, sr=sr, hop_length=self.HOP_LENGTH),
            genres, songs)
        self._save_csv('X_chroma.csv', self.X_chroma)

    def save_chroma_random_crop(self):
        '''
        Saves 10 random crops of Chromas for every track.

        Produces a 10000 x 12 x 1200 array, padded with zeros to 10000 x 12 x 1290.
        Reshapes into 10000 x 15480 for the CSV.
        Accepts self.X_chroma in either the flat or the tensor layout.
        '''
        assert(self.X_chroma_random_crop is None and self.X_chroma is not None)
        self.X_chroma_random_crop = self._random_crops(self.X_chroma, self.N_CHROMA)
        self._save_csv('X_chroma_random_crop.csv', self.X_chroma_random_crop)

    def save_chroma_fixed_crop(self):
        '''
        Saves 10 even segments of Chromas for every track.

        Produces a 10000 x 12 x 129 array of chroma values for the segments.
        Reshapes into 10000 x 1548 for the CSV.
        Accepts self.X_chroma in either the flat or the tensor layout.
        '''
        assert(self.X_chroma_fixed_crop is None and self.X_chroma is not None)
        self.X_chroma_fixed_crop = self._fixed_crops(self.X_chroma, self.N_CHROMA)
        self._save_csv('X_chroma_fixed_crop.csv', self.X_chroma_fixed_crop)

    # ------------------------------------------------------------------
    # Loading
    #
    # If X_mfcc has been saved but the run was aborted before a crop file was
    # written, call load_mfcc() and then save_mfcc_random_crop() or
    # save_mfcc_fixed_crop() (likewise for chroma).
    # Note that all load functions reshape into tensors by default.
    # ------------------------------------------------------------------

    def load_mfcc(self, tensor=True):
        '''Loads the full-track MFCCs; reshaped to N x 13 x 1290 unless tensor=False.'''
        self.X_mfcc = self._load_csv('X_mfcc.csv')
        if tensor:
            self.X_mfcc = self.X_mfcc.reshape(-1, self.N_MFCC, self.N_FRAMES)

    def load_mfcc_random_crop(self):
        '''Loads the padded random MFCC crops as an N*SEG x 13 x 1290 tensor.'''
        self.X_mfcc_random_crop = self._load_csv('X_mfcc_random_crop.csv').reshape(-1, self.N_MFCC, self.N_FRAMES)

    def load_mfcc_fixed_crop(self):
        '''Loads the fixed MFCC segments as an N*SEG x 13 x SEG_LENGTH tensor.'''
        self.X_mfcc_fixed_crop = self._load_csv('X_mfcc_fixed_crop.csv').reshape(-1, self.N_MFCC, self.SEG_LENGTH)

    def load_chroma(self, tensor=True):
        '''Loads the full-track chromas; reshaped to N x 12 x 1290 unless tensor=False.'''
        self.X_chroma = self._load_csv('X_chroma.csv')
        if tensor:
            self.X_chroma = self.X_chroma.reshape(-1, self.N_CHROMA, self.N_FRAMES)

    def load_chroma_random_crop(self):
        '''Loads the padded random chroma crops as an N*SEG x 12 x 1290 tensor.'''
        self.X_chroma_random_crop = self._load_csv('X_chroma_random_crop.csv').reshape(-1, self.N_CHROMA, self.N_FRAMES)

    def load_chroma_fixed_crop(self):
        '''Loads the fixed chroma segments as an N*SEG x 12 x SEG_LENGTH tensor.'''
        self.X_chroma_fixed_crop = self._load_csv('X_chroma_fixed_crop.csv').reshape(-1, self.N_CHROMA, self.SEG_LENGTH)

    def load_Y(self):
        '''Builds the per-track labels: 100 consecutive tracks per genre id 0..9.'''
        self.Y = np.arange(self.N_SAMPLES) // self.SONGS_PER_GENRE

    def load_Y_crop(self):
        '''Builds the per-crop labels: every track's label repeated for its SEG crops.'''
        self.Y_crop = np.arange(self.N_SAMPLES * self.SEG) // (self.SONGS_PER_GENRE * self.SEG)

    def train_test_split(self, data, is_cropped):
        '''
        Splits an X_data (or Y) into train and test sets along axis 0.

        is_cropped=True for splitting random or fixed crops, is_cropped=False for splitting original mfcc/chroma

        Training and test indices are consistent every time train_test_split is called.

        Furthermore, cropped train and test indices are returned so that batches of 10 are together,
        and these batches of 10 correspond to the non-cropped indices

        Returns a (train, test) tuple.
        '''
        if is_cropped:
            return np.take(data, self.train_crop_idxs, 0), np.take(data, self.test_crop_idxs, 0)
        else:
            return np.take(data, self.train_idxs, 0), np.take(data, self.test_idxs, 0)

## Save and Load

In [4]:
%%time
# Feature extraction: computes every feature set from the audio under genres/
# and writes it as CSV to data/.  Run once!!  The recorded run took roughly
# 16 minutes of wall time; afterwards use the loading cell below instead.
# Order matters: each *_crop saver asserts that its base feature
# (X_mfcc / X_chroma) has already been computed on this DataLoader.
dl = DataLoader()
dl.save_mfcc()
dl.save_mfcc_fixed_crop()
dl.save_mfcc_random_crop()
dl.save_chroma()
dl.save_chroma_fixed_crop()
dl.save_chroma_random_crop()

CPU times: user 43min, sys: 4min 59s, total: 47min 59s
Wall time: 15min 51s


In [4]:
%%time
# Load the cached features from the CSVs in data/ (written by the save_* methods)
# and build the label vectors.
# A new DataLoader() reseeds NumPy with the same seed, so the train/test
# partition it holds is the same on every run.
dl = DataLoader()
dl.load_mfcc()
dl.load_mfcc_fixed_crop()
dl.load_mfcc_random_crop()
dl.load_chroma()
dl.load_chroma_fixed_crop()
dl.load_chroma_random_crop()
dl.load_Y()
dl.load_Y_crop()

CPU times: user 4min 42s, sys: 16.5 s, total: 4min 59s
Wall time: 6min 10s


In [5]:
# Sanity check: shapes of every loaded feature tensor (full, random crop, fixed crop).
(
    dl.X_mfcc.shape,
    dl.X_mfcc_random_crop.shape,
    dl.X_mfcc_fixed_crop.shape,
    dl.X_chroma.shape,
    dl.X_chroma_random_crop.shape,
    dl.X_chroma_fixed_crop.shape,
)

((1000, 13, 1290),
 (10000, 13, 1290),
 (10000, 13, 129),
 (1000, 12, 1290),
 (10000, 12, 1290),
 (10000, 12, 129))

In [6]:
# Sanity check: one label per track and one label per crop.
(dl.Y.shape, dl.Y_crop.shape)

((1000,), (10000,))

## Training and Test sets for each part

In [7]:
# Whole-track ("vanilla") experiment:
# fit on X_train (750 tracks), evaluate on X_test (250 tracks).
print('Train and test the vanilla model\n')

# Partition both feature families and the labels with the loader's fixed split.
X_mfcc_train, X_mfcc_test = dl.train_test_split(dl.X_mfcc, is_cropped=False)
X_chroma_train, X_chroma_test = dl.train_test_split(dl.X_chroma, is_cropped=False)
Y_train, Y_test = dl.train_test_split(dl.Y, is_cropped=False)

# Join MFCC and chroma along axis 1 so each frame carries both feature sets.
X_train = np.concatenate((X_mfcc_train, X_chroma_train), axis=1)
X_test = np.concatenate((X_mfcc_test, X_chroma_test), axis=1)

print(f'Train size:        {X_train.shape}')
print(f'Test size:         {X_test.shape}')
print(f'Train labels size: {Y_train.shape}')
print(f'Test labels size:  {Y_test.shape}')

Train and test the vanilla model

Train size:        (750, 25, 1290)
Test size:         (250, 25, 1290)
Train labels size: (750,)
Test labels size:  (250,)


In [8]:
# Segment-level experiment:
# fit on X_fixed_crop_train (7500 segments), predict X_fixed_crop_test (2500 segments),
# then aggregate the segment predictions into one prediction per test track (250).
print('Train on segments, test by accumulating votes of segments\n')

# Split the fixed crops of both feature families, plus the per-crop labels.
X_mfcc_fixed_crop_train, X_mfcc_fixed_crop_test = dl.train_test_split(dl.X_mfcc_fixed_crop, is_cropped=True)
X_chroma_fixed_crop_train, X_chroma_fixed_crop_test = dl.train_test_split(dl.X_chroma_fixed_crop, is_cropped=True)
# Only the training crop labels are kept; the test labels reported are the per-track Y_test.
Y_crop_train, _ = dl.train_test_split(dl.Y_crop, is_cropped=True)

# Join MFCC and chroma along axis 1.
X_fixed_crop_train = np.concatenate((X_mfcc_fixed_crop_train, X_chroma_fixed_crop_train), axis=1)
X_fixed_crop_test = np.concatenate((X_mfcc_fixed_crop_test, X_chroma_fixed_crop_test), axis=1)

print(f'Train size:        {X_fixed_crop_train.shape}')
print(f'Test size:         {X_fixed_crop_test.shape}')
print(f'Train labels size: {Y_crop_train.shape}')
print(f'Test labels size:  {Y_test.shape}')

Train on segments, test by accumulating votes of segments

Train size:        (7500, 25, 129)
Test size:         (2500, 25, 129)
Train labels size: (7500,)
Test labels size:  (250,)


In [9]:
# Augmentation experiment:
# fit on X_random_crop_train (7500 zero-padded random crops),
# evaluate on the whole-track X_test (250 tracks).
print('Train on padded random crops, test on vanilla test\n')

# Only the training halves of the random crops are used here.
X_mfcc_random_crop_train, _ = dl.train_test_split(dl.X_mfcc_random_crop, is_cropped=True)
X_chroma_random_crop_train, _ = dl.train_test_split(dl.X_chroma_random_crop, is_cropped=True)
Y_crop_train, Y_crop_test = dl.train_test_split(dl.Y_crop, is_cropped=True)

# Join MFCC and chroma along axis 1.
X_random_crop_train = np.concatenate((X_mfcc_random_crop_train, X_chroma_random_crop_train), axis=1)

print(f'Train size:        {X_random_crop_train.shape}')
print(f'Test size:         {X_test.shape}')
print(f'Train labels size: {Y_crop_train.shape}')
print(f'Test labels size:  {Y_test.shape}')

Train on padded random crops, test on vanilla test

Train size:        (7500, 25, 1290)
Test size:         (250, 25, 1290)
Train labels size: (7500,)
Test labels size:  (250,)
