Merge pull request #61 from bmcfee/feature-alignment

Normalized feature durations. Fixes #60.
bmcfee · Apr 5, 2017 · 6c2284b
2 parents 16807e4 + f99d696
commit 6c2284b
Show file tree

Hide file tree

Showing 6 changed files with 70 additions and 26 deletions.
diff --git a/pumpp/feature/base.py b/pumpp/feature/base.py
@@ -3,7 +3,7 @@
 '''Feature extraction base class'''
 
 import numpy as np
-import librosa
+from librosa import resample, time_to_frames
 
 from ..base import Scope
 from ..exceptions import ParameterError
@@ -38,7 +38,8 @@ def __init__(self, name, sr, hop_length, conv=None):
 
         if conv not in ('tf', 'th', 'channels_last', 'channels_first', None):
             raise ParameterError('conv="{}", must be one of '
-                                 '("channels_last", "tf", "channels_first", "th", None)'.format(conv))
+                                 '("channels_last", "tf", '
+                                 '"channels_first", "th", None)'.format(conv))
 
         self.sr = sr
         self.hop_length = hop_length
@@ -88,7 +89,7 @@ def transform(self, y, sr):
         transform_audio
         '''
         if sr != self.sr:
-            y = librosa.resample(y, sr, self.sr)
+            y = resample(y, sr, self.sr)
 
         return self.merge([self.transform_audio(y)])
 
@@ -144,3 +145,21 @@ def layers(self):
                            dtype=self.fields[key].dtype)
 
         return L
+
+    def n_frames(self, duration):
+        '''Get the number of frames for a given duration
+
+        Parameters
+        ----------
+        duration : number >= 0
+            The duration, in seconds
+
+        Returns
+        -------
+        n_frames : int >= 0
+            The number of frames at this extractor's sampling rate and
+            hop length
+        '''
+
+        return int(time_to_frames(duration, sr=self.sr,
+                                  hop_length=self.hop_length))
diff --git a/pumpp/feature/cqt.py b/pumpp/feature/cqt.py
@@ -2,7 +2,9 @@
 '''CQT features'''
 
 import numpy as np
-from librosa import cqt, magphase, note_to_hz, amplitude_to_db
+from librosa import cqt, magphase, note_to_hz
+from librosa import amplitude_to_db, get_duration
+from librosa.util import fix_length
 
 from .base import FeatureExtractor
 from ..exceptions import ParameterError
@@ -73,13 +75,16 @@ def transform_audio(self, y):
             data['phase']: np.ndarray, shape = mag.shape
                 The CQT phase
         '''
-        cqtm, phase = magphase(cqt(y=y,
-                                   sr=self.sr,
-                                   hop_length=self.hop_length,
-                                   fmin=self.fmin,
-                                   n_bins=(self.n_octaves *
-                                           self.over_sample * 12),
-                                   bins_per_octave=(self.over_sample * 12)))
+        n_frames = self.n_frames(get_duration(y=y, sr=self.sr))
+
+        C = cqt(y=y, sr=self.sr, hop_length=self.hop_length,
+                fmin=self.fmin,
+                n_bins=(self.n_octaves * self.over_sample * 12),
+                bins_per_octave=(self.over_sample * 12))
+
+        C = fix_length(C, n_frames)
+
+        cqtm, phase = magphase(C)
         if self.log:
             cqtm = amplitude_to_db(cqtm, ref=np.max)
 
@@ -242,14 +247,17 @@ def transform_audio(self, y):
         '''
         cqtm, phase = [], []
 
+        n_frames = self.n_frames(get_duration(y=y, sr=self.sr))
+
         for h in self.harmonics:
-            C, P = magphase(cqt(y=y,
-                                sr=self.sr,
-                                hop_length=self.hop_length,
-                                fmin=self.fmin * h,
-                                n_bins=(self.n_octaves *
-                                        self.over_sample * 12),
-                                bins_per_octave=(self.over_sample * 12)))
+            C = cqt(y=y, sr=self.sr, hop_length=self.hop_length,
+                    fmin=self.fmin * h,
+                    n_bins=(self.n_octaves * self.over_sample * 12),
+                    bins_per_octave=(self.over_sample * 12))
+
+            C = fix_length(C, n_frames)
+
+            C, P = magphase(C)
             if self.log:
                 C = amplitude_to_db(C, ref=np.max)
             cqtm.append(C)

diff --git a/pumpp/feature/fft.py b/pumpp/feature/fft.py
@@ -2,7 +2,9 @@
 """STFT feature extractors"""
 
 import numpy as np
-import librosa
+from librosa import stft, magphase
+from librosa import amplitude_to_db, get_duration
+from librosa.util import fix_length
 
 from .base import FeatureExtractor
 
@@ -63,12 +65,16 @@ def transform_audio(self, y):
             data['phase'] : np.ndarray, shape=(n_frames, 1 + n_fft//2)
                 STFT phase
         '''
-        mag, phase = librosa.magphase(librosa.stft(y,
-                                                   hop_length=self.hop_length,
-                                                   n_fft=self.n_fft,
-                                                   dtype=np.float32))
+        n_frames = self.n_frames(get_duration(y=y, sr=self.sr))
+
+        D = stft(y, hop_length=self.hop_length,
+                 n_fft=self.n_fft, dtype=np.float32)
+
+        D = fix_length(D, n_frames)
+
+        mag, phase = magphase(D)
         if self.log:
-            mag = librosa.amplitude_to_db(mag, ref=np.max)
+            mag = amplitude_to_db(mag, ref=np.max)
 
         return {'mag': mag.T[self.idx],
                 'phase': np.angle(phase.T)[self.idx]}

diff --git a/pumpp/feature/mel.py b/pumpp/feature/mel.py
@@ -3,6 +3,8 @@
 
 import numpy as np
 from librosa.feature import melspectrogram
+from librosa import amplitude_to_db, get_duration
+from librosa.util import fix_length
 
 from .base import FeatureExtractor
 
@@ -63,12 +65,17 @@ def transform_audio(self, y):
             data['mag'] : np.ndarray, shape=(n_frames, n_mels)
                 The Mel spectrogram
         '''
+        n_frames = self.n_frames(get_duration(y=y, sr=self.sr))
+
         mel = np.sqrt(melspectrogram(y=y, sr=self.sr,
                                      n_fft=self.n_fft,
                                      hop_length=self.hop_length,
                                      n_mels=self.n_mels,
                                      fmax=self.fmax)).astype(np.float32)
+
+        mel = fix_length(mel, n_frames)
+
         if self.log:
-            mel = librosa.amplitude_to_db(mel, ref=np.max)
+            mel = amplitude_to_db(mel, ref=np.max)
 
         return {'mag': mel.T[self.idx]}
diff --git a/pumpp/feature/rhythm.py b/pumpp/feature/rhythm.py
@@ -4,6 +4,8 @@
 import numpy as np
 from librosa import fmt
 from librosa.feature import tempogram
+from librosa import get_duration
+from librosa.util import fix_length
 
 from .base import FeatureExtractor
 
@@ -48,10 +50,13 @@ def transform_audio(self, y):
             data['tempogram'] : np.ndarray, shape=(n_frames, win_length)
                 The tempogram
         '''
+        n_frames = self.n_frames(get_duration(y=y, sr=self.sr))
+
         tgram = tempogram(y=y, sr=self.sr,
                           hop_length=self.hop_length,
                           win_length=self.win_length).astype(np.float32)
 
+        tgram = fix_length(tgram, n_frames)
         return {'tempogram': tgram.T[self.idx]}
 
 

diff --git a/pumpp/task/base.py b/pumpp/task/base.py
@@ -148,7 +148,6 @@ def encode_events(self, duration, events, values, dtype=np.bool):
         target : ndarray, shape=(n_frames, n_values)
         '''
 
-        # FIXME: support sparse encoding
         frames = time_to_frames(events, sr=self.sr,
                                 hop_length=self.hop_length)