add slaney normalization (pytorch#589)

* Add Slaney normalization. * Update the TorchScript consistency test. * Convert `norm` to a string for TorchScript compatibility. * Fix flake8 warnings. * Use a string as the default value of `norm`.
bhargavkathivarapu · May 19, 2020 · 4add6b1 · 4add6b1
1 parent cf528fd
commit 4add6b1
Show file tree

Hide file tree

Showing 3 changed files with 22 additions and 5 deletions.
diff --git a/test/test_librosa_compatibility.py b/test/test_librosa_compatibility.py
@@ -53,19 +53,20 @@ def test_griffinlim(self):
 
         torch.testing.assert_allclose(ta_out, lr_out, atol=5e-5, rtol=1e-5)
 
-    def _test_create_fb(self, n_mels=40, sample_rate=22050, n_fft=2048, fmin=0.0, fmax=8000.0):
+    def _test_create_fb(self, n_mels=40, sample_rate=22050, n_fft=2048, fmin=0.0, fmax=8000.0, norm=None):
         librosa_fb = librosa.filters.mel(sr=sample_rate,
                                          n_fft=n_fft,
                                          n_mels=n_mels,
                                          fmax=fmax,
                                          fmin=fmin,
                                          htk=True,
-                                         norm=None)
+                                         norm=norm)
         fb = F.create_fb_matrix(sample_rate=sample_rate,
                                 n_mels=n_mels,
                                 f_max=fmax,
                                 f_min=fmin,
-                                n_freqs=(n_fft // 2 + 1))
+                                n_freqs=(n_fft // 2 + 1),
+                                norm=norm)
 
         for i_mel_bank in range(n_mels):
             torch.testing.assert_allclose(fb[:, i_mel_bank], torch.tensor(librosa_fb[i_mel_bank]),
@@ -79,6 +80,12 @@ def test_create_fb(self):
         self._test_create_fb(n_mels=56, fmin=800.0, fmax=900.0)
         self._test_create_fb(n_mels=56, fmin=1900.0, fmax=900.0)
         self._test_create_fb(n_mels=10, fmin=1900.0, fmax=900.0)
+        self._test_create_fb(n_mels=128, sample_rate=44100, norm="slaney")
+        self._test_create_fb(n_mels=128, fmin=2000.0, fmax=5000.0, norm="slaney")
+        self._test_create_fb(n_mels=56, fmin=100.0, fmax=9000.0, norm="slaney")
+        self._test_create_fb(n_mels=56, fmin=800.0, fmax=900.0, norm="slaney")
+        self._test_create_fb(n_mels=56, fmin=1900.0, fmax=900.0, norm="slaney")
+        self._test_create_fb(n_mels=10, fmin=1900.0, fmax=900.0, norm="slaney")
 
     def test_amplitude_to_DB(self):
         spec = torch.rand((6, 201))

diff --git a/test/test_torchscript_consistency.py b/test/test_torchscript_consistency.py
@@ -96,7 +96,8 @@ def func(_):
             f_max = 20.0
             n_mels = 10
             sample_rate = 16000
-            return F.create_fb_matrix(n_stft, f_min, f_max, n_mels, sample_rate)
+            norm = ""
+            return F.create_fb_matrix(n_stft, f_min, f_max, n_mels, sample_rate, norm)
 
         dummy = torch.zeros(1, 1)
         self._assert_consistency(func, dummy)

diff --git a/torchaudio/functional.py b/torchaudio/functional.py
@@ -335,7 +335,8 @@ def create_fb_matrix(
         f_min: float,
         f_max: float,
         n_mels: int,
-        sample_rate: int
+        sample_rate: int,
+        norm: str = "",
 ) -> Tensor:
     r"""Create a frequency bin conversion matrix.
 
@@ -345,6 +346,8 @@ def create_fb_matrix(
         f_max (float): Maximum frequency (Hz)
         n_mels (int): Number of mel filterbanks
         sample_rate (int): Sample rate of the audio waveform
+        norm (str): If 'slaney', divide the triangular mel weights by the width of the mel band
+        (area normalization). (Default: '')
 
     Returns:
         Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_mels``)
@@ -372,6 +375,12 @@ def create_fb_matrix(
     down_slopes = (-1.0 * slopes[:, :-2]) / f_diff[:-1]  # (n_freqs, n_mels)
     up_slopes = slopes[:, 2:] / f_diff[1:]  # (n_freqs, n_mels)
     fb = torch.max(zero, torch.min(down_slopes, up_slopes))
+
+    if norm == "slaney":
+        # Slaney-style mel is scaled to be approx constant energy per channel
+        enorm = 2.0 / (f_pts[2:n_mels + 2] - f_pts[:n_mels])
+        fb *= enorm.unsqueeze(0)
+
     return fb