This notebook contains some ideas for making the preprocessing and feature extraction pipeline easier to reason about.

# Imports

Set sklearn to output component diagram visualization

In [1]:
from sklearn import set_config; set_config(display='diagram')

Import the pipeline building classes from sklearn

In [29]:
# from sklearn.pipeline import make_pipeline
# Instead of make_pipeline, we can use the Pipeline class directly.
# It involves a bit more typing, but we can name each step, and understand the params dictionary more easily
from sklearn.pipeline import Pipeline

# from sklearn.pipeline import make_union
# I'm not sure if we need make_union

from sklearn.compose import make_column_transformer

# from sklearn.compose import make_column_selector
# We are applying the same transforms to the entire dataset,
# so I don't think we need a column selector

from sklearn.preprocessing import FunctionTransformer
# for wrapping the stft, mel filterbank and bandpass filter, mfcc

from sklearn.preprocessing import StandardScaler
# for scaling the features, not sure in which stage to apply this

from sklearn.linear_model import LogisticRegression
# baseline classifier used as the final step of the model pipeline

from sklearn.model_selection import GridSearchCV
# for searching over the MFCC extraction parameters

from scipy.signal import stft 
# for taking the Short Time Fourier Transform of the input signals 
# If we end up doing the scaling after computing the mfcc, we don't 
# need this because we can just pass the sample timeseries directly to 
# the mfcc function

# from scipy.signal import butter
# for bandpass filtering the signal
# In the end we don't need this because we can pass 
# fmin= and fmax= keyword arguments to the mfcc function

from librosa.feature import mfcc, melspectrogram
from librosa.core import power_to_db

import numpy as np

from us8kdata.loader import UrbanSound8K

# The input data
The input to the pipeline will be the sample arrays received from `us8kdata.loader.UrbanSound8K`

```
   data = UrbanSound8K('/path/to/data_root')
   training_folds = [1,2,3,4,5,6,7]
   val_folds = [8, 9]
   test_fold = [10]
   X_train = [samples for samples in data.fold_audio_generator(fold=training_folds)]
   len(X_train)
   # n_files_in_folds; each element is the array of samples of one file
   
```


In [30]:
data = UrbanSound8K('../hasp/data')

In [89]:
# Materialise the audio clips of each split as a list (one entry per file).
# Only folds 1 and 2 are loaded for training here; folds 3, 6, 7, 8, 9 and 10
# are currently left out.
train_fold = list(data.fold_audio_generator(fold=[1, 2]))
val_fold = list(data.fold_audio_generator(fold=[4]))
test_fold = list(data.fold_audio_generator(fold=[5]))


In [91]:
# Class labels for each split.
# The fold list of every label vector must be the same as the one used to load
# the corresponding audio: train_fold is built from folds 1 and 2 only, so the
# labels are restricted to those folds as well. Otherwise X and y have a
# different number of rows and fitting fails.
# NOTE(review): assumes filter_metadata returns rows in the same order in which
# fold_audio_generator yields the clips -- confirm against us8kdata.
train_y = data.filter_metadata(fold=[1, 2]).classID
val_y = data.filter_metadata(fold=[4]).classID
test_y = data.filter_metadata(fold=[5]).classID

# MFCC
The librosa MFCC function accepts either an array of audio samples via the keyword argument `y=`, or a precomputed log-power mel spectrogram via the keyword argument `S=`.
However, a pipeline step receives its input as a single positional argument and has no way of specifying which keyword to use, so we must define a wrapper function that forwards the input under the right keyword (`y=` here).

In [108]:
def samples_to_mean_mfcc(examples, sr=16000, n_fft=512, hop_length=128, fmin=0.0, fmax=8000, n_mels=100, **kwargs):
    """Reduce each audio clip to its time-averaged MFCC vector.

    Parameters
    ----------
    examples : iterable of 1-D arrays
        One array of audio samples per clip.
    sr : int
        Sampling rate handed to ``librosa.feature.mfcc``.
    n_fft : int
        FFT window size.
    hop_length : int
        Number of samples between successive frames.
    fmin, fmax : float or None
        Frequency limits of the mel filterbank.
    n_mels : int
        Number of mel bands.
    **kwargs
        Any further keyword arguments for ``librosa.feature.mfcc``
        (for example ``n_mfcc``).

    Returns
    -------
    numpy.ndarray of float32
        One row per clip. Each row is the MFCC matrix of the clip averaged
        over ``axis=1`` (the frame axis in librosa's output layout).
    """
    # Every tunable parameter is forwarded to librosa. Hard-coding any of them
    # in this call would silently turn a grid search over that parameter into
    # a no-op.
    features = [
        mfcc(y=sample, sr=sr, n_fft=n_fft, n_mels=n_mels,
             hop_length=hop_length, fmin=fmin, fmax=fmax, **kwargs).mean(axis=1)
        for sample in examples
    ]
    return np.array(features, dtype=np.float32)

In [109]:
# MFCC feature extractor (librosa). The kw_args dictionary is passed to
# samples_to_mean_mfcc as keyword arguments; n_mfcc is the number of coefficients.
mean_mfcc_feat = FunctionTransformer(
    func=samples_to_mean_mfcc,
    kw_args=dict(sr=16000, n_mfcc=20, n_fft=512, hop_length=128, fmin=0.0, fmax=None),
)

In [62]:
mean_mfcc_feat

In [63]:
mean_mfcc_feat.get_params()

{'accept_sparse': False,
 'check_inverse': True,
 'feature_names_out': None,
 'func': <function __main__.samples_to_mean_mfcc(examples, sr=16000, n_fft=512, hop_length=128, fmin=0.0, fmax=None, **kwargs)>,
 'inv_kw_args': None,
 'inverse_func': None,
 'kw_args': {'sr': 16000,
  'n_mfcc': 20,
  'n_fft': 512,
  'hop_length': 128,
  'fmin': 0.0,
  'fmax': None},
 'validate': False}

# Compose the pipeline

In [110]:
# Feature-extraction stage: mean MFCC vector per clip, followed by scaling.
# Naming the steps makes the nested parameter keys readable
# (e.g. mean_mfcc__kw_args).
pipe = Pipeline(steps=[
    ('mean_mfcc', mean_mfcc_feat),
    ('scaler', StandardScaler()),
])

In [65]:
pipe.get_params()

{'memory': None,
 'steps': [('mean_mfcc',
   FunctionTransformer(func=<function samples_to_mean_mfcc at 0x12da4b700>,
                       kw_args={'fmax': None, 'fmin': 0.0, 'hop_length': 128,
                                'n_fft': 512, 'n_mfcc': 20, 'sr': 16000})),
  ('scaler', StandardScaler())],
 'verbose': False,
 'mean_mfcc': FunctionTransformer(func=<function samples_to_mean_mfcc at 0x12da4b700>,
                     kw_args={'fmax': None, 'fmin': 0.0, 'hop_length': 128,
                              'n_fft': 512, 'n_mfcc': 20, 'sr': 16000}),
 'scaler': StandardScaler(),
 'mean_mfcc__accept_sparse': False,
 'mean_mfcc__check_inverse': True,
 'mean_mfcc__feature_names_out': None,
 'mean_mfcc__func': <function __main__.samples_to_mean_mfcc(examples, sr=16000, n_fft=512, hop_length=128, fmin=0.0, fmax=None, **kwargs)>,
 'mean_mfcc__inv_kw_args': None,
 'mean_mfcc__inverse_func': None,
 'mean_mfcc__kw_args': {'sr': 16000,
  'n_mfcc': 20,
  'n_fft': 512,
  'hop_length': 128,
  'f

In [116]:
# Enumerate every kw_args dictionary to try for the MFCC step. The loop order
# is n_mfcc (outermost), n_fft, fmin, fmax (innermost).
# Further candidate values (fmin 50-400 Hz, fmax 3000/4000 Hz) are currently disabled.
mean_mfcc_grid = [
    {'sr': 16000, 'n_mfcc': n_mfcc, 'n_fft': n_fft,
     'hop_length': 128, 'fmin': fmin, 'fmax': fmax}
    for n_mfcc in range(13, 26)
    for n_fft in range(256, 2049, 256)
    for fmin in [0.0, 500.0]
    for fmax in [None, 5000.0]
]
len(mean_mfcc_grid)

416

In [112]:
# Full model: the feature-extraction pipeline feeding a logistic-regression
# classifier. Nesting `pipe` under the name 'preproc' gives parameter keys of
# the form preproc__<step>__<param>.
model_pipe = Pipeline(steps=[
    ('preproc', pipe),
    ('model', LogisticRegression()),
])
model_pipe

# Test the pipeline

In [77]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [120]:
# Grid search over the whole kw_args dictionary of the MFCC step: each
# candidate replaces the dictionary as a unit. Only the first grid entry is
# used here.
gs = GridSearchCV(
    estimator=model_pipe,
    param_grid={'preproc__mean_mfcc__kw_args': mean_mfcc_grid[:1]},
    cv=5,
    verbose=2,
    n_jobs=6,
)

In [119]:
%%time
gs.fit(train_fold, train_y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  return f(*args, **kwargs)
  return f(*args, **kwargs)


[CV] END preproc__mean_mfcc__kw_args={'sr': 16000, 'n_mfcc': 13, 'n_fft': 256, 'hop_length': 128, 'fmin': 0.0, 'fmax': None}; total time=  55.8s


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


[CV] END preproc__mean_mfcc__kw_args={'sr': 16000, 'n_mfcc': 13, 'n_fft': 256, 'hop_length': 128, 'fmin': 0.0, 'fmax': None}; total time= 1.0min
[CV] END preproc__mean_mfcc__kw_args={'sr': 16000, 'n_mfcc': 13, 'n_fft': 256, 'hop_length': 128, 'fmin': 0.0, 'fmax': 5000.0}; total time=  36.4s


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


[CV] END preproc__mean_mfcc__kw_args={'sr': 16000, 'n_mfcc': 13, 'n_fft': 256, 'hop_length': 128, 'fmin': 0.0, 'fmax': None}; total time= 1.1min
[CV] END preproc__mean_mfcc__kw_args={'sr': 16000, 'n_mfcc': 13, 'n_fft': 256, 'hop_length': 128, 'fmin': 0.0, 'fmax': 5000.0}; total time=  43.9s
[CV] END preproc__mean_mfcc__kw_args={'sr': 16000, 'n_mfcc': 13, 'n_fft': 256, 'hop_length': 128, 'fmin': 500.0, 'fmax': None}; total time=  48.3s
[CV] END preproc__mean_mfcc__kw_args={'sr': 16000, 'n_mfcc': 13, 'n_fft': 256, 'hop_length': 128, 'fmin': 500.0, 'fmax': 5000.0}; total time= 1.1min
[CV] END preproc__mean_mfcc__kw_args={'sr': 16000, 'n_mfcc': 13, 'n_fft': 256, 'hop_length': 128, 'fmin': 0.0, 'fmax': None}; total time=  47.6s
[CV] END preproc__mean_mfcc__kw_args={'sr': 16000, 'n_mfcc': 13, 'n_fft': 256, 'hop_length': 128, 'fmin': 0.0, 'fmax': 5000.0}; total time=  35.4s
[CV] END preproc__mean_mfcc__kw_args={'sr': 16000, 'n_mfcc': 13, 'n_fft': 256, 'hop_length': 128, 'fmin': 500.0, 'fmax':

KeyboardInterrupt: 

In [115]:
gs.best_score_

0.4316753358990377

In [13]:
pipe