Avant toute chose, aller dans Exécution>Modifier le type d'exécution>Accélérateur matériel>GPU

# 1. Construction d'un spectrogramme

## 1.1. Pre Processing

Le son en question : https://soundcloud.com/cl-ment-gousseau-1/pianoscale (quelques notes de piano)

In [0]:
# Download the track from SoundCloud with the scdl command-line tool
# (a leading "!" runs the line as a shell command from the notebook).
!pip install scdl 
!scdl -l https://soundcloud.com/cl-ment-gousseau-1/pianoscale
  
# Convert the MP3 to WAV with ffmpeg: 160k audio bitrate, 2 channels,
# 44100 Hz sampling rate, no video stream (-vn).
# NOTE(review): assumes scdl saved the track as "piano.mp3" — confirm.
import subprocess
command = "ffmpeg -i piano.mp3 -ab 160k -ac 2 -ar 44100 -vn piano.wav"
subprocess.call(command, shell=True)

In [0]:
# load the .wav file into a NumPy array (resampled, mono); no .npy file is written
import numpy as np
!pip install soundfile 
import soundfile as sf
!pip install librosa
import librosa

def get_sound_data(path, sr=16000):
    """Load an audio file and return it as a mono signal sampled at ``sr``.

    Parameters
    ----------
    path : str
        Path of the audio file, read with ``soundfile.read``.
    sr : int, optional
        Target sampling rate in Hz (default 16000).

    Returns
    -------
    tuple
        ``(signal, sr)``: the resampled 1-D signal (channels are averaged
        when the file has more than one) and the target sampling rate
        that was passed in.
    """
    data, fsr = sf.read(path)
    # The data is transposed so that resampling runs along the last axis,
    # which is the axis librosa resamples by default.
    # The rates are passed by keyword: recent librosa releases made
    # ``orig_sr`` and ``target_sr`` keyword-only, while older releases
    # accept the same keywords.
    data_resample = librosa.resample(data.T, orig_sr=fsr, target_sr=sr)
    if data_resample.ndim > 1:
        # Down-mix to mono by averaging the channels.
        data_resample = np.average(data_resample, axis=0)
    return data_resample, sr

# Load the converted piano recording, using the default target rate of get_sound_data.
path='piano.wav'
signal, sampling_rate = get_sound_data(path)

In [0]:
import matplotlib.pyplot as plt
# Print the sampling rate, then plot the raw waveform.
print(f'sampling rate = {sampling_rate} Hz')
plt.plot(signal)
plt.xlabel('time')
plt.ylabel('amplitude')

## 1.2. Construction des spectrogrammes

Calculer les melspectrogrammes avec la librairie librosa : 

* voir https://librosa.org/doc/latest/generated/librosa.feature.melspectrogram.html

In [0]:
# Exercise placeholder: replace the ellipsis with the mel spectrogram
# computed with librosa.
melspec = ... # to be completed

# Show the spectrogram as an image with a colour bar.
plt.imshow(melspec)
plt.xlabel('time')
plt.ylabel('frequency')
plt.colorbar()
plt.show()

Utiliser une échelle log pour les amplitudes :
* voir https://librosa.org/doc/latest/generated/librosa.amplitude_to_db.html


In [0]:
# Exercise placeholder: replace the ellipsis with the log-scaled
# (decibel) version of the mel spectrogram.
logmelspec = ... # to be completed

# Show the log-scaled spectrogram as an image with a colour bar.
plt.imshow(logmelspec)
plt.xlabel('time')
plt.ylabel('frequency')
plt.colorbar()
plt.show()

Renverser l'image pour que les basses fréquences soient en bas de l'image :

In [0]:
# Exercise placeholder: replace the ellipsis with the vertically flipped
# log-mel spectrogram, so that low frequencies end up at the bottom of the image.
logmelspec = ... # to be completed

# Show the flipped spectrogram as an image with a colour bar.
plt.imshow(logmelspec)
plt.xlabel('time')
plt.ylabel('frequency')
plt.colorbar()
plt.show()

# 2. Classification

## 2.1. Importation des données

Ce sont des sons de la base de données UrbanSound8K : 

Il y a 8732 sons de moins de 4 secondes appartenant aux classes :
* 0 = air_conditioner
* 1 = car_horn
* 2 = children_playing
* 3 = dog_bark
* 4 = drilling (=forage)
* 5 = engine_idling
* 6 = gun_shot
* 7 = jackhammer (=marteau-piqueur)
* 8 = siren
* 9 = street_music

Chaque son appartient à une et une seule classe.

In [0]:
# To load a subset of the dataset: download the fold1 archive and unpack it
# in the working directory (a leading "!" runs the line as a shell command).
!wget  https://zenodo.org/record/3354445/files/fold1.zip
!unzip fold1.zip
  
# To load the full dataset instead, uncomment the two commands below
# (download the tarball, then decompress and extract it).
#!wget https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz
#!gzip -d < UrbanSound8K.tar.gz | tar xvf -

## 2.2. Calcul des logmel-spectrogrammes

Les fichiers audios sont découpés en fenêtres de 1s. Le logmel-spectrogramme est calculé pour chaque fenêtre.

In [0]:
def windows(data, window_size):
    """Yield consecutive, non-overlapping ``(start, end)`` index pairs over ``data``.

    Windows start at 0 and advance by ``window_size`` while the start index
    is inside ``data``. The last pair may extend past ``len(data)``; callers
    that need full windows must check the length of the slice themselves.
    Both bounds are truncated to ``int``, so a fractional ``window_size``
    is accepted.

    Parameters
    ----------
    data : sized
        Any object supporting ``len()``.
    window_size : int or float
        Step between window starts; must be strictly positive.

    Yields
    ------
    tuple of int
        ``(start, end)`` with ``end = start + window_size``.

    Raises
    ------
    ValueError
        If ``window_size`` is not strictly positive (raised when iteration
        starts, since this is a generator). Without this check the loop
        would never terminate.
    """
    if window_size <= 0:
        raise ValueError('window_size must be strictly positive')
    start = 0
    while start < len(data):
        yield int(start), int(start + window_size)
        start += window_size
        
def compute_logmelspec(file_names, bands=64, hop_length=161, n_fft=1024):
    """Compute log-mel spectrograms over 1-second windows of labelled audio files.

    Each file is loaded with ``get_sound_data`` at 16 kHz and cut into
    consecutive, non-overlapping windows of 16000 samples; a trailing window
    shorter than that is dropped. The class label of a file is the second
    ``-``-separated field of its base name, converted to ``int``.

    Parameters
    ----------
    file_names : sequence of str
        Paths of the audio files to process.
    bands : int, optional
        Number of mel bands (``n_mels``) of each spectrogram.
    hop_length : int, optional
        Hop length, in samples, passed to librosa.
    n_fft : int, optional
        FFT size passed to librosa.

    Returns
    -------
    tuple of numpy.ndarray
        ``(spectrograms, labels)``. ``spectrograms`` has shape
        ``(n_windows, bands, n_frames, 1)`` and is flipped along the mel
        axis; ``labels`` is an integer array with one entry per window.

    Raises
    ------
    ValueError
        If no full window could be extracted from ``file_names``.
    """
    import os

    window_size = 16000  # one second of audio at the 16 kHz loading rate
    log_specgrams_full = []
    class_labels = []
    nfiles = len(file_names)

    # for each audio sample
    for index, fn in enumerate(file_names):
        # Progress report every 50 files.
        if index % 50 == 0:
            print(str(np.round(100*float(index)/nfiles))+str('% completed'))

        # Take the base name whatever the path separator is ('/' or '\\').
        file_name = os.path.basename(fn.replace('\\', '/'))
        class_label = int(file_name.split('-')[1])
        sound_data, sr = get_sound_data(fn, sr=16000)

        # for each audio signal sub-sample window of data
        for start, end in windows(sound_data, window_size):
            segment = sound_data[start:end]
            if len(segment) != window_size:
                # Incomplete trailing window: skipped.
                continue
            # Signal and sampling rate are passed by keyword: recent librosa
            # releases require it, and without ``sr`` the mel filter bank
            # would be built for librosa's 22050 Hz default instead of the
            # actual rate of the signal.
            melspec_full = librosa.feature.melspectrogram(
                y=segment, sr=sr, n_mels=bands,
                hop_length=hop_length, n_fft=n_fft)
            # NOTE(review): melspectrogram returns a power spectrogram by
            # default, for which librosa.power_to_db is the matching
            # conversion; amplitude_to_db is kept here to stay consistent
            # with the rest of the tutorial — confirm which is intended.
            logspec_full = librosa.amplitude_to_db(melspec_full)

            log_specgrams_full.append(logspec_full)
            class_labels.append(class_label)

    if not log_specgrams_full:
        raise ValueError(
            'no full window of %d samples found in the given files' % window_size)

    # Stack to (n_windows, bands, n_frames) and add a trailing channel axis.
    log_specgrams_full = np.asarray(log_specgrams_full)[..., np.newaxis]
    # Reverse the mel axis so that low frequencies are at the bottom when
    # the array is displayed as an image.
    log_specgrams_full = np.flip(log_specgrams_full, axis=1)

    # The builtin ``int`` replaces the ``np.int`` alias removed from NumPy.
    return np.array(log_specgrams_full), np.array(class_labels, dtype=int)

Pour que cela prenne moins de temps, on ne réalise cela que sur un sous-ensemble du jeu de données: fold1

In [0]:
import glob

# Subset: every entry of the fold1 folder in the working directory.
files = glob.glob('fold1/*')

# Full dataset alternative: uncomment to read fold1 from the UrbanSound8K tree.
#files = glob.glob('UrbanSound8K/audio//fold1/*')

# Compute the spectrograms and their labels for the selected files.
logmelspec, labels = compute_logmelspec(files)

## 2.3. Visualisation de quelques spectrogrammes

In [0]:
# Human-readable class names, keyed by the class id written as a string.
class_map = {'0' : 'air_conditioner', '1' : 'car_horn', '2' : 'children_playing', '3' : 'dog_bark', '4' : 'drilling', 
             '5' : 'engine_idling', '6' : 'gun_shot', '7' : 'jackhammer', '8' : 'siren', '9' : 'street_music'}

# Sorted so that the column order is deterministic (a set has no guaranteed order).
categories = sorted(set(labels))
n_examples = 5
# For each class present in the data, the indices of its first examples
# (fewer than n_examples when the class has fewer windows).
sample_idxs = [np.where(labels == label_id)[0][:n_examples] for label_id in categories]

# One column per class actually present, one row per example. Titles use the
# class id itself rather than the column index, so they stay correct when a
# class is missing from the loaded subset.
n_cols = len(categories)
plt.figure(figsize=(24, 10))
for col, (label_id, idxs) in enumerate(zip(categories, sample_idxs)):
    for row, idx in enumerate(idxs):
        plt.subplot(n_examples, n_cols, 1 + col + n_cols*row)
        plt.imshow(logmelspec[idx,:,:,0], cmap='viridis')
        plt.title(class_map[str(label_id)])
plt.tight_layout()

## 2.4. Création d'un réseau de neurones (de type VGGish)

In [0]:
from keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from keras.models import Model

def create_model(input_shape=(64, 100, 1), num_classes=10):
    """Build a small convolutional classifier for spectrogram images.

    Architecture: batch normalisation of the input, one 3x3 convolution
    with 64 filters and ReLU, 2x2 max pooling, flatten, then a softmax
    dense layer.

    Parameters
    ----------
    input_shape : tuple of int, optional
        Shape of one input sample; defaults to ``(64, 100, 1)``.
    num_classes : int, optional
        Number of units of the softmax output layer; defaults to 10.

    Returns
    -------
    keras.models.Model
        The uncompiled model, named ``'model'``.
    """
    img_input = Input(shape=input_shape)

    # Normalise the raw inputs before the first convolution.
    x = BatchNormalization()(img_input)

    x = Conv2D(64, (3, 3), activation='relu', padding='same', name='conv1')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='pool1')(x)

    x = Flatten()(x)
    x = Dense(num_classes, activation='softmax', name='fc2')(x)

    model = Model(img_input, x, name='model')

    return model

## 2.5. One-hot encoding + Création d'un jeu d'entraînement, validation, test

In [0]:
# One-hot encoding
from keras.utils import to_categorical
# The width is fixed to the 10 classes of the dataset so that the targets
# always match the 10-unit softmax output of the model, even when the highest
# class id is absent from the loaded subset.
y=to_categorical(labels, num_classes=10)

# Test: 10% of the data; Train: 80% of the remaining 90%; Validation: 20% of the remaining 90%
from sklearn.model_selection import train_test_split
xtrainval, xtest, ytrainval, ytest = train_test_split(logmelspec, y, test_size=0.1)
xtrain, xval, ytrain, yval = train_test_split(xtrainval, ytrainval, test_size=0.2)

In [0]:
# Display the shape of the training inputs.
xtrain.shape

## 2.6. Entraînement

In [0]:
# Build the network and print its layer summary.
model=create_model()

model.summary()

from keras.optimizers import Adam
# `learning_rate` is the current name of the optimizer argument; the former
# `lr` alias is deprecated and no longer accepted by recent Keras releases.
model.compile(loss='categorical_crossentropy',optimizer=Adam(learning_rate=0.0001),metrics=['accuracy'])
# Train for 50 epochs, monitoring the validation set after each epoch.
history = model.fit(xtrain,ytrain,validation_data=(xval,yval),batch_size=32,epochs=50,verbose=1)

## 2.7. Visualisation des courbes d'apprentissage

In [0]:
# Two side-by-side panels: accuracy on the left, loss on the right.
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
t = f.suptitle('Deep Neural Net Performance', fontsize=12)
f.subplots_adjust(top=0.85, wspace=0.2)

# Keras stores the accuracy under 'acc' in older releases and under
# 'accuracy' in newer ones; pick whichever key is present.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
# Epoch numbers start at 1 and follow the number of epochs actually recorded.
epochs = list(range(1, len(history.history['loss']) + 1))

ax1.plot(epochs, history.history[acc_key], label='Train Accuracy')
ax1.plot(epochs, history.history['val_' + acc_key], label='Validation Accuracy')
ax1.set_ylabel('Accuracy Value')
ax1.set_xlabel('Epoch')
ax1.set_title('Accuracy')
ax1.grid(True)
l1 = ax1.legend(loc="best")

ax2.plot(epochs, history.history['loss'], label='Train Loss')
ax2.plot(epochs, history.history['val_loss'], label='Validation Loss')
ax2.set_ylabel('Loss Value')
ax2.set_xlabel('Epoch')
ax2.set_title('Loss')
ax2.grid(True)
l2 = ax2.legend(loc="best")

## 2.8. Résultats et matrices de confusion

In [0]:
# Evaluate on the held-out test set; the first returned value is discarded
# and the second is reported as the accuracy.
_, acc = model.evaluate(xtest, ytest)
print(f'accuracy = {acc}')

In [0]:
# Class scores predicted for every test sample.
predictions=model.predict(xtest)

from sklearn.metrics import confusion_matrix

# Named class_names (not `labels`) so that the integer label array used by
# the earlier cells is not overwritten when this cell runs.
class_names=['air_conditioner','car_horn','children_playing','dog_bark','drilling','engine_idling','gun_shot','jackhammer','siren','street_music']

# The label set is given explicitly so the matrix always has one row and one
# column per class, matching the tick labels even when a class is absent from
# the test split.
m=confusion_matrix(ytest.argmax(axis=1), predictions.argmax(axis=1), labels=np.arange(len(class_names)))
# Normalise each row (true class) by its number of samples, so every row sums
# to 1; rows without any sample are left at 0 instead of producing NaN.
row_totals=m.sum(axis=1, keepdims=True)
cm=np.divide(m, row_totals, out=np.zeros(m.shape, dtype=float), where=row_totals!=0)

fig, ax = plt.subplots(figsize=(10,10))

im = ax.imshow(cm)
ax.figure.colorbar(im, ax=ax)

ax.set(xticks=np.arange(cm.shape[1]),yticks=np.arange(cm.shape[0]),xticklabels=class_names, yticklabels=class_names,title='normalized confusion matrix',ylabel='True label',xlabel='Predicted label')
# Rotate the x tick labels so the class names do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",rotation_mode="anchor")

plt.show()

## 2.9. Merci