# Extracción y descripción de los datos

Empiezo importando las librerías que voy a utilizar.

In [2]:
%matplotlib inline

import numpy as np
import pandas as pd

import librosa.display
import librosa

from glob import glob

import ffmpeg

import os

from pydub import AudioSegment
from pydub.utils import make_chunks

import re

Continuo creando 3 funciones con las que voy a hacer los siguiente:
 - La primera divide un archivo audio de larga duración en pequeños audios de 10 segundos de duración, y los guarda en la carpeta que le he especificado.
 - La segunda accede a cada uno de esos audios de 10 segundos y saca las features características de dicho archivo, y en base a su nombre le asigna una categoría y otra.
 - La tercera simplemente me enseña las principales características y métricas de la base de datos.

Finalmente, guardo el el dataset en mi carpeta como un CSV.

Dado que la base de datos ya la tengo creada, este JN lo voy a aplicar sobre audios que luego voy a utilizar para predecir con mi modelo y ver si es lo suficientemente bueno.

In [3]:
def decompose_files(data_dir, audio_files):
    for j in range(len(audio_files)):
        myaudio = AudioSegment.from_file(data_dir + '{}'.format(os.listdir(data_dir)[j])) 
        chunk_length_ms = 10000
        chunks = make_chunks(myaudio, chunk_length_ms)

        for i, chunk in enumerate(chunks):
            chunk_name = "{}{}.wav".format(os.listdir(data_dir)[j], i)
            print("exporting", chunk_name)
            chunk.export('./sounds2/{}'.format(chunk_name), format="mp3")
            
decompose_files('./sounds/', glob('./sounds' + '/*'))

In [5]:
def get_features(data_dir, audio_files):
    momentos = {'ducha': 'Ducha', 'cena': 'Cena', 'washing': 'Lavadora', 'vacuum': 'Aspiradora', 
                'shaver': 'Afeitadora', 'hairdryer': 'Secador_pelo', 'airconditioner': 'Aire_acondicionado', 
                'cellphone': 'Telefono', 'comp': 'Tecleo', 'silence': 'Silencio', 'dryer': 'Secadora', 
                'blender': 'Licuadora', 'doorbell': 'Timbre', 'alarm': 'Alarma', 'faucet': 'Grifo', 
                'microwave': 'Microondas'}
    features = []
    for i in range(len(audio_files)):
        y, sr = librosa.load(audio_files[i])
        name = os.listdir(data_dir)[i]
        pattern = "[._][\w]+"
        name = re.sub(pattern, '', name)
        if name in momentos.keys():
            momento = momentos.get(name)
        else:
            momento = 'Otro'
        mfcc = np.ndarray.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20))
        scem = np.ndarray.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
        scom = np.ndarray.mean(librosa.feature.spectral_contrast(S=np.abs(librosa.stft(y)), sr=sr))
        srom = np.ndarray.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
        sbwm = np.ndarray.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
        tempo = librosa.beat.tempo(onset_envelope=librosa.onset.onset_strength(y=y, sr=sr, hop_length=512), sr=sr, hop_length=512)[0]
        rmse = np.ndarray.mean(librosa.feature.rms(y=y))
        features.append([mfcc, scem, scom, srom, sbwm, tempo, rmse, momento])
    return pd.DataFrame(features, columns=['mfcc', 'scem','scom', 'srom','sbwm', 'tempo', 'rmse', 'momento'])

data = get_features('./sounds2/', glob('./sounds2' + '/*.wav'))

In [13]:
def show_df_info(dataframe):
    display("The type of the dataframe is {}.".format(type(dataframe)))
    print("------------------------------------------------------------------------")
    display("The amount of entries are {}.".format(dataframe.size))
    print("------------------------------------------------------------------------")
    display("Its shape is {}.".format(dataframe.shape))
    print("------------------------------------------------------------------------")
    display("Its features are: {}.".format(list(dataframe)))
    print("------------------------------------------------------------------------")
    print("The data types of columns are: {}".format(dataframe.dtypes))
    print("------------------------------------------------------------------------")
    print("The counter of each class is: {}".format(dataframe.momento.value_counts()))
    print("------------------------------------------------------------------------")
    print('The amount of null values is: {}.'.format(dataframe.isna().sum()))
    display(dataframe.head())
    display(dataframe.describe())
    
show_df_info(data)

"The type of the dataframe is <class 'pandas.core.frame.DataFrame'>."

------------------------------------------------------------------------


'The amount of entries are 152.'

------------------------------------------------------------------------


'Its shape is (19, 8).'

------------------------------------------------------------------------


"Its features are: ['mfcc', 'scem', 'scom', 'srom', 'sbwm', 'tempo', 'rmse', 'momento']."

------------------------------------------------------------------------
The data types of columns are: mfcc       float64
scem       float64
scom       float64
srom       float64
sbwm       float64
tempo      float64
rmse       float64
momento     object
dtype: object
------------------------------------------------------------------------
The counter of each class is: Secadora              2
Cena                  2
Ducha                 2
Microondas            1
Licuadora             1
Telefono              1
Grifo                 1
Timbre                1
Aspiradora            1
Tecleo                1
Silencio              1
Afeitadora            1
Aire_acondicionado    1
Lavadora              1
Secador_pelo          1
Alarma                1
Name: momento, dtype: int64
------------------------------------------------------------------------
The amount of null values is: mfcc       0
scem       0
scom       0
srom       0
sbwm       0
tempo      0
rmse       0
momento    0
dtype: i

Unnamed: 0,mfcc,scem,scom,srom,sbwm,tempo,rmse,momento
0,-5.617919,4822.729698,17.012424,8490.626926,2922.254333,129.199219,0.056099,Grifo
1,-2.063152,452.804643,17.91553,811.142272,535.833205,129.199219,0.084744,Aire_acondicionado
2,-25.202837,1554.105964,27.932041,3095.535342,1372.209905,151.999081,0.035863,Cena
3,4.669409,669.271532,16.381503,727.032881,1716.868434,117.453835,0.143639,Secadora
4,-1.508489,1186.695657,18.275804,2155.318754,2039.238339,107.666016,0.149006,Secadora


Unnamed: 0,mfcc,scem,scom,srom,sbwm,tempo,rmse
count,19.0,19.0,19.0,19.0,19.0,19.0,19.0
mean,-10.576888,2630.655752,20.145336,4866.697177,2147.428201,123.698443,0.083227
std,10.589944,1616.479853,4.110418,2870.345198,782.515287,9.527361,0.058889
min,-27.696898,452.804643,15.264916,727.032881,535.833205,107.666016,0.000652
25%,-20.463392,1307.959612,16.838389,2449.976407,1511.457576,117.453835,0.034423
50%,-9.622233,2717.740563,18.929312,5360.468591,2341.296911,123.046875,0.074781
75%,-2.979991,3366.125937,21.761841,7160.014864,2745.517833,129.199219,0.131004
max,8.228639,5980.407268,28.182421,9110.993012,3325.945286,151.999081,0.21359


In [11]:
data.to_csv('data_prueba.csv', index=False)