# Extracción y descripción de los datos

Empiezo importando las librerías que voy a utilizar.

In [2]:
%matplotlib inline

import numpy as np
import pandas as pd

import librosa.display
import librosa

from glob import glob

import ffmpeg

import os

from pydub import AudioSegment
from pydub.utils import make_chunks

import re

Continúo creando 3 funciones con las que voy a hacer lo siguiente:
 - La primera divide un archivo de audio de larga duración en pequeños audios de 10 segundos de duración, y los guarda en la carpeta que le he especificado.
 - La segunda accede a cada uno de esos audios de 10 segundos y extrae las features características de dicho archivo, y en base a su nombre le asigna una categoría u otra.
 - La tercera simplemente me enseña las principales características y métricas de la base de datos.

Finalmente, guardo el dataset en mi carpeta como un CSV.

Dado que la base de datos ya la tengo creada, este Jupyter Notebook (JN) lo voy a aplicar sobre audios que luego voy a utilizar para predecir con mi modelo y comprobar si es lo suficientemente bueno.

In [9]:
def decompose_files(data_dir, audio_files, output_dir='./audios/base/', chunk_length_ms=10000):
    """Split each audio file into fixed-length chunks and export them as WAV.

    Parameters
    ----------
    data_dir : str
        Directory that contains the source recordings.
    audio_files : sequence of str
        Paths (or bare file names) of the recordings to split. Only the
        base name of each entry is used; it is resolved inside ``data_dir``.
    output_dir : str, optional
        Directory where the chunks are written. It is created when missing.
    chunk_length_ms : int, optional
        Length of every chunk in milliseconds (the last chunk of a
        recording may be shorter).

    Returns
    -------
    None
        The function only writes files. Each chunk is stored as
        ``<source file name><chunk index>.wav`` inside ``output_dir``.
    """
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for audio_path in audio_files:
        # Take the name from the entry being processed instead of indexing
        # os.listdir(): the two listings are not guaranteed to share the
        # same order or even the same content.
        source_name = os.path.basename(audio_path)
        recording = AudioSegment.from_file(os.path.join(data_dir, source_name))

        for index, chunk in enumerate(make_chunks(recording, chunk_length_ms)):
            chunk_name = "{}{}.wav".format(source_name, index)
            # The container must match the ".wav" extension of the file name.
            chunk.export(os.path.join(output_dir, chunk_name), format="wav")
            
# Split every recording matched in ./audios/audios_base into 10-second chunks
# (the chunks are written to ./audios/base/).
decompose_files('./audios/audios_base/', glob('./audios/audios_base' + '/*'))

In [10]:
def _momento_from_path(audio_path, momentos):
    """Return the category label for an audio file path, or 'Otro' if unknown."""
    # Remove every "." or "_" followed by word characters (extensions, chunk
    # indices, name suffixes); what is left of the file name is the lookup key.
    key = re.sub(r"[._][\w]+", '', os.path.basename(audio_path))
    return momentos.get(key, 'Otro')


def get_features(data_dir, audio_files):
    """Build a table with one row of summary audio features per file.

    Parameters
    ----------
    data_dir : str
        Kept for backward compatibility; it is not read. The label of each
        row is derived from the name of the very file that was loaded.
    audio_files : sequence of str
        Paths of the audio files to load with ``librosa.load``.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``audio_files`` with the columns ``mfcc``,
        ``scem``, ``scom``, ``srom``, ``sbwm``, ``tempo``, ``rmse`` and
        ``momento``. Every numeric column except ``tempo`` is the mean of
        the corresponding librosa feature over the whole array; ``momento``
        is the category looked up from the file name ('Otro' when the name
        is not a known key).
    """
    momentos = {'ducha': 'Ducha', 'cena': 'Cena', 'washing': 'Lavadora', 'vacuum': 'Aspiradora',
                'shaver': 'Afeitadora', 'hairdryer': 'Secador_pelo', 'airconditioner': 'Aire_acondicionado',
                'cellphone': 'Telefono', 'comp': 'Tecleo', 'silence': 'Silencio', 'dryer': 'Secadora',
                'blender': 'Licuadora', 'doorbell': 'Timbre', 'alarm': 'Alarma', 'faucet': 'Grifo',
                'microwave': 'Microondas'}
    columns = ['mfcc', 'scem', 'scom', 'srom', 'sbwm', 'tempo', 'rmse', 'momento']

    features = []
    for audio_path in audio_files:
        y, sr = librosa.load(audio_path)
        # Label from the path that was just loaded, so features and category
        # always belong to the same file.
        momento = _momento_from_path(audio_path, momentos)

        mfcc = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20))
        scem = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
        scom = np.mean(librosa.feature.spectral_contrast(S=np.abs(librosa.stft(y)), sr=sr))
        srom = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
        sbwm = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
        # NOTE(review): recent librosa releases document beat.tempo as moved to
        # librosa.feature.rhythm.tempo - confirm against the installed version.
        onset_envelope = librosa.onset.onset_strength(y=y, sr=sr, hop_length=512)
        tempo = librosa.beat.tempo(onset_envelope=onset_envelope, sr=sr, hop_length=512)[0]
        rmse = np.mean(librosa.feature.rms(y=y))

        features.append([mfcc, scem, scom, srom, sbwm, tempo, rmse, momento])

    return pd.DataFrame(features, columns=columns)

# Build the feature table from the .wav chunks found in ./audios/base.
data = get_features('./audios/base/', glob('./audios/base' + '/*.wav'))

In [11]:
def show_df_info(dataframe):
    """Show an overview of a feature table: type, size, shape, columns,
    dtypes, class counts of the ``momento`` column, null counts, the first
    rows and the descriptive statistics.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to summarise; it must have a ``momento`` column.

    Returns
    -------
    None
        Everything is emitted through ``display`` and ``print``.
    """
    rule = "------------------------------------------------------------------------"

    def _display_section(text):
        # Rich-display one headline, then close the section with a rule.
        display(text)
        print(rule)

    def _print_section(text):
        # Plain-print one multi-line summary, then close it with a rule.
        print(text)
        print(rule)

    _display_section("The type of the dataframe is {}.".format(type(dataframe)))
    _display_section("The amount of entries are {}.".format(dataframe.size))
    _display_section("Its shape is {}.".format(dataframe.shape))
    _display_section("Its features are: {}.".format(list(dataframe)))

    _print_section("The data types of columns are: {}".format(dataframe.dtypes))
    _print_section("The counter of each class is: {}".format(dataframe.momento.value_counts()))

    # The null summary is the last printed section and has no closing rule.
    null_counts = dataframe.isna().sum()
    print('The amount of null values is: {}.'.format(null_counts))

    preview = dataframe.head()
    display(preview)
    statistics = dataframe.describe()
    display(statistics)
    
# Show the overview (size, dtypes, class counts, nulls, head, describe) of the extracted features.
show_df_info(data)

"The type of the dataframe is <class 'pandas.core.frame.DataFrame'>."

------------------------------------------------------------------------


'The amount of entries are 736.'

------------------------------------------------------------------------


'Its shape is (92, 8).'

------------------------------------------------------------------------


"Its features are: ['mfcc', 'scem', 'scom', 'srom', 'sbwm', 'tempo', 'rmse', 'momento']."

------------------------------------------------------------------------
The data types of columns are: mfcc       float64
scem       float64
scom       float64
srom       float64
sbwm       float64
tempo      float64
rmse       float64
momento     object
dtype: object
------------------------------------------------------------------------
The counter of each class is: Grifo                 10
Secador_pelo           9
Timbre                 6
Microondas             6
Licuadora              6
Aire_acondicionado     6
Lavadora               6
Telefono               6
Aspiradora             6
Secadora               6
Afeitadora             6
Ducha                  6
Tecleo                 6
Alarma                 4
Silencio               3
Name: momento, dtype: int64
------------------------------------------------------------------------
The amount of null values is: mfcc       0
scem       0
scom       0
srom       0
sbwm       0
tempo      0
rmse       0
momento    0
dtype: int64.


Unnamed: 0,mfcc,scem,scom,srom,sbwm,tempo,rmse,momento
0,-1.606872,4092.840687,19.242309,7751.678339,3074.835521,129.199219,0.156137,Aspiradora
1,5.749998,673.601633,17.140124,1070.265145,1533.607014,129.199219,0.175553,Aire_acondicionado
2,-56.568542,0.0,0.0,0.0,0.0,120.0,0.0,Telefono
3,-12.618124,1806.021749,19.141861,4382.006836,2623.35195,120.0,0.002939,Tecleo
4,-12.631319,4496.352321,16.700788,8196.106665,2977.760301,112.347147,0.029217,Tecleo


Unnamed: 0,mfcc,scem,scom,srom,sbwm,tempo,rmse
count,92.0,92.0,92.0,92.0,92.0,92.0,92.0
mean,-12.494613,2713.37215,17.098056,5001.558916,2206.262771,122.78098,0.084017
std,15.995727,1750.948107,5.677589,3109.396465,943.235342,17.564406,0.080995
min,-56.568542,0.0,0.0,0.0,0.0,60.09266,0.0
25%,-15.382455,1221.964546,17.039904,1999.971182,1776.486005,120.0,0.020813
50%,-10.280765,2751.990089,18.100998,5560.712392,2495.144493,120.0,0.040981
75%,-2.799995,4189.06825,19.231767,7762.513637,2979.619085,129.199219,0.150365
max,8.49985,6201.793979,28.704468,9590.843707,3293.596694,184.570312,0.288248


In [12]:
#data.to_csv('data.csv', index=False)