# Extracción y descripción de los datos

Empiezo importando las librerías que voy a utilizar.

In [2]:
%matplotlib inline

import numpy as np
import pandas as pd

import librosa.display
import librosa

from glob import glob

import ffmpeg

import os

from pydub import AudioSegment
from pydub.utils import make_chunks

import re

Continúo creando 3 funciones con las que voy a hacer lo siguiente:
 - La primera divide un archivo de audio de larga duración en pequeños audios de 2 segundos de duración (`chunk_length_ms = 2000`), y los guarda en la carpeta que le he especificado.
 - La segunda accede a cada uno de esos audios de 2 segundos y saca las features características de dicho archivo, y en base a su nombre le asigna una categoría u otra.
 - La tercera simplemente me enseña las principales características y métricas de la base de datos.

Finalmente, guardo el dataset en mi carpeta como un CSV.

Dado que la base de datos ya la tengo creada, este JN lo voy a aplicar sobre audios que luego voy a utilizar para predecir con mi modelo y ver si es lo suficientemente bueno.

In [4]:
def decompose_files(data_dir, audio_files, out_dir='./audios/fourier_32/',
                    chunk_length_ms=2000, export_format="mp3"):
    """Split every audio file in ``audio_files`` into fixed-length chunks on disk.

    Each input file is decoded with ``AudioSegment.from_file``, cut with
    ``make_chunks`` and every chunk is exported to ``out_dir`` under the name
    ``<source file name><chunk index>.wav``.

    Parameters
    ----------
    data_dir : str
        Kept only so existing calls keep working; it is no longer read. The
        previous version ignored ``audio_files`` and looked files up through
        ``os.listdir(data_dir)[j]``, which also lists entries (hidden files, for
        instance) that a ``glob('*')`` pattern skips, so the wrong file could be
        decoded.
    audio_files : iterable of str
        Paths of the audio files to split (for example the result of ``glob``).
    out_dir : str, optional
        Folder that receives the chunks. Created if it does not exist.
    chunk_length_ms : int, optional
        Length of each chunk in milliseconds (2000 ms by default).
    export_format : str, optional
        Value passed as ``format`` to ``chunk.export``.
        NOTE(review): the chunk name always ends in ".wav" while the default
        encoding is "mp3", so extension and content disagree. The default is
        left as it was because changing the encoding changes the audio the
        features are later computed from; pass ``export_format="wav"`` if
        matching content is wanted.

    Returns
    -------
    None
    """
    audio_files = list(audio_files)
    if not audio_files:
        return

    # Exporting into a missing folder would only fail after the audio was decoded.
    os.makedirs(out_dir, exist_ok=True)

    for audio_path in audio_files:
        # Name the chunks after the very file that is being decoded.
        source_name = os.path.basename(audio_path)
        myaudio = AudioSegment.from_file(audio_path)
        chunks = make_chunks(myaudio, chunk_length_ms)

        for i, chunk in enumerate(chunks):
            chunk_name = "{}{}.wav".format(source_name, i)
            chunk.export(os.path.join(out_dir, chunk_name), format=export_format)
            
# Run the splitter on every entry that glob finds directly under ./audios/audios_base.
decompose_files('./audios/audios_base/', glob('./audios/audios_base' + '/*'))

In [None]:
def get_features(data_dir, audio_files, sample_rate=8000, n_fourier=64):
    """Build a table with one row of summary features (plus a label) per audio file.

    For every path in ``audio_files`` the clip is loaded as mono at
    ``sample_rate`` and reduced to:

    * the overall mean of the MFCC matrix (20 coefficients), spectral centroid,
      spectral contrast (4 bands), spectral roll-off, spectral bandwidth and RMS;
    * a tempo estimate computed from the onset-strength envelope;
    * the magnitudes of an ``n_fourier``-point FFT of the waveform;
    * a label ``momento`` looked up from the file name.

    Parameters
    ----------
    data_dir : str
        Kept only so existing calls keep working; it is no longer read. The
        previous version took the label from ``os.listdir(data_dir)[i]`` while
        the features came from ``audio_files[i]``; the two listings are not
        filtered or ordered the same way, so labels could be attached to the
        wrong rows.
    audio_files : iterable of str
        Paths of the clips to describe.
    sample_rate : int, optional
        Sampling rate passed to ``librosa.load`` (8000 by default).
    n_fourier : int, optional
        Number of FFT points, and therefore of ``Fourier<k>`` columns (64 by
        default).

    Returns
    -------
    pandas.DataFrame
        Columns ``mfcc, scem, scom, srom, sbwm, tempo, rmse``, then
        ``Fourier1`` .. ``Fourier<n_fourier>``, then ``momento``. An empty
        ``audio_files`` yields an empty frame with those columns.
    """
    momentos = {'ducha': 'Ducha', 'cena': 'Cena', 'washing': 'Lavadora', 'vacuum': 'Aspiradora',
                'shaver': 'Afeitadora', 'hairdryer': 'Secador_pelo', 'airconditioner': 'Aire_acondicionado',
                'cellphone': 'Telefono', 'comp': 'Tecleo', 'silence': 'Silencio', 'dryer': 'Secadora',
                'blender': 'Licuadora', 'doorbell': 'Timbre', 'alarm': 'Alarma', 'faucet': 'Grifo',
                'microwave': 'Microondas'}
    # Raw string: "\w" in a plain literal is an invalid escape sequence.
    # Strips every ".xxx" / "_xxx" run, leaving the leading word of the file name.
    pattern = r"[._][\w]+"

    summary_columns = ['mfcc', 'scem', 'scom', 'srom', 'sbwm', 'tempo', 'rmse']
    fourier_columns = ['Fourier{}'.format(k) for k in range(1, n_fourier + 1)]
    columns = summary_columns + fourier_columns + ['momento']

    features = []
    for audio_path in audio_files:
        y, sr = librosa.load(audio_path, sr=sample_rate, mono=True)

        # Label and features are derived from the same path.
        name = re.sub(pattern, '', os.path.basename(audio_path))
        momento = momentos.get(name, 'Otro')

        mfcc = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20))
        scem = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
        scom = np.mean(librosa.feature.spectral_contrast(S=np.abs(librosa.stft(y)), sr=sr, n_bands=4))
        srom = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
        sbwm = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
        onset_envelope = librosa.onset.onset_strength(y=y, sr=sr, hop_length=512)
        # NOTE(review): newer librosa releases expose this estimator as
        # librosa.feature.tempo - confirm against the installed version.
        tempo = librosa.beat.tempo(onset_envelope=onset_envelope, sr=sr, hop_length=512)[0]
        rmse = np.mean(librosa.feature.rms(y=y))

        # numpy crops (or zero-pads) the input to n_fourier samples, so with the
        # default only the first 64 samples of the clip contribute. For a real
        # signal the magnitudes are mirrored (Fourier2 == Fourier64, ...); all
        # columns are kept so the table layout stays the same.
        # The former "[:8000 // 2]" slice never removed anything from a
        # 64-point result and has been dropped.
        spectrum = np.abs(np.fft.fft(y, n=n_fourier))

        features.append([mfcc, scem, scom, srom, sbwm, tempo, rmse] + list(spectrum) + [momento])

    return pd.DataFrame(features, columns=columns)

# Build the feature table from every *.wav entry found in ./audios/fourier_32.
data = get_features('./audios/fourier_32/', glob('./audios/fourier_32' + '/*.wav'))

In [4]:
# (rows, columns) of the feature table before any filtering.
data.shape

(14611, 72)

En este paso intermedio, dado que hay una pequeña cantidad de audios cuyas features son todas cero, los elimino para no ensuciar el modelo.

In [5]:
# Keep only the rows whose 'Fourier5' value is non-zero and renumber the index from 0.
# NOTE(review): only the 'Fourier5' column is tested here, not every feature - confirm
# that this single column is a sufficient marker for the clips that should be dropped.
data = data[data['Fourier5'] != 0].reset_index(drop=True)

In [6]:
def show_df_info(dataframe):
    """Show a quick overview of ``dataframe`` in the notebook output.

    In order: its type, ``size``, ``shape`` and column list (through
    ``display``), then its dtypes, the value counts of the ``momento`` column
    and the per-column null counts (through ``print``), and finally
    ``head()`` and ``describe()`` (through ``display``). A dashed rule is
    printed after each of the first six items.

    ``dataframe`` must expose a ``momento`` column. Returns ``None``.
    """
    rule = "------------------------------------------------------------------------"

    # (use display?, message template, value getter). Getters are evaluated
    # one at a time so each value is computed right before it is shown.
    sections = (
        (True, "The type of the dataframe is {}.", lambda df: type(df)),
        (True, "The amount of entries are {}.", lambda df: df.size),
        (True, "Its shape is {}.", lambda df: df.shape),
        (True, "Its features are: {}.", lambda df: list(df)),
        (False, "The data types of columns are: {}", lambda df: df.dtypes),
        (False, "The counter of each class is: {}", lambda df: df.momento.value_counts()),
    )

    for rich, template, getter in sections:
        message = template.format(getter(dataframe))
        if rich:
            display(message)
        else:
            print(message)
        print(rule)

    null_counts = dataframe.isna().sum()
    print('The amount of null values is: {}.'.format(null_counts))

    preview = dataframe.head()
    display(preview)
    summary = dataframe.describe()
    display(summary)
    
# Print the overview of the filtered feature table.
show_df_info(data)

"The type of the dataframe is <class 'pandas.core.frame.DataFrame'>."

------------------------------------------------------------------------


'The amount of entries are 1030824.'

------------------------------------------------------------------------


'Its shape is (14317, 72).'

------------------------------------------------------------------------


"Its features are: ['mfcc', 'scem', 'scom', 'srom', 'sbwm', 'tempo', 'rmse', 'Fourier1', 'Fourier2', 'Fourier3', 'Fourier4', 'Fourier5', 'Fourier6', 'Fourier7', 'Fourier8', 'Fourier9', 'Fourier10', 'Fourier11', 'Fourier12', 'Fourier13', 'Fourier14', 'Fourier15', 'Fourier16', 'Fourier17', 'Fourier18', 'Fourier19', 'Fourier20', 'Fourier21', 'Fourier22', 'Fourier23', 'Fourier24', 'Fourier25', 'Fourier26', 'Fourier27', 'Fourier28', 'Fourier29', 'Fourier30', 'Fourier31', 'Fourier32', 'Fourier33', 'Fourier34', 'Fourier35', 'Fourier36', 'Fourier37', 'Fourier38', 'Fourier39', 'Fourier40', 'Fourier41', 'Fourier42', 'Fourier43', 'Fourier44', 'Fourier45', 'Fourier46', 'Fourier47', 'Fourier48', 'Fourier49', 'Fourier50', 'Fourier51', 'Fourier52', 'Fourier53', 'Fourier54', 'Fourier55', 'Fourier56', 'Fourier57', 'Fourier58', 'Fourier59', 'Fourier60', 'Fourier61', 'Fourier62', 'Fourier63', 'Fourier64', 'momento']."

------------------------------------------------------------------------
The data types of columns are: mfcc         float64
scem         float64
scom         float64
srom         float64
sbwm         float64
tempo        float64
rmse         float64
Fourier1     float64
Fourier2     float64
Fourier3     float64
Fourier4     float64
Fourier5     float64
Fourier6     float64
Fourier7     float64
Fourier8     float64
Fourier9     float64
Fourier10    float64
Fourier11    float64
Fourier12    float64
Fourier13    float64
Fourier14    float64
Fourier15    float64
Fourier16    float64
Fourier17    float64
Fourier18    float64
Fourier19    float64
Fourier20    float64
Fourier21    float64
Fourier22    float64
Fourier23    float64
              ...   
Fourier36    float64
Fourier37    float64
Fourier38    float64
Fourier39    float64
Fourier40    float64
Fourier41    float64
Fourier42    float64
Fourier43    float64
Fourier44    float64
Fourier45    float64
Fourier46    float64
Fourier47    f

Unnamed: 0,mfcc,scem,scom,srom,sbwm,tempo,rmse,Fourier1,Fourier2,Fourier3,...,Fourier56,Fourier57,Fourier58,Fourier59,Fourier60,Fourier61,Fourier62,Fourier63,Fourier64,momento
0,-19.60294,1016.689071,28.116497,1344.604492,526.793735,117.1875,0.082146,0.018555,0.015966,0.02031,...,0.03694,0.063189,0.284137,0.063383,0.102563,0.036719,0.019779,0.02031,0.015966,Timbre
1,-23.112227,1361.218925,25.115322,2646.606445,1025.292241,117.1875,0.055449,0.221948,0.24437,0.24457,...,0.036473,0.084608,0.108805,0.122175,0.109803,0.547955,0.635384,0.24457,0.24437,Cena
2,4.153023,662.289054,21.617422,1497.436523,832.083155,156.25,0.240111,3.205282,12.572833,1.9825,...,0.472631,0.274115,0.177703,0.353226,0.455353,0.41351,0.272553,1.9825,12.572833,Lavadora
3,-15.240392,1539.737568,22.446293,2865.844727,1002.431942,133.928571,0.11392,0.131522,0.614816,0.363291,...,2.669167,0.6694,1.199691,1.602963,3.889082,0.397376,0.761557,0.363291,0.614816,Secador_pelo
4,6.963927,582.127562,19.931546,1348.144531,798.607193,133.928571,0.212797,5.507216,1.900305,0.823831,...,0.154926,0.249078,0.62459,0.252448,0.586256,0.69159,1.555114,0.823831,1.900305,Aire_acondicionado


Unnamed: 0,mfcc,scem,scom,srom,sbwm,tempo,rmse,Fourier1,Fourier2,Fourier3,...,Fourier55,Fourier56,Fourier57,Fourier58,Fourier59,Fourier60,Fourier61,Fourier62,Fourier63,Fourier64
count,14317.0,14317.0,14317.0,14317.0,14317.0,14317.0,14317.0,14317.0,14317.0,14317.0,...,14317.0,14317.0,14317.0,14317.0,14317.0,14317.0,14317.0,14317.0,14317.0,14317.0
mean,-8.975674,1254.326701,22.267985,2249.474068,892.656208,130.348634,0.08845316,1.073632,1.47357,1.051841,...,0.3253452,0.3668938,0.3769518,0.5267566,0.5206251,0.6607482,0.8611619,0.9009214,1.051841,1.47357
std,10.089034,554.559052,3.257823,810.162213,197.184575,19.886981,0.08053672,2.340416,2.559341,1.730712,...,0.4988586,0.6135525,0.6154636,0.8838577,0.6854161,0.9863301,2.271212,1.672453,1.730712,2.559341
min,-56.459336,168.27523,1.698515,291.992188,101.93116,37.5,5.919943e-08,2.800062e-12,2.806961e-12,2.821882e-12,...,3.139653e-12,3.070597e-12,3.003386e-12,2.937453e-12,2.884319e-12,2.853604e-12,2.841517e-12,2.834276e-12,2.821882e-12,2.806961e-12
25%,-16.455608,798.831594,19.928018,1515.258789,805.097363,117.1875,0.01969259,0.04268549,0.07116426,0.06826373,...,0.04051603,0.04038269,0.0417944,0.04158474,0.04357287,0.04706779,0.04959087,0.05670576,0.06826373,0.07116426
50%,-7.940215,1281.012813,21.832554,2380.615234,927.216385,133.928571,0.06708072,0.2037759,0.3259047,0.3398935,...,0.1507105,0.1619576,0.1833593,0.2105103,0.2593778,0.2845868,0.2847706,0.2853103,0.3398935,0.3259047
75%,-3.018813,1738.787047,23.659923,2838.378906,1038.395741,133.928571,0.1351235,0.8272752,1.706695,1.2599,...,0.3903347,0.4485462,0.4874474,0.6199973,0.7520742,0.8248828,0.9101153,1.004417,1.2599,1.706695
max,8.775053,2707.461116,51.275412,3609.375,1299.501746,312.5,0.5108476,26.31798,20.2149,16.47404,...,6.556945,7.642108,8.626682,10.38958,7.717574,7.864245,24.82451,13.76832,16.47404,20.2149


In [7]:
# Write the feature table to 'data_fourier_64.csv' in the working directory, without the index column.
data.to_csv('data_fourier_64.csv', index=False)