In [6]:
import essentia
import essentia.standard as es
import glob
import os
import librosa
import soundfile as sf
from essentia.standard import *
import numpy as np
import ntpath
import pandas as pd
import logging
from multiprocessing import Pool
import traceback
import pathlib
logging.basicConfig(filename='audio_feature_extraction.log',level=logging.DEBUG)

import argparse

def get_audio_features(f):
    """Extract the non-array Essentia descriptors of one audio file.

    Runs ``es.MusicExtractor`` on ``f`` with 'mean' and 'stdev' requested for
    the low-level, rhythm and tonal statistics.

    Args:
        f: Audio file path passed straight to the extractor.

    Returns:
        dict: descriptor name -> value, for every descriptor whose name does
        not contain "metadata" and whose value is not a ``numpy.ndarray``.
    """
    extractor = es.MusicExtractor(
        lowlevelStats=['mean', 'stdev'],
        rhythmStats=['mean', 'stdev'],
        tonalStats=['mean', 'stdev'],
    )
    # The extractor returns two results; only the first one is used here.
    pool, _frames = extractor(f)

    info = {}
    for name in pool.descriptorNames():
        if "metadata" in name:
            continue
        value = pool[name]
        # Array-valued descriptors are dropped so each kept value fits in a
        # single table cell.
        if isinstance(value, np.ndarray):
            continue
        info[name] = value

    return info

def task(file, ID):
    """Extract features for one file without letting a failure propagate.

    Args:
        file: Audio file path forwarded to ``get_audio_features``.
        ID: Identifier returned unchanged next to the features.

    Returns:
        tuple: ``(features, ID)`` on success, ``(None, None)`` if anything
        raised; in that case the traceback is written to the log.
    """
    try:
        extracted = get_audio_features(file)
        logging.info("processed file {}".format(file))
    except Exception:
        # One unreadable file must not abort the whole batch: log and signal
        # the failure through the (None, None) sentinel.
        logging.error(traceback.format_exc())
        return (None, None)
    return (extracted, ID)

    
def get_all_audio_features_parallel(files, out_csv, processes=7):
    """Extract features for many audio files in parallel and save them as CSV.

    Each file is processed by ``task`` in a worker process. Files whose
    extraction failed (``task`` returned ``(None, None)``) are left out of the
    result.

    Args:
        files: Iterable of audio file paths. Each path is also used as the
            file's identifier in the ``filepath`` column.
        out_csv: Destination path of the CSV file.
        processes: Number of worker processes in the pool (default 7).

    Returns:
        pandas.DataFrame: one row per successfully processed file with the
        columns ``filepath``, one column per feature, and ``ID``
        (``<stem of parent directory>/<stem of file>``). If no file could be
        processed, an empty frame with only ``filepath`` and ``ID`` columns is
        written and returned.

    Raises:
        KeyError: if a later file lacks a feature that the first successfully
            processed file has.
    """
    # Materialise once so one-shot iterables (generators) can be paired with
    # themselves below.
    files = list(files)

    if files:
        with Pool(processes=processes) as pool:
            results = pool.starmap(task, [(path, path) for path in files])
    else:
        # Nothing to do: avoid spawning a worker pool for no input.
        results = []

    succeeded = [
        (feature, filepath)
        for feature, filepath in results
        if feature is not None
    ]

    if not succeeded:
        # Previously this case crashed with an opaque unpacking ValueError.
        logging.warning(
            "no audio features extracted from %d input file(s)", len(files)
        )
        df = pd.DataFrame(columns=["filepath", "ID"])
        df.to_csv(out_csv)
        return df

    features, filepaths = zip(*succeeded)

    # Column set is taken from the first successful file.
    data = {"filepath": list(filepaths)}
    for key in features[0]:
        data[key] = [feature[key] for feature in features]

    df = pd.DataFrame(data=data)
    df["ID"] = df["filepath"].apply(
        lambda x: pathlib.Path(x).parents[0].stem + "/" + pathlib.Path(x).stem
    )
    df.to_csv(out_csv)
    return df
#get_all_audio_features_parallel(list_dir=in_wav_dirs, out_csv=csv_path)
# get_all_audio_features_parallel(["/data/zalo/hit-song-prediction/train-wav-samples", "/data/zalo/hit-song-prediction/test-wav-samples"])


# Directory that contains the public-test wav files.
in_wav_dirs = "/media/ben/datadrive/Zalo/voice-verification/Train-Test-Data/public-test/"
out_csv = "features_public_test.csv"
# Collect every wav file located directly inside the directory.
files = glob.glob("{}/*.wav".format(in_wav_dirs))

# Extract the features of all collected files and write them to out_csv.
df = get_all_audio_features_parallel(files, out_csv)

KeyboardInterrupt: 

In [10]:
# Root directory whose immediate sub-directories contain the wav files.
in_wav_dirs = "/media/ben/datadrive/Zalo/voice-verification/Train-Test-Data/dataset"
out_csv = "features_train-test-data.csv"
# Collect every wav file exactly one directory level below the root.
files = glob.glob("{}/*/*.wav".format(in_wav_dirs))

# Extract the features of all collected files and write them to out_csv.
df = get_all_audio_features_parallel(files, out_csv)