### Script para exportar la información de los metadatos a formato pickle y eliminar el silencio de los audios

Import the libraries

In [24]:
import collections
import gzip
import json
import os
import pickle
import subprocess
import xml.etree.ElementTree as ET

Directories

In [25]:
# All data locations are resolved against the directory the kernel was
# started in, so the notebook must be run from the project root.
current_directory = os.getcwd()
xml_path, wav_path = (
    os.path.join(current_directory, sub_dir)
    for sub_dir in ('data/xml', 'data/wav')
)

Functions

In [26]:
def removeSilence(original_audio):
    """Produce a silence-trimmed copy of an audio file using SoX.

    The copy is written next to the original as ``<name>_sil<ext>``
    (``a.wav`` -> ``a_sil.wav``). If that file already exists it is reused
    and SoX is not invoked again.

    Args:
        original_audio: Path of the input audio file.

    Returns:
        tuple: ``(response, new_audio)``. ``response`` is True when the
        trimmed file is available (already on disk, or SoX exited with
        status 0 / left the output file behind) and False otherwise.
        ``new_audio`` is the path of the trimmed file.
    """
    # Insert the suffix in front of the extension only; str.replace would
    # also rewrite a '.wav' that happens to appear in a directory name.
    base, extension = os.path.splitext(original_audio)
    new_audio = base + '_sil' + extension

    # A previous run already produced the file: report it as available so
    # callers still record its location on a re-run.
    if os.path.isfile(new_audio):
        return True, new_audio

    # Pass an argument list (no shell) so paths containing spaces or shell
    # metacharacters reach SoX unchanged. The trailing arguments configure
    # SoX's `silence` effect; see the SoX manual for their meaning.
    command = [
        'sox', original_audio, new_audio,
        'silence', '1', '0.1', '1%', '-1', '0.1', '1%',
    ]
    try:
        return_code = subprocess.run(command).returncode
    except OSError:
        # SoX is not installed or not executable: treat as a failed run
        # instead of aborting the whole export.
        return_code = None

    response = return_code == 0 or os.path.isfile(new_audio)
    return response, new_audio

In [27]:
def exportMetadata(xml_path, wav_path, output_path='data.pickle.gz'):
    """Collect per-recording metadata from XML files and save it as a gzipped pickle.

    For every ``*.xml`` file in ``xml_path`` that has a same-named ``.wav``
    file in ``wav_path``, the text of each direct child of the XML root is
    stored under the child's tag (the first occurrence wins when a tag is
    repeated). The paths of the XML file, the WAV file and, when
    ``removeSilence`` reports success, the silence-trimmed WAV are added
    under ``'xmlDir'``, ``'wavDir'`` and ``'silDir'``.

    Args:
        xml_path: Directory containing the XML metadata files.
        wav_path: Directory containing the matching WAV files.
        output_path: Destination of the gzipped pickle. Defaults to
            ``'data.pickle.gz'`` in the current working directory.

    Returns:
        dict: Maps each processed XML file name to its metadata dict.
    """
    xml_suffix = '.xml'
    data = {}

    # os.listdir returns entries in arbitrary order; sorting keeps the record
    # order (and therefore the pickle contents) reproducible between runs.
    for xml_file in sorted(os.listdir(xml_path)):
        if not xml_file.endswith(xml_suffix):
            continue

        dir_xml = os.path.join(xml_path, xml_file)
        # Swap only the trailing extension, not every '.xml' in the name.
        wav_file = xml_file[:-len(xml_suffix)] + '.wav'
        dir_wav = os.path.join(wav_path, wav_file)

        # Metadata without its recording is of no use downstream: skip it.
        if not os.path.isfile(dir_wav):
            continue

        record = {}
        for child in ET.parse(dir_xml).getroot():
            # setdefault keeps the first value for a repeated tag, matching a
            # root.find(tag) lookup without re-scanning the tree per child.
            record.setdefault(child.tag, child.text)

        record['xmlDir'] = dir_xml
        record['wavDir'] = dir_wav

        response, new_audio = removeSilence(dir_wav)
        if response:
            record['silDir'] = new_audio

        data[xml_file] = record

    # NOTE(review): protocol 2 is presumably kept so Python 2 consumers can
    # load the file - confirm before raising it.
    with gzip.open(output_path, 'wb') as f:
        pickle.dump(data, f, protocol=2)

    print("Files for process:", len(data))
    print("Done.")

    return data

In [28]:
# Run the export over the configured directories; the returned dict is kept
# for inspection in the cells below.
data = exportMetadata(xml_path=xml_path, wav_path=wav_path)

Files for process: 350
Done.


In [29]:
# Preview: print every field of the first record only, as a sample of what
# the export produced.
for key, metadata in data.items():
    for key_, value in metadata.items():
        print(key_, ':', str(value))
    break

MediaId : 19274
FileName : LIFECLEF2015_BIRDAMAZON_XC_WAV_RN19274.wav
ClassId : lzezgo
Date : 2009-08-08
Time : 10:00
Locality : Parque Natural Chicaque
Latitude : 4.611331
Longitude : -74.3153
Elevation : 2200
Author : Oswaldo Cortes
AuthorID : VVDRVZCVWW
Content : call
Comments : None
Quality : 3
Year : BirdCLEF2015
BackgroundSpecies : None
Order : Passeriformes
Family : Rhinocryptidae
Genus : Scytalopus
Species : latrans
Sub-species : None
VernacularNames : Blackish Tapaculo
xmlDir : /home/bigdata/Documentos/birdPycon2018/data/xml/LIFECLEF2015_BIRDAMAZON_XC_WAV_RN19274.xml
wavDir : /home/bigdata/Documentos/birdPycon2018/data/wav/LIFECLEF2015_BIRDAMAZON_XC_WAV_RN19274.wav
