In [1]:
## extracts features from audio files and converts into numpy
import librosa
import numpy as np
import os, re, csv
from datetime import datetime

global hop_length

# Set the hop length; at 22050 Hz, 512 samples ~= 23ms
hop_length = 128


In [2]:
##save the current directory
cwd = os.getcwd()

##change to sound file directory !!hard-coded
audio_dir = '/Users/panchanok/Desktop/PyHack2019/PyHack2019/sound_samples/aggr/'
os.chdir(audio_dir)

##list files in the directory
audio_files = os.listdir(audio_dir)
print('Taking: ', audio_files)


## return a (flatten) one-D array of mfcc of an audio file
def getMFCC(audio_file):

    #print('*getting ', audio_file)
    y, sr = librosa.load(audio_file)
    # Compute MFCC features from the raw signal
    return librosa.feature.mfcc(y=y, sr=sr, hop_length=hop_length, n_mfcc=13).flatten()

## return a (flatten) one-D array of chromagram of an audio file
def getChroma(audio_file):

    y, sr = librosa.load(audio_file)
    # Separate harmonics and percussives into two waveforms
    y_harmonic, y_percussive = librosa.effects.hpss(y)

    # Compute chroma features from the harmonic signal
    return librosa.feature.chroma_cqt(y=y_harmonic,
                                            sr=sr).flatten()


## return a list of 1-d array of MFCC padded with 0's of ALL audio files
def getPaddedMFCC(audio_files):
    result = [getMFCC(f) for f in audio_files]

    ##pad arrays with 0's. Get arrays of size Max
    max_len = max([len(x) for x in result])
    padded = [np.pad(x, (0, max_len - len(x)), mode = 'constant') for x in result]

    ##sanity check
    is_shorter = sum([len(x) - max_len for x in padded])
    if is_shorter < 0:
        print('not padded well')
        return -1
    else:
        return padded

## return a list of 1-d array of chromagram padded with 0's of ALL audio files
def getPaddedChroma(audio_files):
    result = [getChroma(f) for f in audio_files]

    ##pad arrays with 0's. Get arrays of size Max
    max_len = max([len(x) for x in result])
    padded = [np.pad(x, (0, max_len - len(x)), mode = 'constant') for x in result]

    ##sanity check
    is_shorter = sum([len(x) - max_len for x in padded])
    if is_shorter < 0:
        print('not padded well')
        return -1
    else:
        return padded

print('getting mfcc')
mfcc = getPaddedMFCC(audio_files)
print('getting chromagram')
chrom = getPaddedChroma(audio_files)

if mfcc == -1 or chrom == -1:
    print('ATTENTION: some instance is not padded')
    exit()


Taking:  ['si1_FV3_MP3.mp3', 'ceng3_MV3_MP3.mp3', 'xi3_MV1_MP3.mp3', 'de3_MV1_MP3.mp3', 'si1_MV2_MP3.mp3', 'pou4_MV1_MP3.mp3', 'quan1_MV1_MP3.mp3', 'xia3_FV2_MP3.mp3', 'que1_MV1_MP3.mp3', 'cha3_MV3_MP3.mp3', 'si2_FV1_MP3.mp3', 'xia3_FV3_MP3.mp3', 'dei1_MV2_MP3.mp3', 'si2_MV1_MP3.mp3', 'dei1_FV3_MP3.mp3', 'chi4_FV1_MP3.mp3', 'xi3_FV1_MP3.mp3', 'si1_FV2_MP3.mp3', 'si1_MV3_MP3.mp3', 'shei1_MV2_MP3.mp3', 'que1_MV2_MP3.mp3', 'xia3_FV1_MP3.mp3', 'quan1_MV2_MP3.mp3', 'si2_FV2_MP3.mp3', 'que1_FV3_MP3.mp3', 'quan1_FV3_MP3.mp3', 'chi4_FV3_MP3.mp3', 'de3_FV3_MP3.mp3', 'xi3_FV3_MP3.mp3', 'pou2_MV1_MP3.mp3', 'si1_MV1_MP3.mp3', 'de3_MV2_MP3.mp3', 'xi3_MV2_MP3.mp3', 'si1_FV1_MP3.mp3', 'xi3_FV2_MP3.mp3', 'xi3_MV3_MP3.mp3', 'po4_MV1_MP3.mp3', 'dei1_MV1_MP3.mp3', 'ao2_MV2_MP3.mp3', 'si2_FV3_MP3.mp3', 'chi4_FV2_MP3.mp3', 'quan3_MV2_MP3.mp3', 'xia1_FV1_MP3.mp3', 'chi3_MV3_MP3.mp3', 'pu4_MV1_MP3.mp3', 'cong4_FV2_MP3.mp3', 'quan3_FV3_MP3.mp3', 'wang2_MV3_MP3.mp3', 'xia2_MV2_MP3.mp3', 'de1_FV3_MP3.mp3', 'xi4

If there is error from the above chunk, make sure only mp3 is included in aggr/. Check even invisible .dstore

In [3]:

## concatenate mfcc and chrom features
attr_input = [np.hstack([m, c]) for m, c in zip(mfcc, chrom)]

## checking the final length
#print(len(mfcc[5]), len(chrom[5]), len(x[5]))


##detect targets from sound names
p = re.compile('^[aeou]|[bcdfghjklmnpqrstwxyz]+(?=[aeiou])')
target_input = [p.match(f).group() for f in audio_files]

##for debugging
# for i in range(len(audio_files)):
#     f = audio_files[i]
#     print(f)
#     p.match(f).group()

## check corrrectness
#print(target)
#print(audio_files)


# ## tag labels to attributes. Return 2-d array.
# labeled_input = np.array([np.hstack([i, l]) for i, l in zip(att_input, target)])


os.chdir(cwd)
this_time = datetime.now().strftime('%H_%M_%S')
attr_export_name = 'attr_in_' + this_time + '.csv'
target_export_name = 'target_in_' + this_time + '.csv'

with open(attr_export_name,"w+") as processed:
    csvWriter = csv.writer(processed,delimiter=',')
    csvWriter.writerows(attr_input)

with open(target_export_name,"w+") as processed:
    csvWriter = csv.writer(processed,delimiter=',')
    csvWriter.writerows(target_input)

## check final lengths
#print(len(att_input[3]), len(labeled_input[3]), labeled_input)
print('Attribute data saved as ', attr_export_name)
print('Target data saved as ', target_export_name)


Attribute data saved as  attr_in_16_43_55.csv
Target data saved as  target_in_16_43_55.csv
