In [1]:
## extracts features from audio files and converts into numpy
import librosa
import numpy as np
import os
import re

##save the current directory
cwd = os.getcwd()

##change to sound file directory
audio_dir = '/Users/panchanok/Desktop/PyHack2019/sound/tone_perfect/'
os.chdir(audio_dir)

##list files in the directory
audio_files = os.listdir(audio_dir)[0:10]
#print(audio_files)

In [2]:
global hop_length

# Set the hop length; at 22050 Hz, 512 samples ~= 23ms
hop_length = 128

## return a one-D array with mcff, mcff_delta, and chomagram of the audio file
def getMFCC(audio_file):
    
    print('*getting ', audio_file)
    y, sr = librosa.load(audio_file)
    # Compute MFCC features from the raw signal
    return librosa.feature.mfcc(y=y, sr=sr, hop_length=hop_length, n_mfcc=13).flatten()


def getChroma(audio_file):
    
    y, sr = librosa.load(audio_file)
    # Separate harmonics and percussives into two waveforms
    y_harmonic, y_percussive = librosa.effects.hpss(y)
    
    # Compute chroma features from the harmonic signal
    return librosa.feature.chroma_cqt(y=y_harmonic,
                                            sr=sr).flatten()

In [3]:
## return a 2-d array of MFCC padded with 0's
## Each array element is of an audio
def getPaddedMFCC(audio_files):
    input = [getMFCC(f) for f in audio_files]
    
    ##pad arrays with 0's. Get arrays of size Max
    max_len = max([len(x) for x in input])
    padded = [np.pad(x, (0, max_len - len(x))) for x in input]
    
    ##sanity check
    is_shorter = sum([len(x) - max_len for x in padded])
    if is_shorter < 0:
        print('not padded well')
        return -1
    else:
        return padded
    
## return a 2-d array of chromagram   ,y h .    padded with 0's
## Each array element is of an audio
def getPaddedChroma(audio_files):
    input = [getChroma(f) for f in audio_files]
    
    ##pad arrays with 0's. Get arrays of size Max
    max_len = max([len(x) for x in input])
    padded = [np.pad(x, (0, max_len - len(x))) for x in input]
    
    ##sanity check
    is_shorter = sum([len(x) - max_len for x in padded])
    if is_shorter < 0:
        print('not padded well')
        return -1
    else:
        return padded 

In [4]:
mfcc = getPaddedMFCC(audio_files)
chrom = getPaddedChroma(audio_files)

## concatenate mfcc and chrom features
att_input = [np.hstack([m, c]) for m, c in zip(mfcc, chrom)]

## checking the final length
#print(len(mfcc[5]), len(chrom[5]), len(x[5]))

*getting  sa3_MV1_MP3.mp3
*getting  cao2_FV1_MP3.mp3
*getting  hong1_MV2_MP3.mp3
*getting  zheng3_MV3_MP3.mp3
*getting  shao4_MV1_MP3.mp3
*getting  ke4_MV3_MP3.mp3
*getting  kai4_MV2_MP3.mp3
*getting  ken4_FV3_MP3.mp3
*getting  chu3_FV1_MP3.mp3
*getting  nong4_FV1_MP3.mp3


In [5]:
##detect targets from sound names
p = re.compile('^[aeo]|[bcdfghjklmnpqrstwz]+(?=[aeou])')
#sample = 'wei'

#p.match(sample).group()
target = [p.match(f).group() for f in audio_files]

## check corrrectness
#print(target)
#print(audio_files)


## tag labels to attributes
labeled_input = [np.hstack([i, l]) for i, l in zip(att_input, target)]

## check final lengths
#print(len(att_input[3]), len(labeled_input[3]), labeled_input)

In [10]:
len(labeled_input[1])

2676