In [1]:
# dependencies
import wave
from bisect import bisect_left, bisect_right
from os import listdir
from random import choice

import numpy as np
import openpyxl
import pylab
import yaml

In this notebook, we declare functions that build a dataset from an Excel ROI (region of interest) file.  
The dataset is a dictionary of the form:  
{  
species:  
    {  
        min_frequency:[list]  
        max_frequency:[list]  
        start_time:[list]  
        end_time:[list]  
        recording name:[list]  
    }  
    ...  
}  
We also have a function that creates a simplified dataset of the form:  
{  
species:  
    {  
        min_freq:val  
        max_freq:val  
        avg_time:val  
        recording name:[list]  
    }  
    ...  
}  
     
We save these dictionaries as YAML files.

In [2]:
def wavInfo(rec_file):
    """Read every sample of a 16-bit PCM WAV file.

    Parameters
    ----------
    rec_file : str or file-like
        Anything ``wave.open`` accepts for reading.

    Returns
    -------
    wave_info : numpy.ndarray
        One-dimensional, writable array of signed 16-bit samples in the
        order they are stored in the file (channels of a multi-channel
        recording are therefore left interleaved).
    framerate : int
        Sampling rate reported by the WAV header.

    Raises
    ------
    wave.Error
        If the file is not a WAV file the ``wave`` module can parse.
    ValueError
        If the raw data length is not a multiple of two bytes.
    """
    # The context manager closes the file even when reading fails.
    with wave.open(rec_file, 'rb') as wav_file:
        framerate = wav_file.getframerate()
        frames = wav_file.readframes(wav_file.getnframes())
    # All .wavs in our dataset are 16bit, and WAV PCM data is little-endian.
    # frombuffer replaces the deprecated fromstring / 'Int16' alias; the copy
    # keeps the result writable, as fromstring's result was.
    wave_info = np.frombuffer(frames, dtype='<i2').copy()
    return wave_info, framerate

In [3]:
# get info. don't graph
def specInfo(rec_file):
    """Compute the spectrogram of a WAV recording without drawing it.

    The samples returned by ``wavInfo`` are analysed with 512-sample
    segments, an overlap of 256 samples and a Hanning window.

    ``pylab.specgram`` is the plotting wrapper: besides returning the data it
    draws an image on the current axes at every call. ``pylab.mlab.specgram``
    performs the same computation and only returns the data.

    Parameters
    ----------
    rec_file : str
        Path of the WAV file, forwarded to ``wavInfo``.

    Returns
    -------
    spectrum : 2-D array
        Spectrogram values; rows follow ``freqs`` and columns follow ``t``.
    freqs : 1-D array
        Frequencies corresponding to the rows of ``spectrum``.
    t : 1-D array
        Times corresponding to the columns of ``spectrum``.
    """
    wave_info, framerate = wavInfo(rec_file)
    spectrum, freqs, t = pylab.mlab.specgram(
        wave_info,
        NFFT=512,
        noverlap=256,
        window=pylab.window_hanning,
        Fs=framerate,
    )
    return spectrum, freqs, t

In [4]:
# search for the index of the leftmost value in an ordered array
# (of times or frequencies in our case) that still meet our criteria
def leftmostBinSearch(A, lo, hi, target):
    """Return the index of the leftmost element of ``A[lo..hi]`` that is >= target.

    Parameters
    ----------
    A : sequence
        Values sorted in ascending order.
    lo, hi : int
        Inclusive bounds of the index range to search.
    target : number
        Lower limit the selected element has to reach.

    Returns
    -------
    int
        Smallest index ``i`` in ``[lo, hi]`` with ``A[i] >= target``. When
        every element in the range is smaller than ``target`` the result is
        clamped to ``hi`` so that a valid index is always returned.

    Raises
    ------
    ValueError
        If the range is empty (``lo > hi``), e.g. for an empty array.
    """
    if lo > hi:
        raise ValueError("empty search range: lo=%r, hi=%r" % (lo, hi))
    # bisect_left works on a half-open range, hence hi + 1. It may return
    # hi + 1 when no element qualifies; clamp that back into the range.
    return min(bisect_left(A, target, lo, hi + 1), hi)

# search for the index of the rightmost value in an ordered array
# (of times or frequencies in our case) that still meet our criteria
def rightmostBinSearch(A, lo, hi, target):
    """Return the index of the rightmost element of ``A[lo..hi]`` that is <= target.

    Parameters
    ----------
    A : sequence
        Values sorted in ascending order.
    lo, hi : int
        Inclusive bounds of the index range to search.
    target : number
        Upper limit the selected element must not exceed.

    Returns
    -------
    int
        Largest index ``i`` in ``[lo, hi]`` with ``A[i] <= target``. When
        every element in the range is larger than ``target`` the result is
        clamped to ``lo`` so that a valid index is always returned.

    Raises
    ------
    ValueError
        If the range is empty (``lo > hi``), e.g. for an empty array.
    """
    if lo > hi:
        raise ValueError("empty search range: lo=%r, hi=%r" % (lo, hi))
    # bisect_right returns the insertion point after the last element that is
    # <= target, so the element itself sits one position to the left. Clamp
    # to lo for the case where no element qualifies.
    return max(bisect_right(A, target, lo, hi + 1) - 1, lo)

In [26]:
def getBounds(A, minVal, maxVal):
    """Locate the index window of ``A`` that corresponds to ``[minVal, maxVal]``.

    Both lookups run over the whole array (indices ``0`` to ``len(A) - 1``).

    Returns
    -------
    (left, right) : tuple
        ``left`` is the result of ``leftmostBinSearch`` for ``minVal`` and
        ``right`` the result of ``rightmostBinSearch`` for ``maxVal``.
    """
    last = len(A) - 1
    return (
        leftmostBinSearch(A, 0, last, minVal),
        rightmostBinSearch(A, 0, last, maxVal),
    )

In [27]:
# get modified spectrum of frequencies and times that matter to us
def specMod(spectrum, freqs, times, f1, f2, t1, t2):
    """Cut the region of interest out of a spectrogram.

    Parameters
    ----------
    spectrum : sequence of sequences
        Indexed as ``spectrum[frequency_index][time_index]``.
    freqs, times :
        Accepted for signature symmetry; not read by this function.
    f1, f2 : int
        Row (frequency) indices; rows ``f1`` up to but excluding ``f2`` are
        kept. Row ``f1`` is always included, even when ``f2 <= f1``.
    t1, t2 : int
        Column (time) slice bounds applied to every kept row (``t2`` excluded).

    Returns
    -------
    list
        One ``spectrum[row][t1:t2]`` slice per kept row.
    """
    region = [spectrum[f1][t1:t2]]
    # check when fix right limit
    region.extend(spectrum[row][t1:t2] for row in range(f1 + 1, f2))
    return region

In [28]:
# based off: http://stackoverflow.com/questions/15961979/how-do-i-plot-a-spectrogram-the-same-way-that-pylabs-specgram-does
# plot the spectrogram of our region of interest
def plotModSpec(specMod, freqs, times, file):
    """Draw a region-of-interest spectrogram in decibels on a new figure.

    Parameters
    ----------
    specMod : 2-D array-like
        Spectrogram values, rows along frequency and columns along time.
    freqs : sequence
        Frequency (Hz) of each row; first and last entries set the y limits.
        NOTE(review): presumably already cropped to the rows of ``specMod``
        -- confirm against the caller.
    times : sequence
        Time (s) of each column; first and last entries set the x limits.
    file : str
        Recording path; only the part after the last '/' is shown in the title.

    Returns
    -------
    matplotlib.figure.Figure
        The figure that was created.
    """
    fig = pylab.figure(num=None, figsize=(19, 12))
    # Labels, limits and pcolormesh are Axes operations; a Figure has none of
    # them, so all drawing goes through an explicit Axes.
    ax = fig.add_subplot(111)
    ax.pcolormesh(times, freqs, 10 * pylab.log10(specMod))
    ax.axis('tight')
    ax.set_xlim(times[0], times[-1])
    ax.set_ylim(freqs[0], freqs[-1])
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Frequency (Hz)")
    ax.set_title('Spectrogram of %s' % file.split('/')[-1])
    fig.show()
    return fig

In [10]:
def speciesData(workbook):
    """Build the per-species ROI dataset from the 'ROIs' sheet of a workbook.

    Needed sheet format (columns A to F), one species specimen per row:
    species name, start_time, end_time, min_freq, max_freq, recording name.
    The first row holds the column names; those names become the keys of
    each species' dictionary.

    Parameters
    ----------
    workbook : str
        Path of the Excel file to read.

    Returns
    -------
    dict
        ``{species name: {column name: [one value per row of that species]}}``.
        Recording names get their extension replaced by ``.wav``. An empty
        sheet yields an empty dictionary.
    """
    roi_ws = openpyxl.load_workbook(workbook)['ROIs'] # should change accordingly to where and how your data is stored
    recording_col = 5  # column F: recording name
    dataset = {}
    sheetMatrix = list(roi_ws.iter_rows())
    if not sheetMatrix:
        return dataset
    # first row carries the column names (e.g. start_time, end_time, ...)
    keys = [header.value for header in sheetMatrix[0]]
    for row in sheetMatrix[1:]:
        speciesName = row[0].value
        if speciesName is None:
            # blank rows carry no specimen
            continue
        attributes = dataset.setdefault(speciesName, {})
        for col in range(1, len(row)):
            cell = row[col].value
            # change recording extension since we are dealing with wav files;
            # only the last suffix is dropped so dots inside the name survive
            if col == recording_col and isinstance(cell, str):
                cell = cell.rsplit('.', 1)[0] + '.wav'
            # create the attribute list on first use, then append in place
            attributes.setdefault(keys[col], []).append(cell)
    return dataset

In [11]:
def dataToYAML(data, name): # convert speciesData dictionary to yaml and save file
    """Serialise ``data`` as block-style YAML into the file ``name``.

    Parameters
    ----------
    data : dict
        Dictionary to dump (e.g. the result of ``speciesData``).
    name : str
        Path of the output file. Mode 'w' truncates an existing file, so it
        does not have to be deleted first.
    """
    # The with-block closes the file, which guarantees the YAML is flushed
    # to disk before the function returns.
    with open(name, 'w') as dataset:
        yaml.dump(data, dataset, default_flow_style=False)

In [12]:
def findMax(L):
    """Return the largest item of ``L``; ``float('-inf')`` when ``L`` is empty."""
    largest = float('-inf')
    for candidate in L:
        # keep the running maximum; only a strictly greater item replaces it
        largest = candidate if candidate > largest else largest
    return largest

def findMin(L):
    """Return the smallest item of ``L``; ``float('inf')`` when ``L`` is empty."""
    smallest = float('inf')
    for candidate in L:
        # keep the running minimum; only a strictly smaller item replaces it
        smallest = candidate if candidate < smallest else smallest
    return smallest

In [13]:
#need to decide if exceed bounds of spectrograms or restrict
def simplifiedSpeciesData(data):
    """Condense the per-species ROI lists into one summary per species.

    Parameters
    ----------
    data : dict
        ``{species: {...}}`` where each species dictionary provides the lists
        'min_frequency', 'max_frequency', 'start_time', 'end_time' and
        'recording name'.

    Returns
    -------
    dict
        ``{species: summary}`` where ``summary`` holds:

        - 'min_freq': lowest value in 'min_frequency'
        - 'max_freq': highest value in 'max_frequency'
        - 'avg_time': mean of ``end_time - start_time`` over the species'
          ROIs (0.0 when the species has no ROI)
        - 'delta_time': latest end minus earliest start over all ROIs of the
          species. Kept for backward compatibility; ROIs come from different
          recordings, so 'avg_time' is the meaningful duration.
        - 'recording name': the unchanged list of recording names
    """
    simplDat = {}
    for species, rois in data.items():
        starts = rois['start_time']
        ends = rois['end_time']
        # duration of every individual ROI, paired row by row
        durations = [end - start for start, end in zip(starts, ends)]
        if durations:
            avg_time = sum(durations) / float(len(durations))
        else:
            avg_time = 0.0
        simplDat[species] = {
            'min_freq': findMin(rois['min_frequency']),
            'max_freq': findMax(rois['max_frequency']),
            'delta_time': findMax(ends) - findMin(starts),
            'avg_time': avg_time,
            'recording name': rois['recording name'],
        }
    return simplDat
        

In [14]:
# Excel workbook handed to speciesData, which reads its 'ROIs' sheet.
workbook = '../dataset/validationsAndROIs.xlsx'
# Full dataset: per species, one list of values per spreadsheet column.
data = speciesData(workbook)
dataToYAML(data, 'dataset.yaml')
# Condensed per-species summary derived from the full dataset.
simpleData = simplifiedSpeciesData(data)
dataToYAML(simpleData, 'simplifiedDataset.yaml')