In [73]:
# dependencies
import bisect
import os
import shutil
import tarfile
import wave
import _pickle as cpl # import cPickle

import numpy as np
import openpyxl
import pylab
import yaml

In this notebook, we declare functions that build a dataset from an Excel file of regions of interest (ROIs).  
The dataset will be a dictionary of the form:  
{
species:  
    {  
        min_freqs:[list]  
        max_freqs:[list]  
        start_time:[list]  
        end_time:[list]  
        recording name:[list]  
    }  
    ...  
}  
We also have a function to create a simplified dataset of form:
{  
species:  
    {  
        min_freq:val  
        max_freq:val  
        avg_time:val  
        recording name:[list]  
    }  
    ...  
}  
     
We will save these dictionaries as yaml files

In [74]:
def wavInfo(rec_file):
    """Read a WAV recording and return its raw samples and frame rate.

    Parameters
    ----------
    rec_file : str or file-like
        Anything ``wave.open`` accepts in read mode.

    Returns
    -------
    wave_info : numpy.ndarray
        One-dimensional, writable ``int16`` array with every sample stored in
        the file, in file order.
    framerate : int
        Sampling frequency reported by the WAV header.
    """
    # The context manager closes the file even if reading fails part-way.
    with wave.open(rec_file, 'r') as wav_file:
        frames = wav_file.readframes(-1)
        framerate = wav_file.getframerate()
    # All .wavs in our dataset are 16bit; WAV stores samples little-endian.
    # frombuffer replaces the deprecated binary mode of fromstring; the copy
    # keeps the returned array writable, as it was before.
    wave_info = np.frombuffer(frames, dtype='<i2').copy()
    return wave_info, framerate

In [75]:
# get info. don't graph
def specInfo(rec_file):
    """Compute the spectrogram of a recording without drawing it.

    Parameters
    ----------
    rec_file : str or file-like
        Recording passed straight to ``wavInfo``.

    Returns
    -------
    spectrum : 2-D array
        Spectrogram values, one row per frequency bin and one column per
        time segment.
    freqs : 1-D array
        Frequency of each row of ``spectrum``.
    t : 1-D array
        Time of each column of ``spectrum``.
    """
    wave_info, framerate = wavInfo(rec_file)
    # mlab.specgram is the computation behind pylab.specgram, but it does not
    # render an image onto the current axes. The plotting variant would draw
    # over whichever figure was created last, and leave a throw-away image
    # behind on every call.
    spectrum, freqs, t = pylab.mlab.specgram(
        wave_info,
        NFFT=512,
        noverlap=256,
        window=pylab.window_hanning,
        Fs=framerate,
    )
    return spectrum, freqs, t

In [76]:
# search for the index of the leftmost value in an ordered array
# (of times or frequencies in our case) that still meet our criteria
def leftmostBinSearch(A, lo, hi, target):
    """Return the index of the leftmost element of ``A[lo..hi]`` that is >= ``target``.

    Parameters
    ----------
    A : sequence
        Values sorted in ascending order.
    lo, hi : int
        Inclusive index bounds of the region to search.
    target : number
        Lower limit the selected element must reach.

    Returns
    -------
    int
        Smallest index ``i`` in ``[lo, hi]`` with ``A[i] >= target``. When no
        element in the range reaches ``target`` the result is clamped to
        ``hi`` so that it is always a valid index.

    Raises
    ------
    IndexError
        If the search range is empty (``lo > hi``).
    """
    if lo > hi:
        raise IndexError('leftmostBinSearch: empty search range')
    # bisect_left takes an exclusive upper bound, hence hi + 1.
    idx = bisect.bisect_left(A, target, lo, hi + 1)
    return min(idx, hi)

# search for the index of the rightmost value in an ordered array
# (of times or frequencies in our case) that still meet our criteria
def rightmostBinSearch(A, lo, hi, target):
    """Return the index of the rightmost element of ``A[lo..hi]`` that is <= ``target``.

    Parameters
    ----------
    A : sequence
        Values sorted in ascending order.
    lo, hi : int
        Inclusive index bounds of the region to search.
    target : number
        Upper limit the selected element must not exceed.

    Returns
    -------
    int
        Largest index ``i`` in ``[lo, hi]`` with ``A[i] <= target``. When
        every element in the range exceeds ``target`` the result is clamped
        to ``lo`` so that it is always a valid index.

    Raises
    ------
    IndexError
        If the search range is empty (``lo > hi``).
    """
    if lo > hi:
        raise IndexError('rightmostBinSearch: empty search range')
    # bisect_right yields the insertion point *after* any values equal to
    # target; the element just before it is the last one <= target.
    idx = bisect.bisect_right(A, target, lo, hi + 1) - 1
    return max(idx, lo)

In [77]:
def getBounds(A, minVal, maxVal):
    """Locate the index range of ``A`` bracketed by ``minVal`` and ``maxVal``.

    Both searches run over the whole array (indices ``0`` to ``len(A) - 1``).

    Returns
    -------
    tuple
        ``(left, right)`` where ``left`` comes from ``leftmostBinSearch`` with
        ``minVal`` and ``right`` from ``rightmostBinSearch`` with ``maxVal``.
    """
    last_index = len(A) - 1
    return (
        leftmostBinSearch(A, 0, last_index, minVal),
        rightmostBinSearch(A, 0, last_index, maxVal),
    )

In [78]:
# get modified spectrum of frequencies and times that matter to us
def specMod(spectrum, freqs, times, f1, f2, t1, t2):
    """Cut a region of interest out of a spectrogram.

    Parameters
    ----------
    spectrum : 2-D indexable
        Spectrogram indexed as ``spectrum[frequency_row][time_column]``.
    freqs, times
        Not used by this function; kept for the callers' signature.
    f1, f2 : int
        Row bounds; rows ``f1`` up to but excluding ``f2`` are taken.
    t1, t2 : int
        Column bounds; each row is sliced as ``[t1:t2]``.

    Returns
    -------
    list
        One sliced row per selected frequency bin. Row ``f1`` is always
        present, even when ``f2 <= f1``.
    """
    # check when fix right limit
    selected_rows = [f1] + list(range(f1 + 1, f2))
    return [spectrum[row][t1:t2] for row in selected_rows]

In [79]:
# based off: http://stackoverflow.com/questions/15961979/how-do-i-plot-a-spectrogram-the-same-way-that-pylabs-specgram-does
# plot the spectrogram of our region of interest
def plotModSpecSimple(specMod, freqs, times, file):
    """Render a region-of-interest spectrogram as a borderless image.

    Parameters
    ----------
    specMod : 2-D array-like
        Spectrogram values to draw; they are converted to decibels
        (``10 * log10``) before plotting.
    freqs, times : 1-D array-like
        Axis coordinates handed to ``pcolormesh`` (y and x respectively).
    file : str or file-like
        Destination passed to ``pylab.savefig``.

    Returns
    -------
    The figure that was created and saved.
    """
    figure, axes = pylab.subplots(1)
    power_db = 10 * pylab.log10(specMod)
    pylab.pcolormesh(times, freqs, power_db)
    # Let the mesh fill the whole canvas: no margins, no axis decorations.
    figure.subplots_adjust(left=0, right=1, bottom=0, top=1)
    for axis_mode in ('tight', 'off'):
        axes.axis(axis_mode)
    pylab.savefig(file)
    return figure

In [80]:
def speciesData(workbook):
    """Build the per-species ROI dictionary from the 'ROIs' sheet of a workbook.

    Needed sheet format: one species specimen per row, columns A to F holding
    species name, start_time, end_time, min_freq, max_freq, recording name.
    The first row supplies the column names, which become the per-species keys.

    Parameters
    ----------
    workbook : str
        Path of the Excel workbook to read.

    Returns
    -------
    dict
        ``{species_name: {column_name: [value, ...], ...}, ...}`` with one
        list entry per row of that species. An empty sheet yields ``{}``.
    """
    roi_ws = openpyxl.load_workbook(workbook)['ROIs'] # should change accordingly to where and how you data is stored
    dataset = {}
    rows = roi_ws.iter_rows()
    # The first row holds the column names (e.g. start_time, end_time, ...).
    header = next(rows, None)
    if header is None:
        return dataset
    keys = [cell.value for cell in header]
    for row in rows:
        values = [cell.value for cell in row]
        # Spreadsheets often carry blank rows after the data; skip them.
        if all(value is None for value in values):
            continue
        attributes = dataset.setdefault(values[0], {})
        for col in range(1, len(values)):
            cell = values[col]
            # Change the recording extension since we are dealing with wav
            # files. splitext removes only the final extension, so names that
            # contain other dots are left intact.
            if col == 5 and isinstance(cell, str):
                cell = os.path.splitext(cell)[0] + '.wav'
            # Create the attribute list on first use, then append in place.
            attributes.setdefault(keys[col], []).append(cell)
    return dataset

In [81]:
def dataToYAML(data, name): # convert speciesData dictionary to yaml and save file
    """Dump ``data`` as block-style YAML to ``'../dataset/' + name``.

    Parameters
    ----------
    data : object
        Structure handed to ``yaml.dump`` (the species dictionary here).
    name : str
        File name, appended to the ``../dataset/`` directory.
    """
    path = '../dataset/' + name
    # Mode 'w+' truncates an existing file, so no explicit delete is needed;
    # the with-block closes the file even if dumping fails.
    with open(path, 'w+') as dataset:
        yaml.dump(data, dataset, default_flow_style=False)

In [82]:
def findMax(L):
    """Return the largest element of ``L``, or ``-inf`` when ``L`` is empty.

    Seeding the comparison with negative infinity means every element is
    tested with ``element > best``, so elements that never compare greater
    are skipped.
    """
    return max([float('-inf'), *L])

def findMin(L):
    """Return the smallest element of ``L``, or ``inf`` when ``L`` is empty.

    Seeding the comparison with positive infinity means every element is
    tested with ``element < best``, so elements that never compare smaller
    are skipped.
    """
    return min([float('inf'), *L])

In [83]:
# need to decide if exceed bounds of spectrograms or restrict
def simplifiedSpeciesData(data):
    """Collapse the per-ROI species dictionary into one summary per species.

    Parameters
    ----------
    data : dict
        ``{species: {'min_frequency': [...], 'max_frequency': [...],
        'start_time': [...], 'end_time': [...], 'recording name': [...]}}``.

    Returns
    -------
    dict
        ``{species: {...}}`` where each summary holds:

        * ``'min_freq'`` - lowest value in ``min_frequency``
        * ``'max_freq'`` - highest value in ``max_frequency``
        * ``'avg_time'`` - mean of ``end_time - start_time`` over the ROIs
          (``0.0`` when the species has no ROI)
        * ``'delta_time'`` - latest end minus earliest start over all ROIs;
          kept for backward compatibility
        * ``'recording name'`` - the original list of recordings
    """
    simplDat = {}
    for species, attributes in data.items():
        starts = attributes['start_time']
        ends = attributes['end_time']
        # Defaults mirror the infinities used when a list is empty.
        min_f = min(attributes['min_frequency'], default=float('inf'))
        max_f = max(attributes['max_frequency'], default=float('-inf'))
        start = min(starts, default=float('inf'))
        end = max(ends, default=float('-inf'))
        # ROIs of one species come from different recordings, so the mean
        # per-ROI duration is the meaningful time summary.
        durations = [t_end - t_start for t_start, t_end in zip(starts, ends)]
        avg_time = sum(durations) / len(durations) if durations else 0.0
        simplDat[species] = {
            'min_freq': min_f,
            'max_freq': max_f,
            'avg_time': avg_time,
            'delta_time': (end - start),
            'recording name': attributes['recording name'],
        }
    return simplDat
        

In [84]:
# save our species data dictionary as a .yaml file for later use
# (reads the 'ROIs' sheet of the workbook and writes ../dataset/dataset.yaml)
workbook = '../dataset/validationsAndROIs.xlsx'
data = speciesData(workbook)
dataToYAML(data, 'dataset.yaml')
# Export of the simplified dataset is currently disabled:
#simpleData = simplifiedSpeciesData(data)
#dataToYAML(simpleData, 'simplifiedDataset.yaml')

Now we'll use the functions above to walk through the generated dictionary and create a  
raw, unprocessed dataset of ROI spectrograms, saved in one directory per species.  
The dataset is returned as a list of dictionaries, each mapping a species to its spectrogram figures.  

In [85]:
def getRawSpecDataset(dataset, path='../dataset', recordings_dir='../dataset/recordings'):
    """Render every ROI of every species as a spectrogram image.

    A fresh ``<path>/spectrogram_roi_dataset`` directory is created (any
    existing one is deleted first) with one sub-directory per species. Each
    ROI is saved there as ``<species>_spec_<n>.png``, numbered from 1.

    Parameters
    ----------
    dataset : dict
        ``{species: {'min_frequency': [...], 'max_frequency': [...],
        'start_time': [...], 'end_time': [...], 'recording name': [...]}}``.
    path : str
        Directory in which ``spectrogram_roi_dataset`` is (re)created.
    recordings_dir : str
        Directory that contains the recording files named in ``dataset``.

    Returns
    -------
    list
        One ``{species: [figure, ...]}`` dictionary per species, in the
        iteration order of ``dataset``.
    """
    # make directory to store our spec dataset, replacing any previous run
    dataset_path = path + '/spectrogram_roi_dataset'
    if os.path.exists(dataset_path):
        shutil.rmtree(dataset_path)
    os.makedirs(dataset_path)

    # image data to be pickled
    specs = []

    for s in dataset:
        s_dir = dataset_path + '/' + s
        os.makedirs(s_dir) # make a directory per species
        s_spec = []

        # load species ROI data
        min_freqs = dataset[s]['min_frequency']
        max_freqs = dataset[s]['max_frequency']
        starts = dataset[s]['start_time']
        ends = dataset[s]['end_time']
        recs = dataset[s]['recording name']

        for i in range(len(recs)):
            rec = recordings_dir + '/' + recs[i] # path to ith recording file where s is present
            spectrum, freqs, times = specInfo(rec) # get entire spectrogram data from rec

            # find closest times and freqs that match ROI info
            t_start, t_end = getBounds(times, starts[i], ends[i])
            f_start, f_end = getBounds(freqs, min_freqs[i], max_freqs[i])

            # get modified spectrum, freqs, and times
            # (the upper bounds are used as exclusive slice ends, so the bin
            # at f_end / t_end itself is left out)
            spectrumMod = specMod(spectrum, freqs, times, f_start, f_end, t_start, t_end)
            freqMod = freqs[f_start:f_end]
            timeMod = times[t_start:t_end]
            filename = s_dir + '/' + s + '_spec_' + str(i+1) + '.png'

            # plot the spectrogram of ROI and save the image
            f = plotModSpecSimple(spectrumMod, freqMod, timeMod, filename)
            s_spec.append(f) # append image to list of ROI spectrograms per species
            # Actually call close (the parentheses were missing) so pylab
            # stops tracking the figure; the object kept in s_spec is still
            # returned to the caller.
            pylab.close(f)

        # add dictionary with key <species_name> and value <list_of_spectrogram_figures>
        specs.append({s: s_spec})

    return specs

In [86]:
# Reload the species dictionary saved earlier and render its ROI spectrograms.
# safe_load only builds plain Python objects and, unlike a bare yaml.load(),
# works without an explicit Loader argument.
with open('../dataset/dataset.yaml', 'r') as yamlData:
    dataset = yaml.safe_load(yamlData)
data = getRawSpecDataset(dataset)



Finally, we'll package our dataset in a more manageable format by:  
* archiving and compressing our dataset directory into a .tar.bz2 file
* serializing our dataset object into a folder of pickles (I was receiving errors when  
  I tried to serialize the whole object; maybe it was too big?)

In [87]:
def serializeDataset(obj, path='../dataset'):
    """Pickle each species' data into ``<path>/pickle_data/<species>.pickle``.

    Parameters
    ----------
    obj : iterable of dict
        Dictionaries whose first key is the species name and whose value
        under that key is the object to pickle.
    path : str
        Directory in which ``pickle_data`` is created; an existing
        ``pickle_data`` directory is deleted and recreated.
    """
    pickle_path = path + '/pickle_data'
    # start from an empty pickle directory: wipe a previous one if present
    if os.path.exists(pickle_path):
        shutil.rmtree(pickle_path)
    os.makedirs(pickle_path)
    for entry in obj:
        species = list(entry.keys())[0]
        payload = entry[species]
        target = pickle_path + '/' + species + '.pickle'
        with open(target, 'wb+') as handle:
            cpl.dump(payload, handle)

In [88]:
# write one pickle per species into ../dataset/pickle_data (the default path)
serializeDataset(data)

In [89]:
def archiveAndCompress(path):
    """Pack the contents of a directory into ``<path>.tar.bz2`` next to it.

    The directory's entries are stored at the top level of the archive
    (sub-directories are added with their contents), in sorted order.

    Parameters
    ----------
    path : str
        Directory to archive. A trailing separator and a bare relative name
        (no directory part) are both handled.
    """
    # normpath drops a trailing separator, so the archive always lands beside
    # the directory, never inside it and never at the filesystem root.
    folder = os.path.normpath(path)
    archive_name = folder + '.tar.bz2'
    with tarfile.open(archive_name, 'w:bz2') as archive:
        for entry in sorted(os.listdir(folder)):
            archive.add(os.path.join(folder, entry), arcname=entry)

In [90]:
# bundle the spectrogram images and the pickles into .tar.bz2 archives
archiveAndCompress('../dataset/spectrogram_roi_dataset')
archiveAndCompress('../dataset/pickle_data')

Running these functions should result in:
* .yaml file that contains the structure of our dataset
* a spectrogram_roi_dataset directory that contains directories of species and their ROI spectrograms
* a spectrogram_roi_dataset.tar.bz2
* a pickle_data directory that contains a pickle of the ROI spectrograms of each species
* a pickle_data.tar.bz2  