Permalink
Switch branches/tags
Nothing to show
Find file Copy path
0a735a6 Oct 10, 2015
1 contributor

Users who have contributed to this file

100 lines (80 sloc) 3.76 KB
import numpy as np
from matplotlib import pyplot as plt
import scipy.io.wavfile as wav
from numpy.lib import stride_tricks
import PIL.Image as Image
import os
""" short time fourier transform of audio signal """
def stft(sig, frameSize, overlapFac=0.5, window=np.hanning):
win = window(frameSize)
hopSize = int(frameSize - np.floor(overlapFac * frameSize))
# zeros at beginning (thus center of 1st window should be for sample nr. 0)
samples = np.append(np.zeros(np.floor(frameSize/2.0)), sig)
# cols for windowing
cols = np.ceil( (len(samples) - frameSize) / float(hopSize)) + 1
# zeros at end (thus samples can be fully covered by frames)
samples = np.append(samples, np.zeros(frameSize))
frames = stride_tricks.as_strided(samples, shape=(cols, frameSize), strides=(samples.strides[0]*hopSize, samples.strides[0])).copy()
frames *= win
return np.fft.rfft(frames)
""" scale frequency axis logarithmically """
def logscale_spec(spec, sr=44100, factor=20., alpha=1.0, f0=0.9, fmax=1):
spec = spec[:, 0:256]
timebins, freqbins = np.shape(spec)
scale = np.linspace(0, 1, freqbins) #** factor
# http://ieeexplore.ieee.org/xpl/login.jsp?tp=&arnumber=650310&url=http%3A%2F%2Fieeexplore.ieee.org%2Fiel4%2F89%2F14168%2F00650310
scale = np.array(map(lambda x: x * alpha if x <= f0 else (fmax-alpha*f0)/(fmax-f0)*(x-f0)+alpha*f0, scale))
scale *= (freqbins-1)/max(scale)
newspec = np.complex128(np.zeros([timebins, freqbins]))
allfreqs = np.abs(np.fft.fftfreq(freqbins*2, 1./sr)[:freqbins+1])
freqs = [0.0 for i in range(freqbins)]
totw = [0.0 for i in range(freqbins)]
for i in range(0, freqbins):
if (i < 1 or i + 1 >= freqbins):
newspec[:, i] += spec[:, i]
freqs[i] += allfreqs[i]
totw[i] += 1.0
continue
else:
# scale[15] = 17.2
w_up = scale[i] - np.floor(scale[i])
w_down = 1 - w_up
j = int(np.floor(scale[i]))
newspec[:, j] += w_down * spec[:, i]
freqs[j] += w_down * allfreqs[i]
totw[j] += w_down
newspec[:, j + 1] += w_up * spec[:, i]
freqs[j + 1] += w_up * allfreqs[i]
totw[j + 1] += w_up
for i in range(len(freqs)):
if (totw[i] > 1e-6):
freqs[i] /= totw[i]
return newspec, freqs
""" plot spectrogram"""
def plotstft(audiopath, binsize=2**10, plotpath=None, colormap="gray", channel=0, name='tmp.png', alpha=1, offset=0):
samplerate, samples = wav.read(audiopath)
samples = samples[:, channel]
s = stft(samples, binsize)
sshow, freq = logscale_spec(s, factor=1, sr=samplerate, alpha=alpha)
sshow = sshow[2:, :]
ims = 20.*np.log10(np.abs(sshow)/10e-6) # amplitude to decibel
timebins, freqbins = np.shape(ims)
ims = np.transpose(ims)
ims = ims[0:256, offset:offset+768] # 0-11khz, ~9s interval
#print "ims.shape", ims.shape
image = Image.fromarray(ims)
image = image.convert('L')
image.save(name)
file = open('trainingData.csv', 'r')
for iter, line in enumerate(file.readlines()[1:]): # first line of traininData.csv is header (only for trainingData.csv)
filepath = line.split(',')[0]
filename = filepath[:-4]
wavfile = 'tmp.wav'
os.system('mpg123 -w ' + wavfile + ' /home/brainstorm/caffe/Data/mnt/3/language/train/mp3/' + filepath)
for augmentIdx in range(0, 20):
alpha = np.random.uniform(0.9, 1.1)
offset = np.random.randint(90)
plotstft(wavfile, channel=0, name='/home/brainstorm/data/language/train/pngaugm/'+filename+'.'+str(augmentIdx)+'.png',
alpha=alpha, offset=offset)
os.remove(wavfile)
print "processed %d files" % (iter + 1)