Loading libraries

In [4]:
import numpy as np
import scipy


def spsi(msgram, fftsize, hop_length) :
    """
    Estimate phases for a magnitude spectrogram and resynthesise audio by overlap-add.

    For every frame, each local magnitude peak gets a phase accumulator that is
    advanced by ``hop_length`` times the peak's parabolically interpolated
    frequency.  The bins around the peak are phase-locked to it (offset 0 or pi,
    depending on which side of the bin centre the interpolated peak lies).
    The frame is then inverse-FFT'd, windowed and added into the output.

    Parameters
    ----------
    msgram : 2D array, shape [freqs, frames]
        Magnitude spectrogram.  ``freqs`` is expected to be ``fftsize/2 + 1``;
        any other size makes the windowing step fail with a numpy broadcasting
        error, because the rebuilt spectrum has ``2*(freqs-1)`` points.
    fftsize : int
        FFT length (= window length) in samples.
    hop_length : int
        Hop size between consecutive frames, in samples.

    Returns
    -------
    1D float array of length ``frames*hop_length + fftsize - hop_length``.
    """
    numBins, numFrames = msgram.shape
    y_out = np.zeros(numFrames*hop_length + fftsize - hop_length)

    # Phase accumulator; deliberately kept across frames so peaks stay coherent.
    m_phase = np.zeros(numBins)

    # Symmetric Hann window (assumes hann was used to create the frames of the
    # spectrogram).  np.hanning yields the same symmetric window that
    # scipy.signal.hanning(fftsize, sym=True) produced; that SciPy alias is not
    # available in current SciPy releases, and a bare `import scipy` does not
    # reliably expose scipy.signal.
    m_win = np.hanning(fftsize)

    # process one frame of audio at a time
    for i in range(numFrames):
        m_mag = msgram[:, i]

        for j in range(1, numBins - 1):
            # only strict local maxima (peaks) drive the phase estimate
            if not (m_mag[j] > m_mag[j-1] and m_mag[j] > m_mag[j+1]):
                continue

            # parabolic interpolation of the true peak position around bin j
            alpha = m_mag[j-1]
            beta = m_mag[j]
            gamma = m_mag[j+1]
            denom = alpha - 2*beta + gamma
            p = 0.5*(alpha - gamma)/denom if denom != 0 else 0

            phaseRate = 2*np.pi*(j + p)/fftsize          # adjusted phase rate
            m_phase[j] = m_phase[j] + hop_length*phaseRate  # phase accumulator for this peak bin
            peakPhase = m_phase[j]

            if p > 0:
                # actual peak is to the right of the bin frequency:
                # first bin to the right has a pi shift
                m_phase[j+1] = peakPhase + np.pi

                # bins to the left have a pi shift, until the trough
                k = j - 1
                while k > 1 and m_mag[k] < m_mag[k+1]:
                    m_phase[k] = peakPhase + np.pi
                    k -= 1

                # bins to the right (beyond the first) have 0 shift
                k = j + 2
                while k < numBins and m_mag[k] < m_mag[k-1]:
                    m_phase[k] = peakPhase
                    k += 1

            elif p < 0:
                # actual peak is to the left of the bin frequency:
                # first bin to the left has a pi shift
                m_phase[j-1] = peakPhase + np.pi

                # bins to the right have a pi shift, until the trough
                k = j + 1
                while k < numBins and m_mag[k] < m_mag[k-1]:
                    m_phase[k] = peakPhase + np.pi
                    k += 1

                # bins further to the left have 0 shift, until the trough
                k = j - 2
                while k > 1 and m_mag[k] < m_mag[k+1]:
                    m_phase[k] = peakPhase
                    k -= 1

        # reconstruct with the new phase (elementwise mult)
        magphase = m_mag*np.exp(1j*m_phase)
        magphase[0] = 0
        magphase[numBins-1] = 0  # remove dc and nyquist

        # append the conjugate-mirrored interior bins to form the full spectrum
        mirrored = np.flip(np.conjugate(magphase[1:numBins-1]), 0)
        m_recon = np.concatenate([magphase, mirrored])

        # overlap and add
        m_recon = np.real(np.fft.ifft(m_recon))*m_win
        y_out[i*hop_length:i*hop_length + fftsize] += m_recon

    return y_out


def magspect2audio(msgram, fftsize, hop_length)  :
    """
    Turn a linear-magnitude spectrogram into audio.

    Thin alias that forwards its three arguments unchanged to ``spsi`` and
    returns whatever ``spsi`` returns.
    """
    audio = spsi(msgram, fftsize, hop_length)
    return audio

def logspect2audio(lsgram, fftsize, hop_length) :
    """
    Turn a log-magnitude spectrogram into audio.

    The input is mapped to linear magnitude as ``10 ** (lsgram / 20)`` and the
    result is passed to ``spsi`` together with ``fftsize`` and ``hop_length``.
    """
    exponent = lsgram/20
    linear_mag = np.power(10, exponent)
    return spsi(linear_mag, fftsize, hop_length)







In [6]:
import numpy as np
import librosa
import librosa.display
import argparse
import os
from PIL import Image
from PIL import PngImagePlugin
import json

from spsi import spsi

FLAGS = None
# ------------------------------------------------------
# get any args provided on the command line
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('filename', type=str, help='Name of log mag spectrogram. Include extension')
parser.add_argument('--outdir', type=str, help='Output directory', default='./output')
# The scale bounds are converted with float() when the PNG is expanded, so they
# are parsed as floats here: integer input is still accepted, and fractional
# bounds (e.g. 5.3) are no longer rejected by the parser.
parser.add_argument('--scalemax', type=float, help='Value to use as the max when scaling from png [0,255] to original [min,max]', default=None)
parser.add_argument('--scalemin', type=float, help='Value to use as the min when scaling from png [0,255] to original [min,max]', default=None)
parser.add_argument('--sr', type=int, help='Samplerate', default=22050)
parser.add_argument('--hopsize', type=int, help='Size of frame hop through sample file', default=256)
parser.add_argument('--glsteps', type=int, help='Number of Griffin&Lim iterations following SPSI', default=50)
parser.add_argument('--wavfile', type=str, help='Optional name for output audio file. Unspecified means use the png filename', default=None)

# parse_known_args tolerates unrecognised arguments instead of exiting;
# they are collected in `unparsed`.
FLAGS, unparsed = parser.parse_known_args()
print('\n FLAGS parsed :  {0}'.format(FLAGS))


def inv_log(img):
    """Return ``exp(img) - 1`` elementwise, i.e. the inverse of ``log(1 + x)``."""
    expanded = np.exp(img)
    return expanded - 1.


def PNG2LogSpect(fname,scalemin,scalemax):
    """
    Read a png spectrogram, expand it to its original scale and return a numpy array.

    The scaling range is taken from the JSON stored under the 'meta' key of the
    png text metadata (keys 'scaleMin' and 'scaleMax').  If that metadata cannot
    be read, the user-supplied ``scalemin`` / ``scalemax`` are used instead.

    Parameters
    ----------
    fname : path of the png file to open.
    scalemin, scalemax : values that pixel levels 0 and 255 map back to; only
        used (and then required) when the png carries no usable metadata.

    Returns
    -------
    (array, lwinfo) : the vertically flipped float32 image rescaled from
        [0,255] to [scaleMin,scaleMax], and the dict holding the scale values.

    Raises
    ------
    ValueError : metadata is unavailable and scalemin/scalemax were not given.
    """
    # The context manager releases the file handle once the pixels are loaded.
    with Image.open(fname) as img:
        try:
            # Only read the metadata.  The earlier self-assignment of img.text
            # fails where `text` is a read-only property (as in recent Pillow,
            # to my knowledge), and the bare except then silently ignored valid
            # metadata.
            lwinfo = json.loads(img.text['meta'])
        except (AttributeError, KeyError, TypeError, ValueError):
            # no text chunk / no 'meta' entry / entry is not valid JSON
            print('PNG2LogSpect: no img.text, using user specified values!')
            if scalemin is None or scalemax is None:
                raise ValueError(
                    'PNG2LogSpect: no scale metadata in ' + str(fname) +
                    '; scalemin and scalemax must both be specified') from None
            lwinfo = {}
            lwinfo['scaleMin'] = scalemin #require to pass in
            lwinfo['scaleMax'] = scalemax

        minx, maxx = float(lwinfo['scaleMin']), float(lwinfo['scaleMax'])

        # convert() loads the pixel data, so it must happen before the file closes
        gray = img.convert('L')

    outimg = np.asarray(gray, dtype=np.float32)
    # map pixel levels [0,255] linearly back onto [minx,maxx]
    outimg = outimg/255*(maxx-minx) + minx

    return np.flipud(outimg), lwinfo


# scipy's wav writer replaces librosa.output.write_wav, which is no longer
# shipped by librosa (the `output` module was removed in later releases).
from scipy.io import wavfile

# Load the png as a log spectrogram in its original scale.
D, _ = PNG2LogSpect(FLAGS.filename, FLAGS.scalemin, FLAGS.scalemax)
Dsize, _ = D.shape
fftsize = 2*(Dsize-1) #infer fftsize from no. of fft bins i.e. height of image

magD = inv_log(D)
y_out = spsi(magD, fftsize=fftsize, hop_length=FLAGS.hopsize)

if FLAGS.glsteps != 0: #use spsi result for initial phase
    # n_fft / hop_length are passed by keyword: that works on every librosa
    # version, whereas newer releases reject them as positional arguments.
    x = librosa.stft(y_out, n_fft=fftsize, hop_length=FLAGS.hopsize, center=False)
    p = np.angle(x)
    for i in range(FLAGS.glsteps):
        # Griffin-Lim step: keep the target magnitudes, re-estimate the phase
        # from the resynthesised signal (assumes hann window).
        S = magD * np.exp(1j*p)
        y_out = librosa.istft(S, hop_length=FLAGS.hopsize, center=True)
        p = np.angle(librosa.stft(y_out, n_fft=fftsize, hop_length=FLAGS.hopsize, center=True))

scalefactor = np.amax(np.abs(y_out))
print('scaling peak sample, ' + str(scalefactor) + ' to 1')
# NOTE: the peak normalisation itself is currently disabled; only the peak is reported.
#y_out/=scalefactor

# Default output name: the png filename with its extension replaced by .wav
if FLAGS.wavfile is None:
    wav_path = os.path.splitext(FLAGS.filename)[0]+'.wav'
else:
    wav_path = FLAGS.wavfile
wavfile.write(wav_path, FLAGS.sr, y_out)




ModuleNotFoundError: No module named 'spsi'