In [None]:
%matplotlib inline
import numpy , scipy, matplotlib.pyplot as plt, IPython.display as ipd
import librosa, librosa.display

import warnings
# Suppress every warning for the rest of the session so cell output stays uncluttered.
# NOTE(review): this also hides deprecation warnings from librosa/pandas/keras — confirm that is intended.
warnings.simplefilter('ignore')

# Short-Time Fourier Transform

Musical signals are highly non-stationary, i.e., their statistics change over time. It would be rather meaningless to compute a single Fourier transform over an entire 10-minute song.

The short-time Fourier transform (STFT) (Wikipedia; FMP, p. 53) is obtained by computing the Fourier transform for successive frames in a signal.

$$X(m, \omega) = \sum_n x(n)\, w(n - m)\, e^{-j \omega n}$$
 
As we increase $m$, we slide the window function $w$ to the right. For the resulting frame, $x(n)\,w(n-m)$, we compute the Fourier transform. Therefore, the STFT $X$ is a function of both time, $m$, and frequency, $\omega$.

Let's load a file:

In [None]:
# Decode the track into a mono waveform `x` and its sampling rate `sr`, then embed a player.
track_path = 'mp3/Adele - Rolling in the Deep-rYEDA3JcQqw.mp3'
x, sr = librosa.load(track_path)
ipd.Audio(data=x, rate=sr)

librosa.stft computes an STFT. We provide it a frame size, i.e. the size of the FFT, and a hop length, i.e. the frame increment:

In [None]:
# STFT analysis parameters: FFT (frame) size and the step between successive frames, in samples.
n_fft = 4096
hop_length = 1024

# Complex STFT matrix of the loaded signal.
X = librosa.stft(y=x, n_fft=n_fft, hop_length=hop_length)

To convert the hop length and frame size to units of seconds:



In [None]:
hop_length / float(sr)  # hop length expressed in seconds

In [None]:
n_fft / float(sr)  # frame size expressed in seconds

For real-valued signals, the Fourier transform is conjugate-symmetric about the midpoint, so the upper half of the spectrum is redundant. Therefore, librosa.stft only retains the first 1 + n_fft/2 frequency bins:

In [None]:
# Dimensions of the STFT matrix produced by librosa.stft above.
X.shape

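With n_fft = 4096, this STFT has 1 + 4096/2 = 2049 frequency bins; the number of time frames is 1 + len(x) // hop_length, so it depends on the length of the track.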

In [None]:
import pandas as pd

# Wrap the complex STFT matrix in a DataFrame so it can be padded and sliced by label below.
df = pd.DataFrame(data=X)
df.head(5)

In [None]:
# Zero-pad the STFT table with two extra columns and three extra rows.
# The column labels are hard-coded strings; presumably they continue a 4858-frame STFT — TODO confirm.
df['4858'] = 0j
df['4859'] = 0j

# DataFrame.append was removed in pandas 2.0; concatenating an all-zero frame
# adds the same three rows on every pandas version.
zero_rows = pd.DataFrame(0j, index=range(3), columns=df.columns)
df = pd.concat([df, zero_rows], ignore_index=True)

# Kept from the original cell: guarantees no missing values remain after padding.
df = df.fillna(0)

In [None]:
# Inspect the last rows of the table, i.e. the zero rows added by the padding cell.
df.tail()

In [None]:
# Keep only the integer-labelled columns 0 .. (column count - 9). This drops the last
# eight columns, including the two string-labelled padding columns added earlier.
kept_labels = list(range(df.shape[1] - 8))
df = df[kept_labels]
df.shape

In [None]:
# Real part of every STFT coefficient, with the same index and columns as `df`.
# Series.real is not available in recent pandas releases, so take the real part
# of the underlying array instead of applying `.real` column by column.
real = pd.DataFrame(numpy.real(df.values), index=df.index, columns=df.columns)
real.head()

In [None]:
# Imaginary part of every STFT coefficient, with the same index and columns as `df`.
# Series.imag is not available in recent pandas releases, so take the imaginary part
# of the underlying array instead of applying `.imag` column by column.
imag = pd.DataFrame(numpy.imag(df.values), index=df.index, columns=df.columns)
imag.head()

# Spectrogram

In music processing, we often only care about the spectral magnitude and not the phase content.

The spectrogram (Wikipedia; FMP, p. 29, 55) shows the intensity of frequencies over time. A spectrogram is simply the squared magnitude of the STFT:

$$S(m, \omega) = \left| X(m, \omega) \right|^2$$
 
The human perception of sound intensity is logarithmic in nature. Therefore, we are often interested in the log amplitude:

In [None]:
# Magnitude of the complex STFT, expressed in decibels.
stft_magnitude = numpy.abs(X)
S = librosa.amplitude_to_db(stft_magnitude)

To display any type of spectrogram in librosa, use librosa.display.specshow.

In [None]:
# dB spectrogram on a linear frequency axis, with a colour bar labelled in dB.
fig, ax = plt.subplots(figsize=(15, 5))
img = librosa.display.specshow(S, sr=sr, hop_length=hop_length,
                               x_axis='time', y_axis='linear', ax=ax)
fig.colorbar(img, ax=ax, format='%+2.0f dB');

# Mel-spectrogram
librosa has some outstanding spectral representations, including librosa.feature.melspectrogram:

In [None]:
# A finer hop than the STFT above; note that this rebinds the global `hop_length`.
hop_length = 256

# Mel-scaled power spectrogram. The audio is passed as `y=` because the signal
# argument is keyword-only in librosa >= 0.10 (the keyword form works on older releases too).
S = librosa.feature.melspectrogram(y=x, sr=sr, n_fft=4096, hop_length=hop_length)

In [None]:
# `S` is a real-valued mel power spectrogram, not a complex STFT, so istft cannot invert it.
# mel_to_audio maps the mel bands back to linear-frequency magnitudes and estimates the
# missing phase with Griffin-Lim; the result is an approximation of the original audio.
mel_audio = librosa.feature.inverse.mel_to_audio(S, sr=sr, n_fft=4096, hop_length=hop_length)
ipd.Audio(mel_audio, rate=sr)

The human perception of sound intensity is logarithmic in nature. Therefore, like the STFT-based spectrogram, we are often interested in the log amplitude:

In [None]:
# Mel power spectrogram expressed in decibels.
mel_power = numpy.abs(S)
logS = librosa.power_to_db(mel_power)

In [None]:
# `logS` holds decibel values of a mel spectrogram, so it must be converted back to
# power and then inverted from the mel domain; istft only applies to a complex STFT.
# Phase is estimated by Griffin-Lim inside mel_to_audio, so this is an approximation.
mel_power_from_db = librosa.db_to_power(logS)
log_mel_audio = librosa.feature.inverse.mel_to_audio(mel_power_from_db, sr=sr,
                                                     n_fft=4096, hop_length=hop_length)
ipd.Audio(log_mel_audio, rate=sr)

To display any type of spectrogram in librosa, use librosa.display.specshow.



In [None]:
# Log-mel spectrogram with the frequency axis drawn on the mel scale.
fig, ax = plt.subplots(figsize=(15, 5))
img = librosa.display.specshow(logS, sr=sr, hop_length=hop_length,
                               x_axis='time', y_axis='mel', ax=ax)
fig.colorbar(img, ax=ax, format='%+2.0f dB');

Using y_axis='mel' plots the y-axis on the mel scale, which is similar to the $\log(1+f)$ function:

$$m = 2595 \log_{10}\left(1 + \frac{f}{700}\right)$$

# librosa.cqt
Unlike the Fourier transform, but similar to the mel scale, the constant-Q transform uses a logarithmically spaced frequency axis.

To plot a constant-Q spectrogram, we will use librosa.cqt:

In [None]:
# Lowest analysed frequency: the pitch of MIDI note 36, converted to Hz.
fmin = librosa.midi_to_hz(36)

# Constant-Q transform with 72 bins starting at `fmin`, then its magnitude in dB.
C = librosa.cqt(y=x, sr=sr, fmin=fmin, n_bins=72)
cqt_magnitude = numpy.abs(C)
logC = librosa.amplitude_to_db(cqt_magnitude)

In [None]:
# Constant-Q spectrogram with the frequency axis labelled by note name.
fig, ax = plt.subplots(figsize=(15, 5))
img = librosa.display.specshow(logC, sr=sr, x_axis='time', y_axis='cqt_note',
                               fmin=fmin, cmap='coolwarm', ax=ax)
fig.colorbar(img, ax=ax, format='%+2.0f dB');

# Istft

In [None]:
# `df` carries three zero rows appended after the real frequency bins. Left in place they
# make istft infer a larger FFT size (and therefore a different default hop) than the one
# used for analysis, so keep only the original 1 + n_fft // 2 bins.
# The analysis hop was 1024 = n_fft // 4; the global `hop_length` has since been rebound
# to 256 for the mel spectrogram, so the hop is derived from `n_fft` here.
stft_bins = df.values[:1 + n_fft // 2]
mp3 = librosa.core.istft(stft_bins, hop_length=n_fft // 4)

In [None]:
# Display the waveform array returned by the inverse STFT.
mp3

In [None]:
# Embed a player for the waveform recovered by the inverse STFT.
ipd.Audio(data=mp3, rate=sr)

# Keras

In [None]:
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, UpSampling2D
from keras.models import Model
from keras import backend as K

# Convolutional autoencoder over one single-channel "image" shaped like the padded STFT table.
# Layer outputs are held in `h` rather than `x`, so the audio signal loaded into `x`
# at the top of the notebook is not overwritten when this cell runs.
input_img = Input(shape=(df.shape[0], df.shape[1], 1))

# Encoder: four conv + 2x2 max-pool stages, each halving both axes (rounding up).
h = Conv2D(32, (3, 3), activation='relu', padding='same')(input_img)
h = MaxPooling2D((2, 2), padding='same')(h)
h = Conv2D(32, (3, 3), activation='relu', padding='same')(h)
h = MaxPooling2D((2, 2), padding='same')(h)
h = Conv2D(16, (3, 3), activation='relu', padding='same')(h)
h = MaxPooling2D((2, 2), padding='same')(h)
h = Conv2D(8, (3, 3), activation='relu', padding='same')(h)
encoded = MaxPooling2D((2, 2), padding='same')(h)

# Decoder: four 2x upsamplings. The original cell built one more Conv2D(8) + UpSampling2D
# pair first, but the next layer was attached to `encoded` again, so that pair never
# reached the output; it is omitted here and the trained graph is unchanged.
# The three 'valid' convolutions trim 2, 1 and 2 samples per axis at different scales,
# so the output size equals the input size only when each input dimension is
# congruent to 4 modulo 16 (which is what the padding/cropping cells arrange).
h = Conv2D(16, (3, 3), activation='relu', padding='same')(encoded)
h = UpSampling2D((2, 2))(h)
h = Conv2D(32, (2, 2), activation='relu', padding='same')(h)
h = UpSampling2D((2, 2))(h)
h = Conv2D(64, (3, 3), activation='relu', padding='valid')(h)
h = UpSampling2D((2, 2))(h)
h = Conv2D(64, (2, 2), activation='relu', padding='valid')(h)
h = UpSampling2D((2, 2))(h)
# Linear output: the regression targets can be negative.
decoded = Conv2D(1, (3, 3), activation=None, padding='valid')(h)

autoencoder = Model(input_img, decoded)
autoencoder.compile(optimizer='adadelta', loss='mse')
autoencoder.summary()

In [None]:
# Train the autoencoder to reproduce the real part of the STFT.
# The whole table is one sample of shape (1, rows, columns, 1), used as input and target.
real_batch = real.values.reshape(1, df.shape[0], df.shape[1], 1)
autoencoder.fit(real_batch, real_batch, epochs=15, batch_size=1)

In [None]:
# Reconstruct the real part: add a leading batch axis and a trailing channel axis, then predict.
mp3_R = autoencoder.predict(real.values[numpy.newaxis, :, :, numpy.newaxis])

In [None]:
# Drop the batch and channel axes of the prediction and wrap the 2-D result in a DataFrame.
mp3_R_df = pd.DataFrame(mp3_R[0, :, :, 0])

In [None]:
#real

In [None]:
# Fit the same `autoencoder` object on the imaginary part (training continues from its
# current weights), then reconstruct the imaginary part with it.
imag_batch = imag.values.reshape(1, df.shape[0], df.shape[1], 1)
autoencoder.fit(imag_batch, imag_batch, epochs=15, batch_size=1)
mp3_I = autoencoder.predict(imag_batch)
# Drop the batch and channel axes before wrapping the prediction in a DataFrame.
mp3_I_df = pd.DataFrame(mp3_I[0, :, :, 0])

In [None]:
# Recombine the two predictions into one complex table: real + j * imaginary.
df_recons = mp3_R_df + 1j * mp3_I_df

In [None]:
# The reconstructed table still contains the three zero-padded rows added before training.
# Left in place they make istft infer a larger FFT size (and hop) than the analysis used,
# so keep only the 1 + n_fft // 2 real frequency bins.
# The analysis hop was 1024 = n_fft // 4; the global `hop_length` now holds the mel hop (256).
recons = librosa.core.istft(df_recons.values[:1 + n_fft // 2], hop_length=n_fft // 4)
ipd.Audio(recons, rate=sr)

In [None]:
# Wrap the mel spectrogram `S` in a DataFrame so it can be padded by column label.
df2 = pd.DataFrame(data=S)
df2.head(5)

In [None]:
# Append eight all-zero columns labelled '19432' .. '19439'.
# The labels are hard-coded; presumably they continue a 19432-frame spectrogram — TODO confirm.
for frame_label in range(19432, 19440):
    df2[str(frame_label)] = 0
df2.shape

In [None]:
# The network is real-valued, so it cannot learn from the complex STFT directly
# (depending on the backend the imaginary part is discarded or the call fails).
# Train it on the STFT magnitude instead.
stft_mag = numpy.abs(df.values)
mag_batch = stft_mag.reshape(1, df.shape[0], df.shape[1], 1)
autoencoder.fit(mag_batch, mag_batch, epochs=15, batch_size=10)
mp3 = autoencoder.predict(mag_batch)
mp3_df = pd.DataFrame(mp3.reshape(mp3.shape[1], mp3.shape[2]))

# istft needs complex coefficients: combine the predicted magnitude with the phase of the
# original STFT. Only the 1 + n_fft // 2 real frequency bins are used (the extra rows are
# zero padding), and the hop is the analysis hop n_fft // 4 = 1024 because the global
# `hop_length` was rebound to 256 for the mel spectrogram.
n_bins = 1 + n_fft // 2
original_phase = numpy.exp(1j * numpy.angle(df.values[:n_bins]))
recons = librosa.core.istft(mp3_df.values[:n_bins] * original_phase, hop_length=n_fft // 4)
ipd.Audio(recons, rate=sr)

In [None]:
# Griffin-Lim expects a non-negative magnitude STFT whose row count is 1 + n_fft // 2.
# Drop the zero-padded rows, take the absolute value (the linear output layer can go
# negative), and use the analysis hop n_fft // 4 = 1024 instead of the default so the
# reconstruction matches the original STFT parameters.
stft_mag_pred = numpy.abs(mp3_df.values[:1 + n_fft // 2])
recons = librosa.core.griffinlim(stft_mag_pred, n_iter=100, hop_length=n_fft // 4)
ipd.Audio(recons, rate=sr)

In [None]:
# Convolutional autoencoder over one single-channel "image" shaped like the padded mel table.
# Layer outputs are held in `h` rather than `x`, so this cell does not rebind `x`.
input_img = Input(shape=(df2.shape[0], df2.shape[1], 1))

# Encoder: four conv + 2x2 max-pool stages, each halving both axes (rounding up).
h = Conv2D(32, (3, 3), activation='relu', padding='same')(input_img)
h = MaxPooling2D((2, 2), padding='same')(h)
h = Conv2D(32, (3, 3), activation='relu', padding='same')(h)
h = MaxPooling2D((2, 2), padding='same')(h)
h = Conv2D(16, (3, 3), activation='relu', padding='same')(h)
h = MaxPooling2D((2, 2), padding='same')(h)
h = Conv2D(8, (3, 3), activation='relu', padding='same')(h)
encoded = MaxPooling2D((2, 2), padding='same')(h)

# Decoder: four 2x upsamplings. The original cell built one more Conv2D(8) + UpSampling2D
# pair first, but the next layer was attached to `encoded` again, so that pair never
# reached the output; it is omitted here and the trained graph is unchanged.
# Every convolution uses 'same' padding, so the output size equals the input size
# only when both input dimensions are multiples of 16.
h = Conv2D(16, (3, 3), activation='relu', padding='same')(encoded)
h = UpSampling2D((2, 2))(h)
h = Conv2D(32, (2, 2), activation='relu', padding='same')(h)
h = UpSampling2D((2, 2))(h)
h = Conv2D(64, (3, 3), activation='relu', padding='same')(h)
h = UpSampling2D((2, 2))(h)
h = Conv2D(64, (2, 2), activation='relu', padding='same')(h)
h = UpSampling2D((2, 2))(h)
# Linear output layer for the regression target.
decoded = Conv2D(1, (2, 2), activation=None, padding='same')(h)

autoencoder = Model(input_img, decoded)
autoencoder.compile(optimizer='adadelta', loss='mse')
autoencoder.summary()

In [None]:
# Train the autoencoder to reproduce the padded mel power spectrogram (a single sample).
mel_batch = df2.values.reshape(1, df2.shape[0], df2.shape[1], 1)
autoencoder.fit(mel_batch, mel_batch, epochs=15, batch_size=1)
mp3 = autoencoder.predict(mel_batch)
mp3_df = pd.DataFrame(mp3.reshape(mp3.shape[1], mp3.shape[2]))

# The training target is built from `S`, which is mel *power* (not dB), so the prediction
# is already on a power scale and must not go through db_to_power. It is also still in
# the mel domain, which Griffin-Lim cannot consume directly: mel_to_audio first maps the
# mel bands back to linear-frequency magnitudes and then runs Griffin-Lim.
# Negative outputs of the linear layer are clipped and the zero-padded frames dropped.
mel_pred = numpy.maximum(mp3_df.values, 0)[:, :S.shape[1]]
recons = librosa.feature.inverse.mel_to_audio(mel_pred, sr=sr, n_fft=4096,
                                              hop_length=hop_length, n_iter=40)
ipd.Audio(recons, rate=sr)

In [None]:
# Two-step inversion of the predicted mel spectrogram.
# The original cell ran Griffin-Lim on mel bands and then applied db_to_power to the
# resulting waveform; a dB conversion is not meaningful on time-domain samples.
# 1) Clip negative predictions, drop the zero-padded frames, and map mel power back to
#    linear-frequency STFT magnitudes (n_fft matches the mel analysis above).
mel_pred = numpy.maximum(mp3_df.values, 0)[:, :S.shape[1]]
stft_mag_from_mel = librosa.feature.inverse.mel_to_stft(mel_pred, sr=sr, n_fft=4096)
# 2) Estimate the phase with Griffin-Lim, using the hop of the mel analysis.
recons = librosa.core.griffinlim(stft_mag_from_mel, n_iter=40, hop_length=hop_length)
ipd.Audio(recons, rate=sr)

In [None]:
# Display the predicted spectrogram table from the last autoencoder run.
mp3_df