In [35]:
%matplotlib inline

import numpy
import scipy.io.wavfile
import scipy.signal as signal
import matplotlib.pyplot as plt
from matplotlib import cm
import math
import os
from functools import reduce
import glob
import operator as op
from PIL import Image

# Template for the absolute path of one WAV file. The placeholders are filled
# with str.format(): {0} is the session, {1} the folder and {2} the file name.
wav_folder_path = os.path.abspath("../../Datasets/IEMOCAP_mono/{0}/sentences/wav/{1}/{2}.wav")

## Spectrogram converter class

In [2]:
# Floor, in log10 units, applied to the normalized log-magnitude spectrogram.
noise_factor = -1.75

class SpectrogramConverter:
    """Convert a WAV signal into a log-magnitude or mel-scaled spectrogram.

    Typical use is ``load_wav`` -> ``pre_emphasis`` -> ``windowing`` ->
    ``fourier_transform``. ``power_spectrum`` and ``filter_banks`` operate on
    ``self.spectrogram``, which no method of this class assigns; the caller
    has to set it before using them.
    """

    def __init__(self, frame_size, frame_stride, fft_size):
        """Store the framing parameters.

        frame_size and frame_stride are multiplied by the sample rate to get
        sample counts, i.e. they are durations in seconds. fft_size is the
        number of points passed to the real FFT.
        """
        self.frame_size = frame_size
        self.frame_stride = frame_stride
        self.fft_size = fft_size

    def butter_bandpass_filter(self, lowcut, highcut, order=1):
        """Return ``self.signal`` passed through a Butterworth band-pass filter.

        lowcut and highcut are in the same unit as ``self.sample_rate``; they
        are normalized by the Nyquist frequency. ``self.signal`` itself is not
        modified.
        """
        nyquist_frequency = 0.5 * self.sample_rate
        low_frequency = lowcut / nyquist_frequency
        high_frequency = highcut / nyquist_frequency
        b, a = scipy.signal.butter(order, [low_frequency, high_frequency], btype='band')
        filtered_signal = scipy.signal.lfilter(b, a, self.signal)
        return filtered_signal

    def mel_to_hertz(self, mels):
        """Convert mel values to Hertz (inverse of ``hertz_to_mel``)."""
        return 700 * (10 ** (mels / 2595.0) - 1)

    def hertz_to_mel(self, frequency):
        """Convert Hertz to mels (inverse of ``mel_to_hertz``)."""
        # The ratio frequency/700 goes inside the "1 +"; dividing the whole
        # sum by 700 does not invert mel_to_hertz and maps 0 Hz to a large
        # negative mel value.
        return 2595 * numpy.log10(1 + frequency / 700.)

    def load_wav(self, wavfile_path):
        """Read a WAV file into ``self.sample_rate`` and ``self.signal``."""
        self.sample_rate, self.signal = scipy.io.wavfile.read(wavfile_path)

    def pre_emphasis(self, emphasis=0.97):
        """Replace ``self.signal`` by y[0] = x[0], y[t] = x[t] - emphasis * x[t-1]."""
        self.signal = numpy.append(self.signal[0], self.signal[1:] - emphasis * self.signal[:-1])

    def windowing(self):
        """Cut ``self.signal`` into overlapping Hamming-windowed frames.

        The result is stored in ``self.framed_signal`` with one frame per row.
        The signal is zero-padded so that every frame is complete.
        """
        frame_length = int(round(self.frame_size * self.sample_rate))
        frame_step = int(round(self.frame_stride * self.sample_rate))
        signal_length = len(self.signal)
        frame_amount = int(numpy.ceil(float(numpy.abs(signal_length - frame_length)) / frame_step))

        pad_signal_length = frame_amount * frame_step + frame_length
        zero_padding = numpy.zeros((pad_signal_length - signal_length))
        padded_signal = numpy.append(self.signal, zero_padding)

        # Row r holds the sample indices of frame r: r * frame_step + [0, frame_length).
        frame_starts = numpy.arange(frame_amount) * frame_step
        indices = frame_starts[:, None] + numpy.arange(frame_length)[None, :]
        signal_frames = padded_signal[indices]

        self.framed_signal = signal_frames * numpy.hamming(frame_length)

    def fourier_transform(self):
        """Return the log10 magnitude spectrum of ``self.framed_signal``.

        Magnitudes are normalized so the largest value is 1 (log value 0) and
        every log value below the module-level ``noise_factor`` is raised to
        it. A completely silent input yields an array filled with
        ``noise_factor`` instead of NaN.
        """
        fft_magnitude = numpy.abs(numpy.fft.rfft(self.framed_signal, self.fft_size))
        peak = fft_magnitude.max()
        if peak == 0:
            # Nothing to normalize by; 0/0 would turn the whole result into NaN.
            return numpy.full(fft_magnitude.shape, float(noise_factor))

        normalized = fft_magnitude / peak
        # Exact zeros become -inf here and are clamped on the next line, so
        # the divide-by-zero warning carries no information.
        with numpy.errstate(divide='ignore'):
            logarithmized = numpy.log10(normalized)
        return numpy.maximum(logarithmized, noise_factor)

    def power_spectrum(self):
        """Replace ``self.spectrogram`` by ``spectrogram ** 2 / fft_size``.

        ``self.spectrogram`` must have been assigned by the caller.
        """
        self.spectrogram = ((1.0 / self.fft_size) * (self.spectrogram ** 2))

    def filter_banks(self, filter_amount):
        """Apply ``filter_amount`` triangular mel filters to ``self.spectrogram``.

        ``self.spectrogram`` must have ``fft_size // 2 + 1`` columns and must
        have been assigned by the caller. Stores the mean-subtracted decibel
        values in ``self.mel_spectrogram`` and returns the filtered spectrogram
        (one row per filter) with exact zeros replaced by machine epsilon.
        """
        minimum_frequency = 0
        maximum_frequency = self.hertz_to_mel(self.sample_rate / 2)
        mels = numpy.linspace(minimum_frequency, maximum_frequency, filter_amount + 2)
        bins = numpy.floor((self.fft_size + 1) * self.mel_to_hertz(mels) / self.sample_rate)

        filter_matrix = numpy.zeros([filter_amount, (self.fft_size // 2) + 1])
        for m in range(0, filter_amount):
            left_edge = int(bins[m])
            center = int(bins[m + 1])
            right_edge = int(bins[m + 2])

            # Rising slope from the left edge up to the centre bin ...
            for k in range(left_edge, center):
                filter_matrix[m, k] = (k - bins[m]) / (bins[m + 1] - bins[m])
            # ... and falling slope from the centre bin down to the right edge.
            for k in range(center, right_edge):
                filter_matrix[m, k] = (bins[m + 2] - k) / (bins[m + 2] - bins[m + 1])

        # Normalize every filter to unit sum. A filter whose three edges fall
        # in the same bin is all zeros; keep it at zero instead of dividing 0 by 0.
        filter_sums = filter_matrix.sum(axis=1)
        filter_sums[filter_sums == 0] = 1.0
        mel_filters = filter_matrix.T / filter_sums

        filtered_spectrogram = numpy.transpose(mel_filters).dot(numpy.transpose(self.spectrogram))
        # log10(0) is undefined, so replace exact zeros by the smallest positive step.
        filtered_spectrogram = numpy.where(filtered_spectrogram == 0, numpy.finfo(float).eps, filtered_spectrogram)
        decibels = 20 * numpy.log10(filtered_spectrogram)
        self.mel_spectrogram = decibels - (numpy.mean(decibels, axis=0) + 1e-8)
        return filtered_spectrogram

In [3]:
def wavfile_generator(filepath):
    """Yield the lines of the text file at ``filepath`` one at a time.

    Lines are yielded exactly as read, including the trailing newline. The
    file is closed once the generator is exhausted or closed.
    """
    with open(filepath) as listing:
        yield from listing
        
def linear_pipeline(converter, wavfile):
    """Run the linear (non-mel) spectrogram pipeline on a single WAV file.

    Calls, in order, ``load_wav``, ``pre_emphasis``, ``windowing`` and
    ``fourier_transform`` on ``converter`` and returns the resulting array
    rotated by 90 degrees counter-clockwise.
    """
    converter.load_wav(wavfile)
    converter.pre_emphasis()
    converter.windowing()
    return numpy.rot90(converter.fourier_transform(), k=1)

def pad_spectrogram(spectrogram, target_width=300, fill_value=None):
    """Right-pad a 2-D spectrogram with constant columns up to ``target_width``.

    Parameters:
        spectrogram: 2-D array; columns are appended along axis 1.
        target_width: number of columns the result should have (default 300).
        fill_value: value of the added cells; defaults to the module-level
            ``noise_factor``.

    Returns a new array. An input that already has ``target_width`` columns or
    more gets no extra columns.
    """
    if fill_value is None:
        fill_value = noise_factor
    # max(0, ...) rather than abs(): an over-wide input must not grow further.
    missing_columns = max(0, target_width - spectrogram.shape[1])
    padding = numpy.full((spectrogram.shape[0], missing_columns), fill_value=fill_value)
    return numpy.hstack((spectrogram, padding))

def split_spectrogram(spectrogram, wavname):
    """Cut a spectrogram into 300-column chunks and save each one as an image.

    Every chunk is cropped to its first 200 rows. The last chunk is padded to
    300 columns with ``pad_spectrogram`` and is skipped when its mean does not
    exceed ``noise_factor + 0.02``, i.e. when it is almost entirely at the
    noise floor. A spectrogram that fits in a single chunk is saved under
    ``wavname``; otherwise chunk ``i`` is saved under ``wavname-i``.

    Returns the number of chunks that were saved.
    """
    chunk_width = 300
    chunk_height = 200
    silence_threshold = noise_factor + 0.02

    spectrogram_amount = int(numpy.ceil(spectrogram.shape[1] / chunk_width))
    is_single_chunk = spectrogram_amount == 1
    last_index = spectrogram_amount - 1

    for i in range(spectrogram_amount):
        # Same crop for every chunk, so all saved images cover the same rows
        # whether the recording is short or long.
        to_save = spectrogram[0:chunk_height, (i * chunk_width):((i + 1) * chunk_width)]
        chunk_name = wavname if is_single_chunk else wavname + '-' + str(i)

        if i == last_index:
            to_save = pad_spectrogram(to_save)
            if not numpy.mean(to_save) > silence_threshold:
                # Trailing chunk is (almost) pure padding/noise floor: drop it.
                spectrogram_amount -= 1
                break

        save_spectrogram(to_save, chunk_name)
    return spectrogram_amount
            

def save_spectrogram(spectrogram, spectrogram_name):
    """Render a spectrogram as a borderless grayscale PNG.

    The image is written to ``spectrograms/<spectrogram_name>.png`` from a
    3 x 2 inch figure at 100 dpi. The ``spectrograms`` directory is created
    when it does not exist yet.
    """
    os.makedirs('spectrograms', exist_ok=True)

    fig = plt.figure()
    try:
        fig.set_size_inches((3, 2))
        # One axes covering the whole figure, without ticks or frame.
        ax = plt.Axes(fig, [0., 0., 1., 1.])
        ax.set_axis_off()
        fig.add_axes(ax)
        # Pass the colormap to imshow instead of plt.set_cmap(), which would
        # change the default colormap for every later plot in the kernel.
        ax.imshow(spectrogram, aspect='equal', cmap='gray')
        fig.savefig('spectrograms/' + spectrogram_name + '.png', dpi=100)
    finally:
        # Always release the figure; this function is called once per chunk.
        plt.close(fig)

## Convert the spectrograms - Linear pipeline

In [100]:
# Convert every recording listed in sound_files.txt into spectrogram images
# and record, per recording, its label and how many images were saved.
# Each listing line is read as "<session> <wav name> <emotion label>".
generator = wavfile_generator('sound_files.txt')

# The with-block guarantees label_counts.txt is flushed and closed before a
# later cell reads it back.
with open('label_counts.txt', 'w') as label_file:
    for wavfile in generator:
        fields = wavfile.split()
        if not fields:
            continue  # tolerate blank lines in the listing
        session_number, wav_file, emotion_label = fields[0], fields[1], fields[2]
        # The folder name is the wav name without its last "_"-separated part.
        session_gender = '_'.join(wav_file.split('_')[0:-1])
        final_path = wav_folder_path.format(session_number, session_gender, wav_file)

        converter = SpectrogramConverter(frame_size=0.02, frame_stride=0.01, fft_size=400)
        spectrogram = linear_pipeline(converter, final_path)
        label_amount = split_spectrogram(spectrogram, wav_file)
        if label_amount > 0:
            label_file.write(wav_file + ' ' + emotion_label + ' ' + str(label_amount) + '\n')


In [4]:
# Total number of saved spectrogram images: the third field of every line in
# label_counts.txt. int() ignores the trailing newline, so the last line is
# parsed correctly whether or not the file ends with one.
with open('label_counts.txt', 'r') as label_counts:
    counts = sum(int(line.split(' ')[2]) for line in label_counts if line.strip())
print(counts)

3356


## Save to an npz file

In [45]:
# Collect every saved spectrogram image together with the emotion label of the
# recording it came from, and store both in a single .npz archive.
spectrogram_path = os.path.abspath('./spectrograms/*.png')
with open('sound_files.txt') as listing:
    spectrogram_list = [line.split('\n')[0].split(' ') for line in listing if line.strip()]
# Map the wav name (second field) to the emotion label (third field).
spectrogram_dict = {x[1]: x[2] for x in spectrogram_list}
spectrogram_array = []
label_array = []

# sorted() makes the order of the stored arrays reproducible between runs.
for spectrogram in sorted(glob.glob(spectrogram_path)):
    # Parse only the file name, so "-" or "." in a parent directory and the
    # platform's path separator cannot corrupt the lookup key.
    file_stem = os.path.splitext(os.path.basename(spectrogram))[0]
    spectrogram_name = file_stem.split('-')[0]  # drop the "-<chunk index>" suffix
    label_array.append(spectrogram_dict[spectrogram_name])
    with Image.open(spectrogram) as png:
        image = numpy.array(png.convert('L'))
    spectrogram_array.append(image)


numpy.savez('iemocap_linear_dataset.npz', spectrograms=numpy.asarray(spectrogram_array), labels=numpy.asarray(label_array))

In [55]:
# Reopen the archive written by numpy.savez above; it stores the arrays under
# the keys 'spectrograms' and 'labels'.
dataset = numpy.load('iemocap_linear_dataset.npz')

(3557,)