# Tool for analysing the training records in `data/synth_out`

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import h5py
import scipy.interpolate
import scipy.io.wavfile
import os

import gym_speech_resynthesis.envs.audio as audio

In [None]:
class TrainingRecord:
    """Plain attribute bag holding the arrays of one training record.

    Every keyword argument given to the constructor is stored as an instance
    attribute of the same name.
    """

    def __init__(self, **entries):
        attributes = vars(self)
        for name, value in entries.items():
            attributes[name] = value

def norm_mfccs(x):
    """Normalisation hook for MFCC data; currently returns `x` unchanged.

    In this file it is applied to one row of `TR.mfcc_synth` / `TR.mfcc_orig`
    at a time.
    """
    return x

def norm_spectrum(x):
    """Normalisation hook for spectrum data; currently returns `x` unchanged.

    In this file it is applied to one row of `TR.spectrum_synth` /
    `TR.spectrum_orig` at a time.
    """
    return x

In [None]:
# HDF5 training record to analyse (absolute, machine-specific path).
fn = '/home/andreas/Uni/PhD/courses/cs_885_reinforcement_learning/project/code/data/experiments_out/selection/trpo_chunk_064_b057bbe671cd7e29_15241.h5'

# The reference recording is looked up next to the record: the last eight
# characters of the path are replaced by 'orig.wav'. The sample rate returned
# by the WAV reader is discarded.
# NOTE(review): this rebinds the name `audio`, which the import cell also uses
# as the alias of the `gym_speech_resynthesis.envs.audio` module.
_, audio = scipy.io.wavfile.read(fn[:-8] + 'orig.wav')

# Copy every dataset of interest out of the file into memory so the record
# stays usable after the file has been closed.
with h5py.File(fn, 'r') as f:
    TR = TrainingRecord(**{
        key: np.array(f[key])
        for key in (
            'mfcc_synth',
            'spectrum_synth',
            'mfcc_orig',
            'spectrum_orig',
            'samples',
            'action',
            'reward',
        )
    })

In [None]:
# Number of entries along the first axis of the synthesised MFCC array.
N = TR.mfcc_synth.shape[0]

# Output buffers: 40 spectrum values and 12 MFCC values per entry, plus one
# similarity score per entry.
r_sim_m = np.zeros(N)
s_synth = np.zeros((N, 40))
s_orig = np.zeros((N, 40))
m_synth = np.zeros((N, 12))
m_orig = np.zeros((N, 12))

for i in range(N):
    # Normalise synthesised and original data row by row.
    s_synth[i], s_orig[i] = (
        norm_spectrum(TR.spectrum_synth[i]),
        norm_spectrum(TR.spectrum_orig[i]),
    )
    m_synth[i], m_orig[i] = (
        norm_mfccs(TR.mfcc_synth[i]),
        norm_mfccs(TR.mfcc_orig[i]),
    )

    # Similarity score: negated root-mean-square difference of the MFCC rows.
    r_sim_m[i] = -np.sqrt(np.mean(np.square(m_synth[i] - m_orig[i])))

In [None]:
def norm_img(x):
    """Scale `x` down so that no column exceeds an absolute value of one.

    Each entry is divided by the largest absolute value found along axis 0 at
    its position. The divisor is clamped to at least 1, so columns that already
    lie within [-1, 1] are returned unscaled.
    """
    peak = np.abs(x).max(axis=0)
    return x / np.maximum(1, peak)

# Five stacked panels on one shared x axis: waveforms, synthesised MFCCs,
# original MFCCs, reward and actions.
fig, axs = plt.subplots(5, 1, figsize=(4.5, 2.25), sharex=True)

# Join the sample chunks stored in the record into one continuous waveform.
smpls = np.concatenate(TR.samples)

# Time bases in seconds: 8e-3 s per reward entry, 16000 samples per second for
# the waveform. The action trace is spread over the same span as the reward.
ts = np.linspace(0, 8e-3 * TR.reward.shape[0], TR.reward.shape[0])
ts_act = np.linspace(0, 8e-3 * TR.reward.shape[0], TR.action.shape[0])
ts_smpls = np.linspace(0, smpls.shape[0] / 16000, smpls.shape[0])

# Trim the reference recording to the synthesised length under a separate
# name, so the global `audio` is not shortened again on every run of this cell.
audio_ref = audio[:smpls.shape[0]]

# Panel 0: reference and synthesised waveforms, both divided by 2**15
# (assumes 16-bit PCM amplitudes — TODO confirm). The time base is cut to the
# reference length so a recording shorter than the synthesis still plots.
axs[0].plot(ts_smpls[:audio_ref.shape[0]], audio_ref / (1 << 15), linewidth=1)
axs[0].plot(ts_smpls, smpls / (1 << 15), linewidth=1)
# The x axis is in seconds; limit it to the time span, not to a sample count.
axs[0].set_xlim(0, np.max(ts))
axs[0].set_ylim(-1, 1)

# Panels 1 and 2: MFCC images, transposed so the first array axis runs along x.
axs[1].imshow(norm_img(m_synth.T), origin='lower', vmin=-1, vmax=1, extent=(0, np.max(ts), 0, 12))
axs[1].set_aspect('auto')

axs[2].imshow(norm_img(m_orig.T), origin='lower', vmin=-1, vmax=1, extent=(0, np.max(ts), 0, 12))
axs[2].set_aspect('auto')

# Panel 3: reward over time.
axs[3].plot(ts, TR.reward, linewidth=2)
axs[3].set_xlim(0, np.max(ts))

# Panel 4: actions over time.
axs[4].plot(ts_act, TR.action)
axs[4].set_xlim(0, np.max(ts))
axs[4].set_xlabel('Time (s)')

fig.savefig('../doc/media/results_trpo_.pdf', bbox_inches='tight')

In [None]:
# Send the synthesised samples to the audio player.
# The name `audio` is rebound to the waveform array when the WAV file is
# loaded, so `audio.Player` would raise AttributeError at this point. Import
# the module again under its own alias instead of relying on `audio`.
import gym_speech_resynthesis.envs.audio as audio_io

with audio_io.Player(channels=1, sample_rate=16000) as player:
    # Samples are divided by 2**15 and handed over as a single-column array.
    player.write(np.concatenate(TR.samples / (1 << 15)).reshape(-1, 1))

In [None]:
# Module-level placeholder; the name is rebound to the result of `sim` further
# down in this file.
sims = []


def sim(x, y):
    """Dot `y` with copies of `x` resampled at 20 different scale factors.

    `x` is placed on the unit interval and linearly interpolated. For each
    factor, log-spaced between 2**-2 and 2**2, it is re-sampled at `len(x)`
    points spread evenly over `[0, factor]`. Points that fall beyond the end
    of `x` (factor > 1) evaluate to 0. Each resampled vector is dotted with `y`.

    `x` and `y` are expected to be 1-D sequences of equal length.

    Returns an array of shape (2, 20): row 0 holds the factors, row 1 the
    corresponding dot products.
    """
    unit_axis = np.linspace(0, 1, len(x))
    resample = scipy.interpolate.interp1d(
        unit_axis, x, bounds_error=False, fill_value=0.0)

    # Named `result` rather than `sims` to avoid shadowing the module-level name.
    result = np.zeros((2, 20))
    for i, f in enumerate(np.logspace(-2, 2, 20, base=2)):
        result[0, i] = f
        result[1, i] = resample(np.linspace(0, f, len(x))) @ y
    return result

In [None]:
# Compare the original and synthesised spectrum at index 220 of the record.
s1, s2 = (
    norm_spectrum(spectra[220])
    for spectra in (TR.spectrum_orig, TR.spectrum_synth)
)

# First figure: both spectra overlaid.
fig, ax = plt.subplots()
ax.plot(s1)
ax.plot(s2)

# Second figure: dot-product score (row 1) against scale factor (row 0).
sims = sim(s1, s2)
fig, ax = plt.subplots()
ax.plot(*sims)