In [None]:
import torch

# Utilities for loading and working with datasets.
import torch.utils
import torch.utils.data

# Neural network components.
import torch.nn
import torch.nn.functional

# Helpers to work with pytorch models.
!pip install torchinfo
import torchinfo

# Various other helpers.
import numpy as np
import numpy.fft
import matplotlib.pyplot as plt
import scipy.io.wavfile
from IPython.display import Audio



In [None]:
"""
A very simple Unidirectional RNN based frequency isolation model.
As input, this takes the following:
  1. Current timestep's bucketed input frequencies (F buckets)
  2. Bucketed input frequencies for P previous timesteps.
  3. Bucketed input frequencies for N next timesteps.
"""
class BasicUnidirectionalRnnIsolationNet(torch.nn.Module):
  """Unidirectional RNN followed by a two-layer fully connected head.

  Each timestep's input is the concatenation of the current timestep's
  bucketed frequencies with those of `previous_timesteps` earlier and
  `future_timesteps` later timesteps, i.e.
  (1 + previous_timesteps + future_timesteps) * frequency_buckets features.
  The output has `frequency_buckets` features per timestep.
  """

  def __init__(self,
               # Number of buckets input frequencies will be quantized to.
               frequency_buckets: int,
               # Number of previous timestep's worth of input the network gets.
               previous_timesteps: int,
               # Number of future timestep's worth of input the network gets.
               future_timesteps: int) -> None:
    super().__init__()

    # For this model, we get a vector of `frequency_buckets` values per timestep.
    self.frequencies_per_timestep = frequency_buckets

    self.previous_timesteps = previous_timesteps
    self.future_timesteps = future_timesteps

    # We get multiple timesteps of input since audio data
    # is inherently temporal, and context matters.
    input_timesteps = 1 + self.previous_timesteps + self.future_timesteps

    # TODO: in future models, should expand inputs beyond just this.
    input_features = input_timesteps * self.frequencies_per_timestep

    # Number of features that will be passed between time-steps at hidden
    # level. TODO: tune
    self.hidden_features_per_timestep = 3 * frequency_buckets

    # Number of RNN layers to use.
    self.rnn_layers = 2

    self.rnn = torch.nn.RNN(
        # Number of inputs to expect at each timestep.
        input_size=input_features,
        # Number of features to carry through between timesteps using
        # hidden state. TODO: this is arbitrary and must be tuned.
        hidden_size=self.hidden_features_per_timestep,
        # Number of RNN layers to apply. TODO: tune.
        num_layers=self.rnn_layers,
        # Inputs/outputs are laid out as (batch, sequence, features).
        batch_first=True)

    # Fully connected head to bring the RNN output down to the desired
    # number of outputs per timestep.
    self.fc1 = torch.nn.Linear(
        self.hidden_features_per_timestep, 2 * frequency_buckets)

    self.fc2 = torch.nn.Linear(
        2 * frequency_buckets, self.frequencies_per_timestep)

  def forward(self, x, h0=None):
    """Forward inference.

    Args:
      x: input of dimensions (BATCHES, SEQ LENGTH, INPUT FEATURES), where
        INPUT FEATURES is
        (1 + previous_timesteps + future_timesteps) * frequency_buckets.
      h0: optional initial hidden state of dimensions
        (HIDDEN LAYERS, BATCHES, HIDDEN STATE). When omitted, a zero state
        is used.

    Returns:
      A tuple (out, hidden): `out` has dimensions
      (BATCHES, SEQ LENGTH, frequency_buckets) and `hidden` is the final RNN
      hidden state of dimensions (HIDDEN LAYERS, BATCHES, HIDDEN STATE).
    """
    # torch.nn.RNN creates the zero initial state itself when h0 is None,
    # matching the device and dtype of `x`. This replaces a hard-coded
    # `.cuda()` allocation that broke CPU execution and any case where the
    # input lived on a different device.
    out, hidden = self.rnn(x, h0)

    out = torch.nn.functional.relu(self.fc1(out))
    out = self.fc2(out)

    return out, hidden

# Demo configuration: 256 frequency buckets with one timestep of context on
# either side of the current timestep.
freqs_per_step = 256
prev = 1
future = 1

b = BasicUnidirectionalRnnIsolationNet(freqs_per_step, prev, future)

# Dry-run shapes for a single batch containing a single timestep:
#   input  -> (batch, sequence, features)
#   hidden -> (rnn layers, batch, hidden features)
summary_input_shape = (1, 1, freqs_per_step * (1 + prev + future))
summary_hidden_shape = (b.rnn_layers, 1, b.hidden_features_per_timestep)
torchinfo.summary(b, [summary_input_shape, summary_hidden_shape])

# Experiment 1: Same input, Same output
See if we can get this basic RNN to just 'forward' input to output.
1. Load in an audio file.
2. Split it into 100-sample chunks (zero-padded at the end). The audio is about 80k samples, so this gives about 800 chunks.
3. Run through the RNN using Adam and MSE Loss, trying to make input = output.
4. Achieves convergence (loss from ~23M to ~100 in about 50k epochs)

After the above, the output is recognizably the same audio, but with heavy distortion. Despite 20k more iterations, the loss would not drop below an MSE of about 50, so the next step is to continue training with a lower learning rate (0.0001).

10k more iterations at a learning rate of 0.0001 brought the MSE down to ~35. Another round followed at 0.00006.

In [None]:
# Mount Google Drive at /content/drive (Google Colab only) so the audio clip
# and model checkpoint used by later cells can be read and written.
# NOTE(review): later cells use the relative path "drive/MyDrive/...", which
# presumably relies on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# --- Load the clip and cut it into fixed-size chunks ------------------------
sample_rate, data = scipy.io.wavfile.read("drive/MyDrive/APS360 Group Project/Experiments/RNN Input-Forwarding/helloworld.wav")
num_samples = len(data)

samples_per_chunk = 100
# Zero-pad only up to the next multiple of the chunk size. Written this way
# the padding is 0 (rather than a whole extra chunk of silence) when the clip
# length already divides evenly.
padding = (-num_samples) % samples_per_chunk
# NOTE(review): assumes a mono (1-D) clip — confirm for other input files.
padded_data = np.pad(data, (0, padding))

# Reshape data into chunks. Each row is a separate chunk (timestep),
# columns are each sample within that chunk.
chunks = padded_data.reshape(-1, samples_per_chunk)

# Convert chunks to frequency domain.
# These frequencies are *independent* for each chunk, and do not bleed between.
chunks_fft = np.fft.fft(chunks)

# NOTE(review): the FFT output is complex but the network is real-valued, so
# only the real part is kept. This is exactly what the previous implicit
# `.float()` cast on the complex tensor did (with a warning), just explicit.
# Dropping the imaginary part loses information and is a likely source of
# distortion in the reconstruction; feeding real and imaginary parts as
# separate features would fix it but also requires changing the model size
# and the reconstruction cell.
chunks_fft_tensor = torch.from_numpy(np.ascontiguousarray(chunks_fft.real))

# Keep model and data on the same device; fall back to CPU without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Normally the first dimension is reserved for batches.
# Since we only have one, just add a dim.
chunks_fft_tensor = chunks_fft_tensor.unsqueeze(0).float().to(device)

# --- Model -------------------------------------------------------------------
# One input feature per FFT bin and no neighbouring-timestep context, so the
# model's input size matches the chunk width. Creating the model here keeps the
# cell runnable on a fresh kernel; note that re-running the cell restarts
# training from scratch (load a saved state dict first to continue instead).
b = BasicUnidirectionalRnnIsolationNet(
    frequency_buckets=samples_per_chunk, previous_timesteps=0, future_timesteps=0)
b.to(device)

# --- Train simple parroting ---------------------------------------------------
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(b.parameters(), lr=0.000001)

for epoch in range(30_000):
  optimizer.zero_grad()

  # Forward step.
  logits, hn = b(chunks_fft_tensor)

  # Output should be same as input.
  loss = criterion(logits, chunks_fft_tensor)

  if epoch % 100 == 0:
    # .item() prints the plain scalar instead of the tensor repr.
    print(epoch, loss.item())

  loss.backward()
  optimizer.step()

In [None]:
# Persist the trained weights (state dict only) to Drive so that they can be
# reloaded later with load_state_dict.
torch.save(b.state_dict(), "drive/MyDrive/APS360 Group Project/Experiments/RNN Input-Forwarding/final.pt")

In [None]:
# See how well model parrots input.
# Load the checkpoint onto whichever device the model currently lives on, so
# a checkpoint saved from a GPU session can still be loaded without one.
model_device = next(b.parameters()).device
b.load_state_dict(torch.load("drive/MyDrive/APS360 Group Project/Experiments/RNN Input-Forwarding/final.pt", map_location=model_device))

# Inference only: no need to build the autograd graph.
with torch.no_grad():
  output_freqs, _ = b(chunks_fft_tensor)

# Inverse FFT of every chunk. The result is complex; keep the real part
# explicitly rather than handing complex samples to Audio, which would
# discard the imaginary part itself and emit a warning.
reconstructed_audio = np.fft.ifft(output_freqs.detach().cpu().numpy()).real

# Flatten chunks back into one signal and drop the zero-padding tail that was
# added to fill the last chunk.
reconstructed_audio = reconstructed_audio.reshape(-1)[:num_samples]

Audio(data=reconstructed_audio, rate=sample_rate)


  data = np.array(data, dtype=float)


# Analysis
We can hear that the audio is understandable, but heavily distorted. Without further analysis, two possible reasons come to mind.

1. Vanishing gradients mean there is some random noise that's hard to extract.

2. The model was only trained on this one clip (for testing purposes), so it may simply be memorizing the output. That is, it may not actually be passing input -> output; the two may be uncorrelated.


Still, this experiment has accomplished its primary objective of being a first look at audio in RNNs. Some useful takeaways:

- RNN specification in pytorch, and the input/output formats.

- Audio chunking in numpy.