# Neural Audio LSTM Model

In [1]:
import os

import numpy as np
import soundfile as sf
import torch
import torch.nn as nn
import torchaudio
from torch.nn import Module
from torch.optim import Adam
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torchaudio.functional import lfilter

# For Reproducibility
torch.manual_seed(0)

<torch._C.Generator at 0x7fca30199090>

### Define constants

In [2]:
DATASET = "ht1"  # change to ht1 or muff
TRAIN = True  # change to False to skip training
VERSION = "v1"  # increment

In [3]:
class NeuralAudioDataSet(Dataset):
    """
    Creates dataset object for training, evaluation, and prediction
    """

    def __init__(self, input, target, sequence_length):
        self.input = input
        self.target = target

        self._sequence_length = sequence_length
        self.input_sequence = self.wrap_to_sequences(self.input, self._sequence_length)
        self.target_sequence = self.wrap_to_sequences(
            self.target, self._sequence_length
        )
        self._len = self.input_sequence.shape[0]

    def __len__(self):
        return self._len

    def __getitem__(self, index):
        return {
            "input": self.input_sequence[index, :, :],
            "target": self.target_sequence[index, :, :],
        }

    def wrap_to_sequences(self, data, sequence_length):
        """
        Args:
            data: Either input or target signal
            sequence_length: Number of samples in training example

        Returns:
            wrapped_data: Data packed into a sequence tensor for LSTM
        """
        num_sequences = int(np.floor(data.shape[0] / sequence_length))
        truncated_data = data[0 : (num_sequences * sequence_length)]
        wrapped_data = truncated_data.reshape((num_sequences, sequence_length, 1))
        wrapped_data = wrapped_data.permute(0, 2, 1)

        return np.float32(wrapped_data)


### Define Model Architecture

In [4]:
class IIRNN(Module):
    """
    Defines an LSTM model for guitar distortion modeling

    Attributes:
        input_size: Number of channels in input signal
        output_size: Number of channels to predict for output signal
        hidden_size: Number of features for a single audio sample
    """

    def __init__(self, input_size=1, output_size=1, hidden_size=80):
        super(IIRNN, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.output_size = output_size

        self.lstm = nn.LSTM(
            input_size=self.input_size, hidden_size=self.hidden_size, batch_first=True
        )

        self.fc1 = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, x):
        """

        Args:
            x: Input signal from dataloader, [batch_size, input_size, sequence_length]

        Returns:
            x: Output signal, [batch_size, input_size, sequence_length]

        """

        x, hn = self.lstm(x.permute(0, 2, 1))

        x = self.fc1(x)

        x = x.permute(0, 2, 1)

        return x


### Define training loop

In [5]:
def train(criterion, model, loader, optimizer, is_train):

    if is_train:
        model.train(True)
    else:
        model.train(False)

    device = next(model.parameters()).device
    total_loss = 0

    for ind, batch in enumerate(loader):
        input_seq_batch = batch["input"].to(device)
        target_seq_batch = batch["target"].to(device)

        optimizer.zero_grad()
        predicted_output = model(input_seq_batch)

        # Apply pre-emphasis filter to minimize loss in important frequency range
        target_seq_batch_filt = lfilter(
            target_seq_batch, torch.Tensor([1, 0]), torch.Tensor([1, -0.95])
        )
        predicted_output_filt = lfilter(
            predicted_output, torch.Tensor([1, 0]), torch.Tensor([1, -0.95])
        )

        loss = criterion(target_seq_batch_filt, predicted_output_filt)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        # print(f"Loss: {loss}")

    total_loss /= len(loader)

    return total_loss

In [6]:
def inspect_file(path):
    print("-" * 10)
    print("Source:", path)
    print("-" * 10)
    print(f" - File size: {os.path.getsize(path)} bytes")
    print(f" - {torchaudio.info(path)}")


# Save audio files
def save_audio(batch):
    out_batch = batch.detach().cpu()
    out_batch = out_batch.squeeze(-1).flatten()
    print(out_batch.shape)
    return out_batch

### Read Data

In [7]:
dirname = os.path.abspath("")
rootdir = os.path.split(dirname)[0]

TRAINING_INPUT_PATH = "".join([rootdir, f"/data/train/{DATASET}-input.wav"])
TRAINING_TARGET_PATH = "".join([rootdir, f"/data/train/{DATASET}-target.wav"])
VAL_INPUT_PATH = "".join([rootdir, f"/data/val/{DATASET}-input.wav"])
VAL_TARGET_PATH = "".join([rootdir, f"/data/val/{DATASET}-input.wav"])

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device=", device)
train_input, input_fs = torchaudio.load(TRAINING_INPUT_PATH)
train_target, target_fs = torchaudio.load(TRAINING_TARGET_PATH)

val_input, _ = torchaudio.load(VAL_INPUT_PATH)
val_target, _ = torchaudio.load(VAL_TARGET_PATH)

assert input_fs == target_fs

device= cpu


### Train

In [8]:
# Define dataloader for training data
batch_size = 1024
sequence_length = 1024
train_dataset = NeuralAudioDataSet(
    train_input.squeeze(0), train_target.squeeze(0), sequence_length
)
loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=False,
    pin_memory=True,
    drop_last=True,
)

val_dataset = NeuralAudioDataSet(
    val_input.squeeze(0), val_target.squeeze(0), sequence_length
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    pin_memory=True,
    drop_last=True,
)

if TRAIN:
    # Define optimizer, model, and criterion for training
    model = IIRNN()
    n_epochs = 100
    lr = 1e-3

    optimizer = Adam(
        model.parameters(),
        lr=lr,
        betas=(0.9, 0.999),
        eps=1e-08,
        weight_decay=0,
        amsgrad=False,
    )
    criterion = nn.MSELoss()

    # Run training loop
    print("Training started\n")
    train_losses = []
    val_losses = []
    for epoch in range(n_epochs):
        train_loss = train(criterion, model, loader, optimizer, True)
        val_loss = train(criterion, model, val_loader, optimizer, False)
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        print(f'Epoch {epoch +1} LOSS train {train_loss} valid {val_loss}')

    # Save model
    save_path = os.path.join(
        f"../models/lstm-{DATASET}-{VERSION}"
    )
    torch.save(model.state_dict(), save_path)
else:
    load_path = os.path.join(
        f"../models/lstm-{DATASET}-{VERSION}"
    )

    model = IIRNN()
    model.load_state_dict(torch.load(load_path))
    model.eval()

Training started

Epoch 1 LOSS train 0.004995518347381481 valid 3.2538740924792364e-05


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/bryn/opt/anaconda3/envs/neural-audio/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3457, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/my/mnzqnvhx1ks730z08hx2z6v80000gn/T/ipykernel_99313/851332890.py", line 47, in <module>
    train_loss = train(criterion, model, loader, optimizer, True)
  File "/var/folders/my/mnzqnvhx1ks730z08hx2z6v80000gn/T/ipykernel_99313/1179480426.py", line 16, in train
    predicted_output = model(input_seq_batch)
  File "/Users/bryn/opt/anaconda3/envs/neural-audio/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/var/folders/my/mnzqnvhx1ks730z08hx2z6v80000gn/T/ipykernel_99313/938616186.py", line 34, in forward
    x, hn = self.lstm(x.permute(0, 2, 1))
  File "/Users/bryn/opt/anaconda3/envs/neural-audio/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110,

TypeError: object of type 'NoneType' has no len()

### Apply and save model, write audio

In [None]:
# Write audio output
out_path = "../output/audio"
sample_rate = 44100
save_tensor = torch.zeros(train_input.shape[1] // sequence_length, 1024)
with torch.no_grad():
    for i, batch in enumerate(loader):
        input_seq_batch = batch["input"].to(device)
        predicted_output = model(input_seq_batch)
        output_tmp = predicted_output.squeeze().detach().cpu()
        save_tensor[i, :] = output_tmp

    out_audio = save_audio(save_tensor.view(-1,1))
    path = os.path.join(out_path, f"lstm-{DATASET}-{VERSION}.wav")
    print("Exporting {}".format(path))
    sf.write(path, out_audio, sample_rate,'PCM_24')
    # out_audio = out_audio.view(1,-1)
    # torchaudio.save(
    #     path, out_audio, sample_rate, encoding="PCM_S", bits_per_sample=16
    # )
    inspect_file(path)