In [1]:
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torch
from torch.nn import Module, Parameter
from torch import FloatTensor
from scipy import signal
import numpy as np
from torchaudio import transforms
import matplotlib.pyplot as plt
import IPython.display as ipd
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import os
dirname = os.path.abspath('')
rootdir = os.path.split(dirname)[0]

H1_TRAINING_INPUT_PATH = "".join([rootdir, "/data/train/ht1-input.wav"])
H1_TRAINING_TARGET_PATH = "".join([rootdir, "/data/train/ht1-target.wav"])

metadata = torchaudio.info(H1_TRAINING_INPUT_PATH)
print(metadata)


AudioMetaData(sample_rate=44100, num_frames=14994001, num_channels=1, bits_per_sample=16, encoding=PCM_S)


In [2]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("device=", device) 

device= cpu


In [3]:
train_input, fs = torchaudio.load(H1_TRAINING_INPUT_PATH)
train_target, fs = torchaudio.load(H1_TRAINING_TARGET_PATH)

# For Reproducibility

In [4]:
torch.manual_seed(0)

<torch._C.Generator at 0x7fa8c8179290>

## Initialize Dataloader

In [5]:
class DIIRDataSet(Dataset):
    def __init__(self, input, target, sequence_length):
        self.input = input
        self.target = target
        self._sequence_length = sequence_length
        self.input_sequence = self.wrap_to_sequences(self.input, self._sequence_length)
        self.target_sequence = self.wrap_to_sequences(self.target, self._sequence_length)
        self._len = self.input_sequence.shape[0]

    def __len__(self):
        return self._len

    def __getitem__(self, index):
        return {'input': self.input_sequence[index, :, :]
               ,'target': self.target_sequence[index, :, :]}

    def wrap_to_sequences(self, data, sequence_length):
        num_sequences = int(np.floor(data.shape[0] / sequence_length))
        print(num_sequences)
        truncated_data = data[0:(num_sequences * sequence_length)]
        wrapped_data = truncated_data.reshape((num_sequences, sequence_length, 1))
        wrapped_data = wrapped_data.permute(0,2,1)
        print(wrapped_data.shape)
        return np.float32(wrapped_data)


In [6]:
train_input.squeeze(0).shape

torch.Size([14994001])

In [7]:
batch_size = 512#1024
sequence_length = 512
train_dataset=DIIRDataSet(train_input.squeeze(0), train_target.squeeze(0), sequence_length)
loader = DataLoader(train_dataset, batch_size=batch_size, shuffle = False, pin_memory=True, drop_last=True) #? what does the shuffle really shuffles here?

29285
torch.Size([29285, 1, 512])
29285
torch.Size([29285, 1, 512])


In [8]:
len(loader)

57

# Declare Model

In [9]:
class FIRNN(Module):
    def __init__(self, n_input=1, n_output=1, kernel_size=80, n_channel=1):
        super(FIRNN, self).__init__()
        self.kernel_size = kernel_size
        self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=kernel_size, stride=1)

        self.fc1 = nn.Linear(in_features = 512, out_features = 512 * 2)
        self.fc2 = nn.Linear(in_features = 512 * 2, out_features = 512 + kernel_size -1)

        
        self.mlp_layer = nn.Sequential(
            self.fc1 ,
            nn.Tanh(),
            self.fc2,
        )

    def forward(self, x):
        bs = x.shape[0]
    
        # first get fir output
        x = F.pad(x, (self.kernel_size-1, 0)) #pad on the left side
        x = self.conv1(x) 
        x = self.mlp_layer(x)
        # Should I pad again here or should I make the output size output + padding?
        # where do we need to reverse weight array b? from what I read it doesn't seem to matter
#         x = F.pad(x, (self.kernel_size-1, 0)) #pad on the left side
        x = self.conv1(x)

        return x


In [10]:
model = FIRNN()

## Define optimizer and criterion

In [11]:
import torch.nn as nn
from torch.optim import Adam

n_epochs = 100
lr = 1e-3

optimizer = Adam(model.parameters(), lr=lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)

criterion = nn.MSELoss()

# Define train loop

In [12]:
def train(criterion, model, loader, optimizer):
    model.train()
    device = next(model.parameters()).device
    total_loss = 0
    
    for ind, batch in enumerate(loader):
        input_seq_batch = batch['input'].to(device)
        target_seq_batch = batch['target'].to(device)
        optimizer.zero_grad()
        predicted_output = model(input_seq_batch)
#         print(f"ind: {ind} input: {input_seq_batch.shape}")
#         print(f"ind: {ind} target: {target_seq_batch.shape}")
#         print(predicted_output.shape) #[1024,512,1]
        loss = criterion(target_seq_batch, predicted_output)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    total_loss /= len(loader)
    return total_loss

## Train!

In [13]:
for epoch in range(n_epochs):
    loss = train(criterion, model, loader, optimizer)
    print("Epoch {} -- Loss {:3E}".format(epoch, loss))

Epoch 0 -- Loss 6.951830E-02
Epoch 1 -- Loss 6.129012E-02
Epoch 2 -- Loss 6.081847E-02
Epoch 3 -- Loss 6.031766E-02
Epoch 4 -- Loss 6.004531E-02
Epoch 5 -- Loss 5.971913E-02
Epoch 6 -- Loss 5.900174E-02
Epoch 7 -- Loss 5.810906E-02
Epoch 8 -- Loss 5.678380E-02
Epoch 9 -- Loss 5.491018E-02
Epoch 10 -- Loss 5.257869E-02
Epoch 11 -- Loss 5.034612E-02
Epoch 12 -- Loss 4.871935E-02
Epoch 13 -- Loss 4.761550E-02
Epoch 14 -- Loss 4.678325E-02
Epoch 15 -- Loss 4.614697E-02
Epoch 16 -- Loss 4.562637E-02
Epoch 17 -- Loss 4.513049E-02
Epoch 18 -- Loss 4.467812E-02
Epoch 19 -- Loss 4.424627E-02
Epoch 20 -- Loss 4.384907E-02
Epoch 21 -- Loss 4.345786E-02
Epoch 22 -- Loss 4.313323E-02
Epoch 23 -- Loss 4.281938E-02
Epoch 24 -- Loss 4.254709E-02
Epoch 25 -- Loss 4.228417E-02
Epoch 26 -- Loss 4.204542E-02
Epoch 27 -- Loss 4.181654E-02
Epoch 28 -- Loss 4.162163E-02
Epoch 29 -- Loss 4.141938E-02
Epoch 30 -- Loss 4.126325E-02
Epoch 31 -- Loss 4.108158E-02
Epoch 32 -- Loss 4.092292E-02
Epoch 33 -- Loss 4.0

# Evaluate

In [14]:
save_path = os.path.join('../models/conv_mlp_conv_no_second_padding.pth')
torch.save(model.state_dict(), save_path)

In [15]:
val_batch_size = 128
sequence_length = 512
val_dataset=DIIRDataSet(train_input.squeeze(0), train_target.squeeze(0), sequence_length)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle = False, pin_memory=True, drop_last=True)

29285
torch.Size([29285, 1, 512])
29285
torch.Size([29285, 1, 512])


In [16]:
def inspect_file(path):
    print("-" * 10)
    print("Source:", path)
    print("-" * 10)
    print(f" - File size: {os.path.getsize(path)} bytes")
    print(f" - {torchaudio.info(path)}")

In [17]:
def save_audio(batch):
    #1024,512,1
    out_batch = batch.detach().cpu()
    out_batch = out_batch.squeeze(-1).flatten()
    print(out_batch.shape)
    return out_batch

In [18]:
import soundfile as sf

out_path = '../output/'
sample_rate = 44100
save_tensor = torch.zeros(14994001,512)
with torch.no_grad():
    for i, val_batch in enumerate(val_loader):
        input_seq_batch = val_batch['input'].to(device)
        #target_seq_batch = val_batch['target'].to(device)
        predicted_output = model(input_seq_batch)
        output_tmp = predicted_output.squeeze().detach().cpu()
#         print(output_tmp.shape)
        save_tensor[i,:] = output_tmp
    
    print(save_tensor.shape)
    out_audio = save_audio(save_tensor)
    print(out_audio.shape)
    path = os.path.join(out_path, "target_.wav")
    print("Exporting {}".format(path))
    sf.write(path, out_audio, sample_rate,'PCM_24')
    #torchaudio.save(path, out_audio, sample_rate, encoding="PCM_S", bits_per_sample=16)
    inspect_file(path)
    

[W NNPACK.cpp:79] Could not initialize NNPACK! Reason: Unsupported hardware.


torch.Size([14994001, 512])
torch.Size([7676928512])
torch.Size([7676928512])
Exporting ../output/target_.wav
----------
Source: ../output/target_.wav
----------
 - File size: 23030785580 bytes
 - AudioMetaData(sample_rate=44100, num_frames=518649685, num_channels=1, bits_per_sample=24, encoding=PCM_S)


In [19]:
train_input.shape

torch.Size([1, 14994001])

In [20]:
#small test on padding
t4d = torch.ones(3, 3, 4)
print(t4d.shape)
out = F.pad(t4d, (3,0)) #"constant", 0
print(out.shape)

torch.Size([3, 3, 4])
torch.Size([3, 3, 7])


In [21]:
t4d[1,1,:]

tensor([1., 1., 1., 1.])

In [24]:
from scipy.signal import filtfilt

In [26]:
train_arr = train_target.numpy().reshape(-1)