In [1]:
import time
import pickle
import warnings
import gc
import copy
import numpy as np
import torch
import torch.nn as nn
import torchaudio
from tqdm import tqdm, tqdm_notebook
from torch.utils.data import Dataset, DataLoader
from matplotlib import colors, pyplot as plt
from IPython.display import clear_output
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import librosa
import os

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
print(torch.__version__)
print(torchaudio.__version__)

2.1.0+cu118
2.1.0+cu118


In [4]:
!cp -r /content/drive/MyDrive/DL_Project/Clean /content/clean ./

!cp -r /content/drive/MyDrive/DL_Project/Noisy  /content/noisy ./

cp: cannot stat '/content/clean': No such file or directory
cp: cannot stat '/content/noisy': No such file or directory


In [6]:
!unzip /content/Clean/clean_trainset_wav.zip
!unzip /content/Noisy/noisy_trainset_wav.zip

In [13]:
!unzip /content/Clean/clean_testset_wav.zip
!unzip /content/Noisy/noisy_testset_wav.zip

Archive:  /content/Clean/clean_testset_wav.zip
   creating: clean_testset_wav/
  inflating: clean_testset_wav/p232_001.wav  
  inflating: clean_testset_wav/p232_002.wav  
  inflating: clean_testset_wav/p232_003.wav  
  inflating: clean_testset_wav/p232_005.wav  
  inflating: clean_testset_wav/p232_006.wav  
  inflating: clean_testset_wav/p232_007.wav  
  inflating: clean_testset_wav/p232_009.wav  
  inflating: clean_testset_wav/p232_010.wav  
  inflating: clean_testset_wav/p232_011.wav  
  inflating: clean_testset_wav/p232_012.wav  
  inflating: clean_testset_wav/p232_013.wav  
  inflating: clean_testset_wav/p232_014.wav  
  inflating: clean_testset_wav/p232_015.wav  
  inflating: clean_testset_wav/p232_016.wav  
  inflating: clean_testset_wav/p232_017.wav  
  inflating: clean_testset_wav/p232_019.wav  
  inflating: clean_testset_wav/p232_020.wav  
  inflating: clean_testset_wav/p232_021.wav  
  inflating: clean_testset_wav/p232_022.wav  
  inflating: clean_testset_wav/p232_023.wav  
 

In [7]:
from pathlib import Path

In [8]:
SAMPLE_RATE = 48000
N_FFT = (SAMPLE_RATE * 64) // 1000
HOP_LENGTH = (SAMPLE_RATE * 16) // 1000
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [9]:
# Define your custom dataset class
def signal2pytorch(x):
    #Function to convert a signal vector x, like a mono audio signal, into a 3-d Tensor that conv1d of Pytorch expects,
    X = np.expand_dims(x, axis=0)
    if len(x.shape)==1: #mono:
        X = np.expand_dims(X, axis=0)
    X=torch.from_numpy(X)
    X=X.type(torch.Tensor)
    X=X.permute(1,0,2)
    return X

In [11]:
class SpeechDataset(Dataset):
    def __init__(self, noisy_files, clean_files):
        super().__init__()
        # list of files
        self.noisy_files = sorted(noisy_files)
        self.clean_files = sorted(clean_files)

        self.len_ = len(self.noisy_files)

        # fixed len
        self.max_len = 165000


    def __len__(self):
        return self.len_

    def load_sample(self, file):
        waveform, _ = torchaudio.load(file)
        return waveform

    def __getitem__(self, index):
        # load to tensors and normalization
        clean_audio = self.load_sample(self.clean_files[index])
        noisy_audio = self.load_sample(self.noisy_files[index])
        clean_audio = self._prepare_sample(clean_audio)
        noisy_audio = self._prepare_sample(noisy_audio)
        clean_audio = signal2pytorch(clean_audio).to(device)
        noisy_audio = signal2pytorch(noisy_audio).to(device)
        return noisy_audio, clean_audio

    def _prepare_sample(self, waveform):
        waveform = waveform.numpy()
        current_len = waveform.shape[1]

        output = np.zeros((1, self.max_len), dtype='float32')
        output[0, -current_len:] = waveform[0, :self.max_len]
        output = torch.from_numpy(output)

        return output




In [14]:
TRAIN_INPUT_DIR = Path('/content/noisy_trainset_wav')
TRAIN_TARGET_DIR = Path('/content/clean_trainset_wav')


TEST_INPUT_DIR = Path('/content/noisy_testset_wav')
TEST_TARGET_DIR = Path('/content/clean_testset_wav')

train_input_files = sorted(list(TRAIN_INPUT_DIR.rglob('*.wav')))
train_target_files = sorted(list(TRAIN_TARGET_DIR.rglob('*.wav')))
print(train_input_files)
print(train_target_files)


test_input_files = sorted(list(TEST_INPUT_DIR.rglob('*.wav')))
test_target_files = sorted(list(TEST_TARGET_DIR.rglob('*.wav')))

print("No. of Training files:",len(train_input_files))
# print("No. of Testing files:",len(test_noisy_files))
train_dataset = SpeechDataset(train_input_files, train_target_files)
train_loader = DataLoader(train_dataset, batch_size=30, shuffle=True)

test_dataset = SpeechDataset(test_input_files, test_target_files)
test_loader = DataLoader(test_dataset, batch_size=30, shuffle=True)

[PosixPath('/content/noisy_testset_wav/p232_001.wav'), PosixPath('/content/noisy_testset_wav/p232_002.wav'), PosixPath('/content/noisy_testset_wav/p232_003.wav'), PosixPath('/content/noisy_testset_wav/p232_005.wav'), PosixPath('/content/noisy_testset_wav/p232_006.wav'), PosixPath('/content/noisy_testset_wav/p232_007.wav'), PosixPath('/content/noisy_testset_wav/p232_009.wav'), PosixPath('/content/noisy_testset_wav/p232_010.wav'), PosixPath('/content/noisy_testset_wav/p232_011.wav'), PosixPath('/content/noisy_testset_wav/p232_012.wav'), PosixPath('/content/noisy_testset_wav/p232_013.wav'), PosixPath('/content/noisy_testset_wav/p232_014.wav'), PosixPath('/content/noisy_testset_wav/p232_015.wav'), PosixPath('/content/noisy_testset_wav/p232_016.wav'), PosixPath('/content/noisy_testset_wav/p232_017.wav'), PosixPath('/content/noisy_testset_wav/p232_019.wav'), PosixPath('/content/noisy_testset_wav/p232_020.wav'), PosixPath('/content/noisy_testset_wav/p232_021.wav'), PosixPath('/content/noisy_t

In [None]:
# import IPython.display as ipd
#TESING PUPOSES
# for batch_idx, (noisy_audio, clean_audio) in enumerate(train_loader):
#   print("NS",noisy_audio.shape)
#   print("CS",clean_audio.shape)
#   temp_audio = noisy_audio[0]
#   temp_audio=np.array(temp_audio.cpu())
#   xrek_noisy=temp_audio[:,0,:]
#   print("xrek_noisy ",xrek_noisy)
#   display(ipd.Audio(xrek_noisy, rate=48000));
#   xrek_clean=clean_audio[0][:,0,:]
#   print("xrek_clean ",xrek_clean)
#   display(ipd.Audio(xrek_clean.cpu(), rate=48000));
#   break

In [15]:
class ScaledDotProductAttention(nn.Module):
    def __init__(self, temperature, attn_dropout=0.1):
        super().__init__()
        self.temperature = temperature
        self.dropout = nn.Dropout(attn_dropout)

    def forward(self, q, k, v, mask=None):

        attn = torch.matmul(q / self.temperature, k.transpose(2, 3))

        if mask is not None:
            attn = attn.masked_fill(mask == 0, -1e9)

        attn = self.dropout(F.softmax(attn, dim=-1))
        output = torch.matmul(attn, v)

        return output, attn


class MultiHeadAttention(nn.Module):
    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super().__init__()

        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v

        self.w_qs = nn.Linear(d_model, n_head * d_k, bias=False)
        self.w_ks = nn.Linear(d_model, n_head * d_k, bias=False)
        self.w_vs = nn.Linear(d_model, n_head * d_v, bias=False)
        self.fc = nn.Linear(n_head * d_v, d_model, bias=False)

        self.attention = ScaledDotProductAttention(temperature=d_k ** 0.5)

        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)


    def forward(self, q, k, v, mask=None):

        d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
        sz_b, len_q, len_k, len_v = q.size(0), q.size(1), k.size(1), v.size(1)

        residual = q
        q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
        k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
        v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)
        q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
        if mask is not None:
            mask = mask.unsqueeze(1)
        q, attn = self.attention(q, k, v, mask=mask)
        q = q.transpose(1, 2).contiguous().view(sz_b, len_q, -1)
        q = self.dropout(self.fc(q))
        q = q + residual
        q = self.layer_norm(q)
        return q, attn

In [16]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_in, d_hid, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_in, d_hid) # position-wise
        self.w_2 = nn.Linear(d_hid, d_in) # position-wise
        self.layer_norm = nn.LayerNorm(d_in, eps=1e-6)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x):
        residual = x
        x = self.w_2(F.relu(self.w_1(x)))
        x = self.dropout(x)
        x = residual
        x = self.layer_norm(x)
        return x

In [17]:
def get_subsequent_mask(seq):
    sz_b, len_s = seq.size()
    subsequent_mask = (1 - torch.triu(
        torch.ones((1, len_s, len_s), device=seq.device), diagonal=1)).bool()
    return subsequent_mask

In [18]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_hid, n_position=200):
        super(PositionalEncoding, self).__init__()
        self.register_buffer('pos_table', self._get_sinusoid_encoding_table(n_position, d_hid))

    def _get_sinusoid_encoding_table(self, n_position, d_hid):
        def get_position_angle_vec(position):
            return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]

        sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
        sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
        sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1

        return torch.FloatTensor(sinusoid_table).unsqueeze(0)

    def forward(self, x):
        return x + self.pos_table[:, :x.size(1)].clone().detach()

In [19]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.0):
        super(EncoderLayer, self).__init__()
        self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
        self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)

    def forward(self, enc_input, slf_attn_mask=None):
        enc_output, enc_slf_attn = self.slf_attn(
            enc_input, enc_input, enc_input, mask=slf_attn_mask)
        enc_output = self.pos_ffn(enc_output)
        return enc_output, enc_slf_attn

In [20]:
class TransformerEncoder(nn.Module):
    def __init__(
            self, d_word_vec=512, n_layers=2, n_head=8, d_k=64, d_v=64,
            d_model=512, d_inner=2048, dropout=0.1, n_position=624, scale_emb=False):

        super().__init__()

        if n_position > 0:
            self.position_enc = PositionalEncoding(d_word_vec, n_position=n_position)
        else:
            self.position_enc = lambda x: x
        self.dropout = nn.Dropout(p=dropout)
        self.layer_stack = nn.ModuleList([
            EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
            for _ in range(n_layers)])
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.scale_emb = scale_emb
        self.d_model = d_model

    def forward(self, src_seq, src_mask, return_attns=False):

        enc_slf_attn_list = []
        enc_output = src_seq
        if self.scale_emb:
            enc_output *= self.d_model ** 0.5
        enc_output = self.dropout(self.position_enc(enc_output))
        enc_output = self.layer_norm(enc_output)

        for enc_layer in self.layer_stack:
            enc_output, enc_slf_attn = enc_layer(enc_output, slf_attn_mask=src_mask)
            enc_slf_attn_list = enc_slf_attn_list+[enc_slf_attn] if return_attns else []

        if return_attns:
            return enc_output, enc_slf_attn_list
        return enc_output

In [21]:
def weight_scaling_init(layer):
    w = layer.weight.detach()
    alpha = 10.0 * w.std()
    layer.weight.data /= torch.sqrt(alpha)
    layer.bias.data /= torch.sqrt(alpha)

In [23]:
def padding(x, D, K, S):
    L = x.shape[-1]
    for _ in range(D):
        if L < K:
            L = 1
        else:
            L = 1 + np.ceil((L - K) / S)

    for _ in range(D):
        L = (L - 1) * S + K

    L = int(L)
    x = F.pad(x, (0, L - x.shape[-1]))
    return x


class UNet_attention(nn.Module):
    def __init__(self, channels_input=1, channels_output=1,
                 channels_H=64, max_H=768,
                 encoder_n_layers=8, kernel_size=4, stride=2,
                 tsfm_n_layers=3,
                 tsfm_n_head=8,
                 tsfm_d_model=512,
                 tsfm_d_inner=2048):

        super(UNet_attention, self).__init__()

        self.channels_input = channels_input
        self.channels_output = channels_output
        self.channels_H = channels_H
        self.max_H = max_H
        self.encoder_n_layers = encoder_n_layers
        self.kernel_size = kernel_size
        self.stride = stride

        self.tsfm_n_layers = tsfm_n_layers
        self.tsfm_n_head = tsfm_n_head
        self.tsfm_d_model = tsfm_d_model
        self.tsfm_d_inner = tsfm_d_inner

        # encoder and decoder
        self.encoder = nn.ModuleList()
        self.decoder = nn.ModuleList()

        for i in range(encoder_n_layers):
            self.encoder.append(nn.Sequential(
                nn.Conv1d(channels_input, channels_H, kernel_size, stride),
                nn.ReLU(),
                nn.Conv1d(channels_H, channels_H * 2, 1),
                nn.GLU(dim=1)
            ))
            channels_input = channels_H

            if i == 0:
                # no relu at end
                self.decoder.append(nn.Sequential(
                    nn.Conv1d(channels_H, channels_H * 2, 1),
                    nn.GLU(dim=1),
                    nn.ConvTranspose1d(channels_H, channels_output, kernel_size, stride)
                ))
            else:
                self.decoder.insert(0, nn.Sequential(
                    nn.Conv1d(channels_H, channels_H * 2, 1),
                    nn.GLU(dim=1),
                    nn.ConvTranspose1d(channels_H, channels_output, kernel_size, stride),
                    nn.ReLU()
                ))
            channels_output = channels_H

            # double H but keep below max_H
            channels_H *= 2
            channels_H = min(channels_H, max_H)

        # self attention block
        self.tsfm_conv1 = nn.Conv1d(channels_output, tsfm_d_model, kernel_size=1)
        self.tsfm_encoder = TransformerEncoder(d_word_vec=tsfm_d_model,
                                               n_layers=tsfm_n_layers,
                                               n_head=tsfm_n_head,
                                               d_k=tsfm_d_model // tsfm_n_head,
                                               d_v=tsfm_d_model // tsfm_n_head,
                                               d_model=tsfm_d_model,
                                               d_inner=tsfm_d_inner,
                                               dropout=0.0,
                                               n_position=0,
                                               scale_emb=False)
        self.tsfm_conv2 = nn.Conv1d(tsfm_d_model, channels_output, kernel_size=1)

        # weight scaling initialization
        for layer in self.modules():
            if isinstance(layer, (nn.Conv1d, nn.ConvTranspose1d)):
                weight_scaling_init(layer)

    def forward(self, noisy_audio):
        # (B, L) -> (B, C, L)
        if len(noisy_audio.shape) == 2:
            noisy_audio = noisy_audio.unsqueeze(1)
        B, C, L = noisy_audio.shape
        assert C == 1

        # normalization and padding
        std = noisy_audio.std(dim=2, keepdim=True) + 1e-3
        noisy_audio /= std
        x = padding(noisy_audio, self.encoder_n_layers, self.kernel_size, self.stride)

        # encoder
        skip_connections = []
        for downsampling_block in self.encoder:
            x = downsampling_block(x)
            skip_connections.append(x)
        skip_connections = skip_connections[::-1]

        len_s = x.shape[-1]  # length at bottleneck
        attn_mask = (1 - torch.triu(torch.ones((1, len_s, len_s), device=x.device), diagonal=1)).bool()

        x = self.tsfm_conv1(x)  # C 1024 -> 512
        x = x.permute(0, 2, 1)
        x = self.tsfm_encoder(x, src_mask=attn_mask)
        x = x.permute(0, 2, 1)
        x = self.tsfm_conv2(x)  # C 512 -> 1024

        # decoder
        for i, upsampling_block in enumerate(self.decoder):
            skip_i = skip_connections[i]
            x = x + skip_i[:, :, :x.shape[-1]]
            x = upsampling_block(x)

        x = x[:, :, :L] * std
        return x

In [24]:
network_config = {
        "channels_input": 1,
        "channels_output": 1,
        "channels_H": 64,
        "max_H": 768,
        "encoder_n_layers": 8,
        "kernel_size": 4,
        "stride": 2,
        "tsfm_n_layers": 5,
        "tsfm_n_head": 8,
        "tsfm_d_model": 512,
        "tsfm_d_inner": 2048
    }
model = UNet_attention(**network_config).cuda()

In [25]:
def apply_reduction(losses, reduction="none"):
    if reduction == "mean":
        losses = losses.mean()
    elif reduction == "sum":
        losses = losses.sum()
    return losses

class SNRLoss(torch.nn.Module):

    def __init__(self, zero_mean=True, eps=1e-8, reduction="mean"):
        super(SNRLoss, self).__init__()
        self.zero_mean = zero_mean
        self.eps = eps
        self.reduction = reduction

    def forward(self, input, target):
        if self.zero_mean:
            input_mean = torch.mean(input, dim=-1, keepdim=True)
            target_mean = torch.mean(target, dim=-1, keepdim=True)
            input = input - input_mean
            target = target - target_mean

        res = input - target
        # print("res ",res)
        losses = 10 * torch.log10(
            (target ** 2).sum(-1) / ((res ** 2).sum(-1) + self.eps) + self.eps
        )
        # print("lossess", losses)
        losses = apply_reduction(losses, self.reduction)
        return -losses

In [26]:
!pip install auraloss

Collecting auraloss
  Downloading auraloss-0.4.0-py3-none-any.whl (16 kB)
Installing collected packages: auraloss
Successfully installed auraloss-0.4.0


# Use either of the two loss function

In [31]:
# import auraloss
# model = UNet_attention(**network_config).cuda()
# loss_fn = auraloss.time.LogCoshLoss()

In [30]:
model = UNet_attention(**network_config).cuda()
loss_fn = SNRLoss()
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)#, betas=(0.9, 0.999))


In [28]:
# torch.save(model, "/content/drive/MyDrive/DL_Project/clean_unet_clean_snr_full_ft/full_model.pt")
# torch.save(model, f"/content/drive/MyDrive/DL_Project/clean_unet_clean_snr_full_ft/full_model.pt")


In [29]:
# pre load models
model.load_state_dict(torch.load("/content/drive/MyDrive/DL_Project/clean_unet_clean_snr_full_ft/999.pth"))
model.train()

UNet_attention(
  (encoder): ModuleList(
    (0): Sequential(
      (0): Conv1d(1, 64, kernel_size=(4,), stride=(2,))
      (1): ReLU()
      (2): Conv1d(64, 128, kernel_size=(1,), stride=(1,))
      (3): GLU(dim=1)
    )
    (1): Sequential(
      (0): Conv1d(64, 128, kernel_size=(4,), stride=(2,))
      (1): ReLU()
      (2): Conv1d(128, 256, kernel_size=(1,), stride=(1,))
      (3): GLU(dim=1)
    )
    (2): Sequential(
      (0): Conv1d(128, 256, kernel_size=(4,), stride=(2,))
      (1): ReLU()
      (2): Conv1d(256, 512, kernel_size=(1,), stride=(1,))
      (3): GLU(dim=1)
    )
    (3): Sequential(
      (0): Conv1d(256, 512, kernel_size=(4,), stride=(2,))
      (1): ReLU()
      (2): Conv1d(512, 1024, kernel_size=(1,), stride=(1,))
      (3): GLU(dim=1)
    )
    (4): Sequential(
      (0): Conv1d(512, 768, kernel_size=(4,), stride=(2,))
      (1): ReLU()
      (2): Conv1d(768, 1536, kernel_size=(1,), stride=(1,))
      (3): GLU(dim=1)
    )
    (5-7): 3 x Sequential(
      (0):

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# print("Generate Model:")
# loss_tracker=[]
# unet = UNet(in_channels=1, out_channels=1).to("cuda")  # Adjust the input and output channels
# print('Total number of parameters: %i' % (sum(p.numel() for p in unet.parameters() if p.requires_grad)))
# print("Def. loss function:")
# loss_fn = nn.MSELoss()  #MSE

learning_rate = 1e-5
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)#, betas=(0.9, 0.999))

epochs=1000
loss_tracker=[]
for epoch in range(epochs):
    curr_loss=0
    for batch_idx, (noisy_audio, clean_audio) in enumerate(train_loader):
        # Convert data to PyTorch tensors and move them to the device
        noisy_audio = noisy_audio.to(device)
        clean_audio = clean_audio.to(device)
        optimizer.zero_grad()
        Ypred = model(noisy_audio[0])
        clean_audio_idx=clean_audio[0]
        outputlen=len(Ypred[0,0,:])
        clean_audio_trunc=clean_audio_idx[:,:,:outputlen]
        loss = loss_fn(Ypred, clean_audio_trunc)
        loss.backward()
        optimizer.step()
        print(f"Epoch [{epoch + 1}/{epochs}], Batch [{batch_idx + 1}/{len(train_loader)}], Loss: {loss.item():.10f}")
        curr_loss+=loss.item()
    if epoch % 500 == 0:
      torch.save(model.state_dict(), f"/content/drive/MyDrive/DL_Project/clean_unet_clean_mse_full_ft/{epoch}.pth")
      loss_tracker.append(curr_loss)

# torch.save(model.state_dict(), f"/content/drive/MyDrive/DL_Project/clean_unet_clean_mse_full_ft/{epoch}.pth")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch [822/1000], Batch [13/28], Loss: 0.0000505581
Epoch [822/1000], Batch [14/28], Loss: 0.0000465551
Epoch [822/1000], Batch [15/28], Loss: 0.0000200630
Epoch [822/1000], Batch [16/28], Loss: 0.0000852602
Epoch [822/1000], Batch [17/28], Loss: 0.0000419697
Epoch [822/1000], Batch [18/28], Loss: 0.0000203767
Epoch [822/1000], Batch [19/28], Loss: 0.0000123873
Epoch [822/1000], Batch [20/28], Loss: 0.0000073831
Epoch [822/1000], Batch [21/28], Loss: 0.0000133291
Epoch [822/1000], Batch [22/28], Loss: 0.0000465872
Epoch [822/1000], Batch [23/28], Loss: 0.0000406592
Epoch [822/1000], Batch [24/28], Loss: 0.0000440022
Epoch [822/1000], Batch [25/28], Loss: 0.0000128394
Epoch [822/1000], Batch [26/28], Loss: 0.0000316846
Epoch [822/1000], Batch [27/28], Loss: 0.0000132415
Epoch [822/1000], Batch [28/28], Loss: 0.0000231650
Epoch [823/1000], Batch [1/28], Loss: 0.0000500730
Epoch [823/1000], Batch [2/28], Loss: 0.0000455438
E

In [None]:
#Test Predictions
#p232_015
import IPython.display as ipd
ww = model.state_dict()   #read obtained weights
noisy_audio_test, ntsamplerate = librosa.load("/content/noisy_testset_wav/p232_005.wav", mono=False, sr=None)
display(ipd.Audio(noisy_audio_test, rate=ntsamplerate));
noisy_audio_norm_test = noisy_audio_test/np.abs(noisy_audio_test.max())
noisy_audio_norm_test_q=signal2pytorch(noisy_audio_norm_test).to(device)
print(noisy_audio_norm_test_q.shape)
predictions=model(noisy_audio_norm_test_q)
print(predictions)
# predictions=model(predictions)
 # Make Predictions based on the obtained weights, on training set
predictions=predictions.cpu().detach()
predictions=np.array(predictions)
print(predictions)
xrek=predictions[:,0,:]  #remove unnecessary dimension for playback

torch.Size([1, 1, 299838])
tensor([[[0.0182, 0.0266, 0.0189,  ..., 0.0357, 0.0322, 0.0310]]],
       device='cuda:0', grad_fn=<MulBackward0>)
[[[0.01821896 0.02657879 0.01887815 ... 0.03569153 0.03220579 0.03095547]]]


In [None]:
import IPython.display as ipd
print(ntsamplerate)
display(ipd.Audio(xrek, rate=ntsamplerate));

48000


# METRICS

In [None]:
!pip3 install pesq
!pip install torchmetrics


Collecting pesq
  Downloading pesq-0.0.4.tar.gz (38 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pesq
  Building wheel for pesq (setup.py) ... [?25l[?25hdone
  Created wheel for pesq: filename=pesq-0.0.4-cp310-cp310-linux_x86_64.whl size=262926 sha256=2538f3f681c646481e5e5181b29e19c0ac7be64625bb13bfa18a09fccdc4c625
  Stored in directory: /root/.cache/pip/wheels/c5/4e/2c/251524370c0fdd659e99639a0fbd0ca5a782c3aafcd456b28d
Successfully built pesq
Installing collected packages: pesq
Successfully installed pesq-0.0.4
Collecting torchmetrics
  Downloading torchmetrics-1.2.1-py3-none-any.whl (806 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.1/806.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.10.0-py3-none-any.whl (24 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-u

In [None]:
import math
from torchmetrics.audio import SignalDistortionRatio

def signalPower(x):
    # print(x)
    return np.average(x**2)
# def SNR(signal, noise):
#     powS = signalPower(signal)
#     powN = signalPower(noise)
#     return 10*math.log10(math.abs(powS-powN)/powN)
def SNRsystem(inputSig, outputSig):
    noise = outputSig-inputSig

    powS = signalPower(outputSig)
    powN = signalPower(noise)
    return 10*math.log10(abs((powS-powN))/powN)

def calculate_snr(clean_audio, noisy_audio):
    method2 = SNRsystem(clean_audio,noisy_audio)
    # print("Result Method 2: {} dB".format(method2))
    return method2

def calculate_sdr(clean_audio, noisy_audio):
  sdr = SignalDistortionRatio().to("cpu")
  sdr_calc = sdr(clean_audio, noisy_audio)
  return sdr_calc

In [32]:
from pesq import pesq
pesq_og_tracker=[]
pesq_pred_tracker=[]
snr_og_tracker=[]
snr_pred_tracker=[]
sdr_og_tracker=[]
sdr_pred_tracker=[]
batch_size=30
model.eval()
for batch_idx, (noisy_audio, clean_audio) in enumerate(train_loader):
  print("processing batch ", batch_idx)
  for i in range (0,30):
    # print("NS",noisy_audio.shape)
    # print("CS",clean_audio.shape)
    temp_audio_noisy = noisy_audio[i]
    temp_audio_noisy=np.array(temp_audio_noisy.cpu())
    xrek_noisy=temp_audio_noisy[:,0,:]
    xrek_noisy=xrek_noisy[0]
    # print("xrek_noisy ",xrek_noisy.shape)
    # display(ipd.Audio(xrek_noisy, rate=48000));
    temp_audio_clean = clean_audio[i]
    temp_audio_clean=np.array(temp_audio_clean.cpu())
    xrek_clean=temp_audio_clean[:,0,:]
    xrek_clean=xrek_clean[0]
    # print("xrek_clean ",xrek_clean.shape)
    # display(ipd.Audio(xrek_clean, rate=48000));
    pesqd_og=pesq(16000,xrek_clean,xrek_noisy,'wb')
    snr_og=calculate_snr(xrek_clean,xrek_noisy)
    pesq_og_tracker.append(pesqd_og)
    snr_og_tracker.append(snr_og)
    # sdr_og=calculate_sdr(xrek_clean,xrek_noisy)
    # sdr_og_tracker.append(sdr_og)
    # print(pesqd_og)
    torch.no_grad()
    predictions=model(noisy_audio[i])
    # print(predictions)
    predictions_np=predictions.cpu().detach()
    predictions_np=np.array(predictions_np)
    xrek_pred=predictions_np[:,0,:]
    xrek_pred=xrek_pred[0]
    # print("xrek_pred ",xrek_pred.shape)
    # display(ipd.Audio(xrek_pred, rate=ntsamplerate));
    pesqd_pred=pesq(16000,xrek_clean,xrek_pred,'wb')
    pesq_pred_tracker.append(pesqd_pred)
    snr_pred=calculate_snr(xrek_clean,xrek_pred)
    snr_pred_tracker.append(snr_pred)
    # sdr_pred=calculate_sdr(clean_audio[i].cpu(),predictions.cpu())
    # sdr_pred_tracker.append(sdr_pred)
    torch.cuda.empty_cache()
    # print(pesqd_pred)


In [None]:
#1000 epochs Logcosh()
print(np.average(pesq_og_tracker))
print(np.average(pesq_pred_tracker))
print(np.average(snr_og_tracker))
print(np.average(snr_pred_tracker))
print(np.average(sdr_og_tracker))
print(np.average(sdr_pred_tracker))

1.86344819219367
2.315099320944073
8.417508879494218
10.583247290667972
nan
nan


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
