In [None]:
import time
import pickle
import warnings
import gc
import copy
import numpy as np
import torch
import torch.nn as nn
import torchaudio
from tqdm import tqdm, tqdm_notebook
from torch.utils.data import Dataset, DataLoader
from matplotlib import colors, pyplot as plt
from IPython.display import clear_output
from pathlib import Path
import IPython.display as ipd


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import librosa
import os

# Define your custom dataset class
def signal2pytorch(x):
    X = np.expand_dims(x, axis=0)  #add channels dimension (here only 1 channel)
    if len(x.shape)==1: #mono:
        X = np.expand_dims(X, axis=0)  #add batch dimension (here only 1 batch)
    X=torch.from_numpy(X)
    X=X.type(torch.Tensor)
    X=X.permute(1,0,2)  #make batch dimension first
    return X

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!cp -r /content/drive/MyDrive/DL_Project/Clean /content/clean ./

!cp -r /content/drive/MyDrive/DL_Project/Noisy  /content/noisy ./

cp: cannot stat '/content/clean': No such file or directory
cp: cannot stat '/content/noisy': No such file or directory


In [None]:
!cp -r /content/drive/MyDrive/DL_Project/TestData/Clean/Clean_1 /content/clean_1 ./

!cp -r /content/drive/MyDrive/DL_Project/TestData/Noisy/Noisy_1  /content/noisy_1 ./

cp: cannot stat '/content/clean_1': No such file or directory
cp: cannot stat '/content/noisy_1': No such file or directory


In [None]:
!unzip /content/Clean/clean_trainset_wav.zip
!unzip /content/Noisy/noisy_trainset_wav.zip

In [None]:

!unzip /content/Clean/clean_testset_wav.zip
!unzip /content/Noisy/noisy_testset_wav.zip

Archive:  /content/Clean/clean_testset_wav.zip
   creating: clean_testset_wav/
  inflating: clean_testset_wav/p232_001.wav  
  inflating: clean_testset_wav/p232_002.wav  
  inflating: clean_testset_wav/p232_003.wav  
  inflating: clean_testset_wav/p232_005.wav  
  inflating: clean_testset_wav/p232_006.wav  
  inflating: clean_testset_wav/p232_007.wav  
  inflating: clean_testset_wav/p232_009.wav  
  inflating: clean_testset_wav/p232_010.wav  
  inflating: clean_testset_wav/p232_011.wav  
  inflating: clean_testset_wav/p232_012.wav  
  inflating: clean_testset_wav/p232_013.wav  
  inflating: clean_testset_wav/p232_014.wav  
  inflating: clean_testset_wav/p232_015.wav  
  inflating: clean_testset_wav/p232_016.wav  
  inflating: clean_testset_wav/p232_017.wav  
  inflating: clean_testset_wav/p232_019.wav  
  inflating: clean_testset_wav/p232_020.wav  
  inflating: clean_testset_wav/p232_021.wav  
  inflating: clean_testset_wav/p232_022.wav  
  inflating: clean_testset_wav/p232_023.wav  
 

In [None]:
SAMPLE_RATE = 48000
N_FFT = (SAMPLE_RATE * 64) // 1000
HOP_LENGTH = (SAMPLE_RATE * 16) // 1000
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
class SpeechDataset(Dataset):
    def __init__(self, noisy_files, clean_files):
        super().__init__()
        # list of files
        self.noisy_files = sorted(noisy_files)
        self.clean_files = sorted(clean_files)

        self.len_ = len(self.noisy_files)

        # fixed len
        self.max_len = 165000


    def __len__(self):
        return self.len_

    def load_sample(self, file):
        waveform, _ = torchaudio.load(file)
        return waveform

    def __getitem__(self, index):
        # load to tensors and normalization
        clean_audio = self.load_sample(self.clean_files[index])
        noisy_audio = self.load_sample(self.noisy_files[index])
        clean_audio = self._prepare_sample(clean_audio)
        noisy_audio = self._prepare_sample(noisy_audio)
        clean_audio = signal2pytorch(clean_audio).to(device)
        noisy_audio = signal2pytorch(noisy_audio).to(device)
        return noisy_audio, clean_audio

    def _prepare_sample(self, waveform):
        waveform = waveform.numpy()
        current_len = waveform.shape[1]

        output = np.zeros((1, self.max_len), dtype='float32')
        output[0, -current_len:] = waveform[0, :self.max_len]
        output = torch.from_numpy(output)

        return output




In [None]:
TRAIN_INPUT_DIR = Path('/content/noisy_trainset_wav')
TRAIN_TARGET_DIR = Path('/content/clean_trainset_wav')


TEST_INPUT_DIR = Path('/content/noisy_testset_wav')
TEST_TARGET_DIR = Path('/content/clean_testset_wav')

train_input_files = sorted(list(TRAIN_INPUT_DIR.rglob('*.wav')))
train_target_files = sorted(list(TRAIN_TARGET_DIR.rglob('*.wav')))
print(train_input_files)
print(train_target_files)


test_input_files = sorted(list(TEST_INPUT_DIR.rglob('*.wav')))
test_target_files = sorted(list(TEST_TARGET_DIR.rglob('*.wav')))

print("No. of Training files:",len(train_input_files))
# print("No. of Testing files:",len(test_noisy_files))
train_dataset = SpeechDataset(train_input_files, train_target_files)
train_loader = DataLoader(train_dataset, batch_size=30, shuffle=True)

test_dataset = SpeechDataset(test_input_files, test_target_files)
test_loader = DataLoader(test_dataset, batch_size=30, shuffle=True)

[PosixPath('/content/noisy_testset_wav/p232_001.wav'), PosixPath('/content/noisy_testset_wav/p232_002.wav'), PosixPath('/content/noisy_testset_wav/p232_003.wav'), PosixPath('/content/noisy_testset_wav/p232_005.wav'), PosixPath('/content/noisy_testset_wav/p232_006.wav'), PosixPath('/content/noisy_testset_wav/p232_007.wav'), PosixPath('/content/noisy_testset_wav/p232_009.wav'), PosixPath('/content/noisy_testset_wav/p232_010.wav'), PosixPath('/content/noisy_testset_wav/p232_011.wav'), PosixPath('/content/noisy_testset_wav/p232_012.wav'), PosixPath('/content/noisy_testset_wav/p232_013.wav'), PosixPath('/content/noisy_testset_wav/p232_014.wav'), PosixPath('/content/noisy_testset_wav/p232_015.wav'), PosixPath('/content/noisy_testset_wav/p232_016.wav'), PosixPath('/content/noisy_testset_wav/p232_017.wav'), PosixPath('/content/noisy_testset_wav/p232_019.wav'), PosixPath('/content/noisy_testset_wav/p232_020.wav'), PosixPath('/content/noisy_testset_wav/p232_021.wav'), PosixPath('/content/noisy_t

In [None]:
#TESING PURPOSES
# for batch_idx, (noisy_audio, clean_audio) in enumerate(train_loader):
#   print("NS",noisy_audio.shape)
#   print("CS",clean_audio.shape)
#   temp_audio = noisy_audio[0]
#   temp_audio=np.array(temp_audio.cpu())
#   xrek_noisy=temp_audio[:,0,:]
#   print("xrek_noisy ",xrek_noisy)
#   display(ipd.Audio(xrek_noisy, rate=48000));
#   xrek_clean=clean_audio[0][:,0,:]
#   print("xrek_clean ",xrek_clean)
#   display(ipd.Audio(xrek_clean.cpu(), rate=48000));
#   break

NS torch.Size([30, 1, 1, 165000])
CS torch.Size([30, 1, 1, 165000])
xrek_noisy  [[0.         0.         0.         ... 0.01190476 0.01011596 0.0160375 ]]


xrek_clean  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0013, 0.0009, 0.0008]],
       device='cuda:0')


In [None]:
class Convautoenc(nn.Module):
    def __init__(self):
        super(Convautoenc, self).__init__()
        #Analysis Filterbank with downsampling of N=1024, filter length of 2N, but only N/2 outputs:
        self.conv1=nn.Conv1d(in_channels=1, out_channels=32, kernel_size=2048, stride=1024, padding=1023, bias=True) #Padding for 'same' filters (kernel_size/2-1)

        #Synthesis filter bank:
        self.synconv1=nn.ConvTranspose1d(in_channels=32, out_channels=1, kernel_size=2048, stride=1024, padding=1023, bias=True)

    def encoder(self, x):
        #Analysis:
        # print("Input shape encoder",x.shape)
        x = self.conv1(x)
        y = torch.tanh(x)
        # print("Output shape encoder",y.shape)
        return y

    def decoder(self, y):
        #Synthesis:
        # print("Input shape DECODER",y.shape)
        xrek= self.synconv1(y)
        # print("Output shape DECODER",xrek.shape)
        return xrek

    def forward(self, x):
        y=self.encoder(x)
        #y=torch.round(y/0.125)*0.125
        xrek=self.decoder(y)
        return xrek

In [None]:
def apply_reduction(losses, reduction="none"):
    if reduction == "mean":
        losses = losses.mean()
    elif reduction == "sum":
        losses = losses.sum()
    return losses

class SNRLoss(torch.nn.Module):
    def __init__(self, zero_mean=True, eps=1e-8, reduction="mean"):
        super(SNRLoss, self).__init__()
        self.zero_mean = zero_mean
        self.eps = eps
        self.reduction = reduction

    def forward(self, input, target):
        if self.zero_mean:
            input_mean = torch.mean(input, dim=-1, keepdim=True)
            target_mean = torch.mean(target, dim=-1, keepdim=True)
            input = input - input_mean
            target = target - target_mean

        res = input - target
        # print("res ",res)
        losses = 10 * torch.log10(
            (target ** 2).sum(-1) / ((res ** 2).sum(-1) + self.eps) + self.eps
        )
        # print("lossess", losses)
        losses = apply_reduction(losses, self.reduction)
        return -losses

In [None]:
!pip install auraloss

Collecting auraloss
  Downloading auraloss-0.4.0-py3-none-any.whl (16 kB)
Installing collected packages: auraloss
Successfully installed auraloss-0.4.0


In [None]:
# import auraloss

# mrstft = auraloss.time.LogCoshLoss()

In [None]:
import auraloss
learning_rate = 1e-5
model = Convautoenc().to("cuda")  # Adjust the input and output channels
loss_tracker=[]
loss_fn = auraloss.time.SISDRLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)#, betas=(0.9, 0.999))

In [None]:
# model.load_state_dict(torch.load("/content/drive/MyDrive/DL_Project/normal_checkpoint_snr4900.pth"))
# model.train()

Convautoenc(
  (conv1): Conv1d(1, 32, kernel_size=(2048,), stride=(1024,), padding=(1023,))
  (synconv1): ConvTranspose1d(32, 1, kernel_size=(2048,), stride=(1024,), padding=(1023,))
)

In [None]:
epochs=2000
for epoch in range(epochs):
    curr_loss=0
    for batch_idx, (noisy_audio, clean_audio) in enumerate(train_loader):
        # Convert data to PyTorch tensors and move them to the device
        noisy_audio = noisy_audio.to(device)
        clean_audio = clean_audio.to(device)
        optimizer.zero_grad()
        Ypred = model(noisy_audio[0])
        clean_audio_idx=clean_audio[0]
        outputlen=len(Ypred[0,0,:])
        clean_audio_trunc=clean_audio_idx[:,:,:outputlen]
        loss = loss_fn(Ypred, clean_audio_trunc)
        loss.backward()
        optimizer.step()
        print(f"Epoch [{epoch + 1}/{epochs}], Batch [{batch_idx + 1}/{len(train_loader)}], Loss: {loss.item():.10f}")
        curr_loss+=loss.item()
    if epoch % 500 == 0:
      torch.save(model.state_dict(), f"/content/drive/MyDrive/DL_Project/sa_checkpoint_lcsh_loss_{epoch}.pth")
      loss_tracker.append(curr_loss)
torch.save(model.state_dict(), "your_unet_checkpoint.pth")
print("loss_tracker",loss_tracker)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch [81/2000], Batch [16/28], Loss: 16.6904354095
Epoch [81/2000], Batch [17/28], Loss: 18.4886798859
Epoch [81/2000], Batch [18/28], Loss: 19.0645751953
Epoch [81/2000], Batch [19/28], Loss: 18.5403614044
Epoch [81/2000], Batch [20/28], Loss: 20.0375576019
Epoch [81/2000], Batch [21/28], Loss: 18.6355361938
Epoch [81/2000], Batch [22/28], Loss: 17.5124664307
Epoch [81/2000], Batch [23/28], Loss: 19.3882694244
Epoch [81/2000], Batch [24/28], Loss: 18.2262248993
Epoch [81/2000], Batch [25/28], Loss: 19.8624916077
Epoch [81/2000], Batch [26/28], Loss: 15.9061956406
Epoch [81/2000], Batch [27/28], Loss: 18.2173271179
Epoch [81/2000], Batch [28/28], Loss: 19.1000442505
Epoch [82/2000], Batch [1/28], Loss: 17.9110279083
Epoch [82/2000], Batch [2/28], Loss: 17.6061820984
Epoch [82/2000], Batch [3/28], Loss: 17.0028190613
Epoch [82/2000], Batch [4/28], Loss: 18.3869056702
Epoch [82/2000], Batch [5/28], Loss: 17.7707958221
Epoc

KeyboardInterrupt: ignored

In [None]:
torch.save(model.state_dict(), f"/content/drive/MyDrive/DL_Project/simple_model_checkpoint.pth")

In [None]:
print(loss_tracker)

[0.422554568387568]


In [None]:
#Test Predictions
#p232_015
import IPython.display as ipd
ww = model.state_dict()   #read obtained weights
noisy_audio_test, ntsamplerate = librosa.load("/content/noisy_testset_wav/p232_005.wav", mono=False, sr=None)
display(ipd.Audio(noisy_audio_test, rate=ntsamplerate));
noisy_audio_norm_test = noisy_audio_test/np.abs(noisy_audio_test.max())
noisy_audio_norm_test_q=signal2pytorch(noisy_audio_norm_test).to(device)
print(noisy_audio_norm_test_q.shape)
predictions=model(noisy_audio_norm_test_q)
print(predictions)
# predictions=model(predictions)
 # Make Predictions based on the obtained weights, on training set
predictions=predictions.cpu().detach()
predictions=np.array(predictions)
print(predictions)
xrek=predictions[:,0,:]  #remove unnecessary dimension for playback

torch.Size([1, 1, 299838])
tensor([[[-0.0027, -0.0024, -0.0012,  ..., -0.0021,  0.0009, -0.0006]]],
       device='cuda:0', grad_fn=<ConvolutionBackward0>)
[[[-0.00271537 -0.00242795 -0.00120523 ... -0.00209441  0.00089706
   -0.00061665]]]


In [None]:
import IPython.display as ipd
print(ntsamplerate)
display(ipd.Audio(xrek, rate=ntsamplerate));

48000


In [None]:
!pip3 install pesq
!pip install torchmetrics

Collecting pesq
  Downloading pesq-0.0.4.tar.gz (38 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pesq
  Building wheel for pesq (setup.py) ... [?25l[?25hdone
  Created wheel for pesq: filename=pesq-0.0.4-cp310-cp310-linux_x86_64.whl size=262925 sha256=ade04de7c2d72e1c92c59db880b848522888d2508863c3b0ce5a118eb159d22e
  Stored in directory: /root/.cache/pip/wheels/c5/4e/2c/251524370c0fdd659e99639a0fbd0ca5a782c3aafcd456b28d
Successfully built pesq
Installing collected packages: pesq
Successfully installed pesq-0.0.4
Collecting torchmetrics
  Downloading torchmetrics-1.2.1-py3-none-any.whl (806 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.1/806.1 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.10.0-py3-none-any.whl (24 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-u

In [None]:
import math
from torchmetrics.audio import SignalDistortionRatio

def signalPower(x):
    # print(x)
    return np.average(x**2)
# def SNR(signal, noise):
#     powS = signalPower(signal)
#     powN = signalPower(noise)
#     return 10*math.log10(math.abs(powS-powN)/powN)
def SNRsystem(inputSig, outputSig):
    noise = outputSig-inputSig

    powS = signalPower(outputSig)
    powN = signalPower(noise)
    return 10*math.log10(abs((powS-powN))/powN)

def calculate_snr(clean_audio, noisy_audio):
    method2 = SNRsystem(clean_audio,noisy_audio)
    # print("Result Method 2: {} dB".format(method2))
    return method2

def calculate_sdr(clean_audio, noisy_audio):
  sdr = SignalDistortionRatio().to("cpu")
  sdr_calc = sdr(clean_audio, noisy_audio)
  return sdr_calc

In [1]:
from pesq import pesq
pesq_og_tracker=[]
pesq_pred_tracker=[]
snr_og_tracker=[]
snr_pred_tracker=[]
sdr_og_tracker=[]
sdr_pred_tracker=[]
batch_size=30
model.eval()
for batch_idx, (noisy_audio, clean_audio) in enumerate(train_loader):
  print("processing batch ", batch_idx)
  for i in range (0,30):
    # print("NS",noisy_audio.shape)
    # print("CS",clean_audio.shape)
    temp_audio_noisy = noisy_audio[i]
    temp_audio_noisy=np.array(temp_audio_noisy.cpu())
    xrek_noisy=temp_audio_noisy[:,0,:]
    xrek_noisy=xrek_noisy[0]
    # print("xrek_noisy ",xrek_noisy.shape)
    # display(ipd.Audio(xrek_noisy, rate=48000));
    temp_audio_clean = clean_audio[i]
    temp_audio_clean=np.array(temp_audio_clean.cpu())
    xrek_clean=temp_audio_clean[:,0,:]
    xrek_clean=xrek_clean[0]
    # print("xrek_clean ",xrek_clean.shape)
    # display(ipd.Audio(xrek_clean, rate=48000));
    # pesqd_og=pesq(16000,xrek_clean,xrek_noisy,'wb')
    # snr_og=calculate_snr(xrek_clean,xrek_noisy)
    # pesq_og_tracker.append(pesqd_og)
    # snr_og_tracker.append(snr_og)
    # sdr_og=calculate_sdr(xrek_clean,xrek_noisy)
    # sdr_og_tracker.append(sdr_og)
    # print(pesqd_og)
    torch.no_grad()
    predictions=model(noisy_audio[i])
    # print(predictions)
    predictions_np=predictions.cpu().detach()
    predictions_np=np.array(predictions_np)
    xrek_pred=predictions_np[:,0,:]
    xrek_pred=xrek_pred[0]
    # print("xrek_pred ",xrek_pred.shape)
    # display(ipd.Audio(xrek_pred, rate=ntsamplerate));
    min_length = min(len(xrek_clean), len(xrek_pred))
    xrek_clean = xrek_clean[:min_length]
    xrek_pred = xrek_pred[:min_length]
    pesqd_pred=pesq(16000,xrek_clean,xrek_pred,'wb')
    pesq_pred_tracker.append(pesqd_pred)
    snr_pred=calculate_snr(xrek_clean,xrek_pred)
    snr_pred_tracker.append(snr_pred)
    # sdr_pred=calculate_sdr(clean_audio[i].cpu(),predictions.cpu())
    # sdr_pred_tracker.append(sdr_pred)
    torch.cuda.empty_cache()
    # print(pesqd_pred)


In [None]:
#300 epochs SIDR Loss()
print(np.average(pesq_og_tracker))
print(np.average(pesq_pred_tracker))
print(np.average(snr_og_tracker))
print(np.average(snr_pred_tracker))
print(np.average(sdr_og_tracker))
print(np.average(sdr_pred_tracker))

nan
1.04630277211805
nan
-0.028832905101080865
nan
nan


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
