In [1]:
from pathlib import Path
import os
import numpy as np
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader


In [2]:
# Noise class whose dataset folders are used throughout this notebook.
noise_class = 3

In [3]:
# Folder names embed the selected noise class; the clean test set is shared.
TRAIN_INPUT_DIR = Path(f'Datasets/US_Class{noise_class}_Train_Input')
TRAIN_TARGET_DIR = Path(f'Datasets/US_Class{noise_class}_Train_Output')
TEST_NOISY_DIR = Path(f'Datasets/US_Class{noise_class}_Test_Input')
TEST_CLEAN_DIR = Path('Datasets/clean_testset_wav')

In [4]:
# STFT geometry, expressed in samples and derived from the sample rate.
SAMPLE_RATE = 48000
N_FFT = SAMPLE_RATE * 64 // 1000  # number of samples in 64 ms
HOP_LENGTH = SAMPLE_RATE * 16 // 1000  # number of samples in 16 ms

In [5]:
 class SpeechDataset(Dataset):
     """Map-style dataset of paired noisy/target recordings, served as STFTs.

     Both file lists are sorted independently and then paired by position, so
     the two collections must sort into the same order for the pairs to line up.
     Every waveform is forced to exactly ``max_len`` samples before the
     transform, so all items share one spectrogram shape.
     """

     def __init__(self, noisy_files, target_files, n_fft=64, hop_length=16):
         """Store the file lists and the STFT settings.

         Args:
             noisy_files: Iterable of paths to the noisy input recordings.
             target_files: Iterable of paths to the target recordings.
             n_fft: FFT size handed to ``torch.stft``.
             hop_length: Hop between frames handed to ``torch.stft``.
         """
         super().__init__()

         self.noisy_files = sorted(noisy_files)
         self.target_files = sorted(target_files)

         self.n_fft = n_fft
         self.hop_length = hop_length

         # The dataset length follows the noisy list only; a shorter target
         # list surfaces as an IndexError when the missing item is requested.
         self.len_ = len(self.noisy_files)

         # Fixed waveform length (in samples) enforced by prepare_sample.
         self.max_len = 150000

     def __len__(self):
         """Return the number of noisy files (one item per noisy file)."""
         return self.len_

     def __getitem__(self, index):
         """Load one noisy/target pair and return their complex STFTs.

         Args:
             index: Position of the pair in the sorted file lists.

         Returns:
             Tuple ``(x_noisy_stft, x_target_stft)`` of complex tensors.
         """
         x_target = self.prepare_sample(self.load_sample(str(self.target_files[index])))
         x_noisy = self.prepare_sample(self.load_sample(str(self.noisy_files[index])))

         x_noisy_stft = self._stft(x_noisy)
         x_target_stft = self._stft(x_target)
         return x_noisy_stft, x_target_stft

     # The original method names are kept so existing explicit calls still work.
     _len = __len__
     getitem = __getitem__

     def load_sample(self, file):
         """Read an audio file and return its waveform tensor.

         The sample rate returned by ``torchaudio.load`` is discarded.
         """
         waveform, _ = torchaudio.load(file)
         return waveform

     def prepare_sample(self, waveform):
         """Force a waveform to shape ``(1, max_len)`` as float32.

         Only the first row (``waveform[0]``) is used. Longer inputs keep their
         first ``max_len`` samples; shorter inputs are right-aligned with zeros
         in front. An empty input yields an all-zero tensor.

         Args:
             waveform: 2-D tensor indexed as ``[row, sample]``.

         Returns:
             A float32 tensor of shape ``(1, max_len)``.
         """
         clipped = waveform.numpy()[0, :self.max_len]
         output = np.zeros((1, self.max_len), dtype='float32')
         # Guard the empty case: a ``-0:`` slice would select the whole buffer
         # and the assignment would fail to broadcast.
         if clipped.size:
             output[0, -clipped.size:] = clipped
         return torch.from_numpy(output)

     def _stft(self, signal):
         """Run the normalized complex STFT with this dataset's settings."""
         # No window is passed, so torch.stft falls back to its default
         # (rectangular) window.
         return torch.stft(
             input=signal,
             n_fft=self.n_fft,
             hop_length=self.hop_length,
             normalized=True,
             return_complex=True,
         )

In [6]:
# Recursively collect every .wav file under each dataset folder, in sorted order.
train_input_files = sorted(TRAIN_INPUT_DIR.rglob('*.wav'))
train_target_files = sorted(TRAIN_TARGET_DIR.rglob('*.wav'))

test_noisy_files = sorted(TEST_NOISY_DIR.rglob('*.wav'))
test_clean_files = sorted(TEST_CLEAN_DIR.rglob('*.wav'))

# print("No. of Training files:",len(train_input_files))
# print("No. of Testing files:",len(test_noisy_files))

In [7]:
# Build the evaluation and training datasets with the shared STFT settings.
test_dataset = SpeechDataset(
    test_noisy_files,
    test_clean_files,
    n_fft=N_FFT,
    hop_length=HOP_LENGTH,
)
train_dataset = SpeechDataset(
    train_input_files,
    train_target_files,
    n_fft=N_FFT,
    hop_length=HOP_LENGTH,
)