In [28]:
import os
import subprocess
from tempfile import NamedTemporaryFile

from torch.distributed import get_rank
from torch.distributed import get_world_size
from torch.utils.data.sampler import Sampler

import matplotlib.pyplot as plt
from scipy import signal
from scipy.io import wavfile
import librosa
import numpy as np
import scipy.signal
import torch
import torchaudio
import math,glob
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import random

from scipy import signal


# Window functions selectable by name through audio_conf['window'].
# They are taken from scipy.signal.windows: the top-level scipy.signal.<name>
# aliases were deprecated and later removed from SciPy.
windows = {
    'hamming': scipy.signal.windows.hamming,
    'hann': scipy.signal.windows.hann,
    'blackman': scipy.signal.windows.blackman,
    'bartlett': scipy.signal.windows.bartlett,
}


def load_audio(path):
    """
    Read an audio file with torchaudio and return it as a NumPy array.

    :param path: Path of the audio file to read
    :return: The transposed sample array; when it has more than one dimension
             a single column is squeezed away and several columns are
             averaged (axis 1) into one signal
    """
    # The second value returned by torchaudio.load is not needed here.
    waveform, _ = torchaudio.load(path, normalization=True)
    samples = waveform.numpy().T
    if samples.ndim <= 1:
        return samples
    if samples.shape[1] == 1:
        return samples.squeeze()
    return samples.mean(axis=1)  # multiple channels, average


def load_audio_scipy(path):
    """
    Load an audio file exactly like ``load_audio``.

    Despite its name this function never used SciPy: its body was a verbatim
    copy of ``load_audio`` (torchaudio based). It is kept under the old name
    for existing callers and simply delegates to the shared implementation.

    :param path: Path of the audio file to read
    :return: Whatever ``load_audio(path)`` returns
    """
    return load_audio(path)

def audio_with_noise(sample_path,noise_path):
    """
    Load a recording and add a noise recording on top of it.

    :param sample_path: Path of the clean audio file
    :param noise_path: Path of the noise audio file
    :return: ``sample + noise[:len(sample)]`` when the noise is at least as
             long as the sample, otherwise the clean sample unchanged
    """
    sample = load_audio(sample_path)
    noise = load_audio(noise_path)
    if len(sample) > len(noise):
        # The noise clip cannot cover the whole recording: leave it clean.
        return sample
    return sample + noise[:len(sample)]

class AudioParser:
    """Interface of the parsers used by the datasets; both methods must be overridden."""

    def parse_transcript(self, transcript_path):
        """
        Turn a transcript file into its training/testing representation.

        :param transcript_path: Path where transcript is stored from the manifest file
        :return: Transcript in training/testing format
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError

    def parse_audio(self, audio_path):
        """
        Turn an audio file into its training/testing representation.

        :param audio_path: Path where audio is stored from the manifest file
        :return: Audio in training/testing format
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError
        


class NoiseInjection(object):
    """Adds a randomly chosen noise recording to an audio signal."""

    def __init__(self,
                 path=None,
                 sample_rate=16000,
                 noise_levels=(0, 0.5)):
        """
        Collect the noise recordings available for injection.
        Modified code from https://github.com/willfrey/audio/blob/master/torchaudio/transforms.py

        :param path: Directory searched with librosa.util.find_files for noise
                     recordings. None means "no noise source": inject_noise
                     then returns its input unchanged.
        :param sample_rate: Stored on the instance
        :param noise_levels: (low, high) bounds of the level drawn by inject_noise
        :raises IOError: if path is given but does not exist
        """
        # Only validate a path that was actually supplied; os.path.exists(None)
        # raises TypeError instead of returning False.
        if path is not None and not os.path.exists(path):
            raise IOError("Directory doesn't exist: {}".format(path))
        self.paths = librosa.util.find_files(path) if path is not None else []
        self.sample_rate = sample_rate
        self.noise_levels = noise_levels

    def inject_noise(self, data):
        """
        Add a randomly selected noise recording to data.

        :param data: 1-D array of audio samples
        :return: The noisy signal, or data itself when no noise file is available
        """
        if not self.paths:
            # Nothing to choose from (no directory given or it holds no audio).
            return data
        noise_path = np.random.choice(self.paths)
        noise_level = np.random.uniform(*self.noise_levels)
        return self.inject_noise_sample(data, noise_path, noise_level)

    def inject_noise_sample(self, data, noise_path, noise_level):
        """
        Add the recording at noise_path to data.

        :param data: 1-D array of audio samples
        :param noise_path: Path of the noise recording to load with load_audio
        :param noise_level: Currently unused; the energy-based scaling that
                            consumed it is disabled, so the noise is added
                            at its recorded amplitude
        :return: data + noise[:len(data)] when the noise is at least as long as
                 data; otherwise data unchanged (the noise path is printed)
        """
        noise_data = load_audio(noise_path)
        # "<=" so that a noise clip of exactly the same length is used too,
        # matching audio_with_noise.
        if len(data) <= len(noise_data):
            data = data + noise_data[:len(data)]
        else:
            print(noise_path)
        return data


class SpectrogramParser(AudioParser):
    def __init__(self, audio_conf, normalize=False, augment=False):
        """
        Parses audio file into a (noisy, clean) pair of log-magnitude spectrograms
        :param audio_conf: Dictionary containing the sample rate, window and the window length/stride in seconds;
                           optional keys: 'noise_dir', 'noise_levels' (required when 'noise_dir' is set), 'noise_prob'
        :param normalize(default False):  Apply standard mean and deviation normalization to audio tensor
        :param augment(default False):  Stored on the instance; parse_audio does not use it
        """
        super(SpectrogramParser, self).__init__()
        self.window_stride = audio_conf['window_stride']
        self.window_size = audio_conf['window_size']
        self.sample_rate = audio_conf['sample_rate']
        self.window = windows.get(audio_conf['window'], windows['hamming'])
        self.normalize = normalize
        self.augment = augment
        # 'noise_dir' is optional: every attribute derived from it must cope
        # with the key being absent or None.
        noise_dir = audio_conf.get('noise_dir')
        if noise_dir is not None:
            self.noiseInjector = NoiseInjection(noise_dir, self.sample_rate,
                                                audio_conf['noise_levels'])
            self.noise_list = glob.glob(noise_dir + "/*.wav")
        else:
            self.noiseInjector = None
            self.noise_list = []
        self.noise_prob = audio_conf.get('noise_prob')

    def _log_spectrogram(self, samples, n_fft, hop_length):
        """Return log(1 + |STFT(samples)|) as a FloatTensor, normalized when self.normalize is set."""
        D = librosa.stft(samples, n_fft=n_fft, hop_length=hop_length,
                         win_length=n_fft, window=self.window)
        magnitude, _phase = librosa.magphase(D)
        # S = log(S+1)
        spect = torch.FloatTensor(np.log1p(magnitude))
        if self.normalize:
            mean = spect.mean()
            std = spect.std()
            spect.add_(-mean)
            spect.div_(std)
        return spect

    def parse_audio(self, audio_path):
        """
        :param audio_path: Path where audio is stored from the manifest file
        :return: Tuple (spect, target_spect): the spectrogram of the audio with
                 injected noise and the spectrogram of the original audio. Both
                 are cropped so that each dimension of the target is a multiple
                 of 4. Without a noise injector the input is a copy of the target.
        """
        # original target
        target = load_audio(audio_path)

        n_fft = int(self.sample_rate * self.window_size)
        hop_length = int(self.sample_rate * self.window_stride)

        if self.noiseInjector:
            noisy = self.noiseInjector.inject_noise(target)
            spect = self._log_spectrogram(noisy, n_fft, hop_length)
            target_spect = self._log_spectrogram(target, n_fft, hop_length)
        else:
            # Previously this branch left `spect` undefined and the method
            # crashed a few lines later; fall back to the clean signal.
            print("Please add noise dir path")
            target_spect = self._log_spectrogram(target, n_fft, hop_length)
            spect = target_spect.clone()

        # Crop both spectrograms to the largest size divisible by 4 in each
        # dimension (measured on the target).
        h, w = target_spect.shape[0], target_spect.shape[1]
        target_spect = target_spect[:(h - h % 4), :(w - w % 4)]
        spect = spect[:(h - h % 4), :(w - w % 4)]
        return spect, target_spect

    def parse_transcript(self, transcript_path):
        raise NotImplementedError


class SpectrogramDataset(Dataset, SpectrogramParser):
    def __init__(self, audio_conf, manifest_filepath, labels, normalize=False, augment=False,
                 max_samples=2000):
        """
        Dataset that loads tensors via a csv containing file paths to audio files and transcripts separated by
        a comma. Each new line is a different sample. Example below:

        /path/to/audio.wav,/path/to/audio.txt
        ...

        :param audio_conf: Dictionary containing the sample rate, window and the window length/stride in seconds
        :param manifest_filepath: Path to manifest csv as describe above
        :param labels: String containing all the possible characters to map to
        :param normalize: Apply standard mean and deviation normalization to audio tensor
        :param augment(default False):  Forwarded to SpectrogramParser
        :param max_samples(default 2000):  Keep at most this many manifest entries; None keeps all of them
        """
        with open(manifest_filepath) as f:
            # Blank lines would produce entries without a transcript column.
            rows = [line.strip() for line in f]
        ids = [row.split(',') for row in rows if row]
        if max_samples is not None:
            ids = ids[:max_samples]
        self.ids = ids
        self.size = len(ids)
        self.labels_map = {label: index for index, label in enumerate(labels)}
        super(SpectrogramDataset, self).__init__(audio_conf, normalize, augment)

    def __getitem__(self, index):
        """
        :param index: Position of the sample in the manifest
        :return: Tuple (spect, transcript, target_spect)
        """
        sample = self.ids[index]
        audio_path, transcript_path = sample[0], sample[1]
        spect, target_spect = self.parse_audio(audio_path)
        transcript = self.parse_transcript(transcript_path)
        return spect, transcript, target_spect

    def parse_transcript(self, transcript_path):
        """
        :param transcript_path: Path of a utf8 text file
        :return: List of label indices for the characters of the file (newlines removed)
        """
        with open(transcript_path, 'r', encoding='utf8') as transcript_file:
            transcript = transcript_file.read().replace('\n', '')
        # filter(None, ...) drops every falsy entry: characters missing from
        # labels_map (None) and also label index 0.
        # NOTE(review): confirm that dropping index 0 is intended.
        transcript = list(filter(None, [self.labels_map.get(x) for x in list(transcript)]))
        return transcript

    def __len__(self):
        return self.size


def _collate_fn(batch):
    def func(p):
        return p[0].size(1)

    batch = sorted(batch, key=lambda sample: sample[0].size(1), reverse=True)
    longest_sample = max(batch, key=func)[0]
    freq_size = longest_sample.size(0)
    minibatch_size = len(batch)
    max_seqlength = longest_sample.size(1)
    inputs = torch.zeros(minibatch_size, 1, freq_size, max_seqlength)
    
    batch1 = sorted(batch, key=lambda sample: sample[2].size(1), reverse=True)
    longest_sample1 = max(batch1, key=func)[0]
    freq_size1 = longest_sample1.size(0)
    minibatch_size1 = len(batch1)
    max_seqlength1 = longest_sample1.size(1)
    target_spects = torch.zeros(minibatch_size1, 1, freq_size1, max_seqlength1)
    
    input_percentages = torch.FloatTensor(minibatch_size)
    target_sizes = torch.IntTensor(minibatch_size)
    targets = []
#     for x in range(minibatch_size):
#         sample = batch[x]
#         tensor = sample[0]
#         target = sample[2]
#         seq_length = tensor.size(1)
#         inputs[x][0].narrow(1, 0, seq_length).copy_(tensor)
#         input_percentages[x] = seq_length / float(max_seqlength)
#         target_sizes[x] = len(target)
#         targets.extend(target)
    targets = torch.IntTensor(targets)
    return inputs,target_spects, targets, input_percentages, target_sizes


class AudioDataLoader(DataLoader):
    """DataLoader for AudioDatasets whose batches are assembled by ``_collate_fn``."""

    def __init__(self, *args, **kwargs):
        """
        Build a regular DataLoader from the given arguments, then install the
        audio collate function.
        """
        super().__init__(*args, **kwargs)
        # Assigned after the base initialiser, so it replaces whatever
        # collate_fn the arguments may have selected.
        self.collate_fn = _collate_fn


class BucketingSampler(Sampler):
    def __init__(self, data_source, batch_size=1):
        """
        Cuts the index range of data_source into consecutive bins of batch_size
        indices (the last bin may be shorter), so that neighbouring samples
        end up in the same batch.
        """
        super().__init__(data_source)
        self.data_source = data_source
        indices = list(range(len(data_source)))
        self.bins = [indices[start:start + batch_size]
                     for start in range(0, len(indices), batch_size)]

    def __iter__(self):
        """Yield every bin in turn, shuffling its indices in place first."""
        for batch_indices in self.bins:
            np.random.shuffle(batch_indices)
            yield batch_indices

    def __len__(self):
        """Number of bins (batches)."""
        return len(self.bins)

    def shuffle(self, epoch):
        """Shuffle the order of the bins in place; epoch is not used."""
        np.random.shuffle(self.bins)


class DistributedBucketingSampler(Sampler):
    def __init__(self, data_source, batch_size=1, num_replicas=None, rank=None):
        """
        Cuts the index range of data_source into consecutive bins of batch_size
        indices and hands every replica its share of the bins.

        num_replicas and rank default to get_world_size() and get_rank().
        """
        super().__init__(data_source)
        num_replicas = get_world_size() if num_replicas is None else num_replicas
        rank = get_rank() if rank is None else rank
        self.data_source = data_source
        self.ids = list(range(len(data_source)))
        self.batch_size = batch_size
        self.bins = [self.ids[start:start + batch_size]
                     for start in range(0, len(self.ids), batch_size)]
        self.num_replicas = num_replicas
        self.rank = rank
        # Bins per replica, rounded up; total_size is the padded bin count.
        self.num_samples = int(math.ceil(len(self.bins) * 1.0 / self.num_replicas))
        self.total_size = self.num_samples * self.num_replicas

    def __iter__(self):
        """Iterate over the bins assigned to this replica."""
        # Repeat leading bins so the count divides evenly among replicas.
        shortfall = self.total_size - len(self.bins)
        padded_bins = self.bins + self.bins[:shortfall]
        assert len(padded_bins) == self.total_size
        # Every num_replicas-th bin, starting at this replica's rank.
        return iter(padded_bins[self.rank::self.num_replicas])

    def __len__(self):
        """Number of bins handed to each replica."""
        return self.num_samples

    def shuffle(self, epoch):
        """Reorder the bins with a permutation seeded by epoch."""
        generator = torch.Generator()
        generator.manual_seed(epoch)
        order = torch.randperm(len(self.bins), generator=generator).tolist()
        self.bins = [self.bins[position] for position in order]


def get_audio_length(path):
    """
    Return the duration of an audio file as reported by ``soxi -D``.

    The command is passed as an argument list without a shell, so quotes,
    spaces or shell metacharacters in the path cannot alter the command.

    :param path: Path of the audio file; surrounding whitespace is stripped
    :return: The output of ``soxi -D`` converted to float
    :raises subprocess.CalledProcessError: if soxi exits with a non-zero status
    """
    output = subprocess.check_output(['soxi', '-D', path.strip()])
    return float(output)


def audio_with_sox(path, sample_rate, start_time, end_time):
    """
    crop and resample the recording with sox and loads it.

    sox is invoked with an argument list (no shell), so the path may contain
    quotes or shell metacharacters. Its output is discarded and its exit
    status is not checked, as before.

    :param path: Path of the source recording
    :param sample_rate: Value passed to sox ``-r``
    :param start_time: Start of the ``trim`` effect
    :param end_time: Absolute end position of the ``trim`` effect (passed as ``=end_time``)
    :return: The temporary file written by sox, loaded with load_audio
    """
    with NamedTemporaryFile(suffix=".wav") as tar_file:
        tar_filename = tar_file.name
        sox_command = ["sox", path,
                       "-r", str(sample_rate), "-c", "1", "-b", "16", "-e", "si",
                       tar_filename,
                       "trim", str(start_time), "=" + str(end_time)]
        subprocess.call(sox_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        y = load_audio(tar_filename)
        return y


def augment_audio_with_sox(path, sample_rate, tempo, gain):
    """
    Changes tempo and gain of the recording with sox and loads it.

    sox is invoked with an argument list (no shell), so the path may contain
    quotes or shell metacharacters. Its output is discarded and its exit
    status is not checked, as before.

    :param path: Path of the source recording
    :param sample_rate: Value passed to sox ``-r``
    :param tempo: Argument of the sox ``tempo`` effect (formatted with 3 decimals)
    :param gain: Argument of the sox ``gain`` effect (formatted with 3 decimals)
    :return: The temporary file written by sox, loaded with load_audio
    """
    with NamedTemporaryFile(suffix=".wav") as augmented_file:
        augmented_filename = augmented_file.name
        sox_command = ["sox", path,
                       "-r", str(sample_rate), "-c", "1", "-b", "16", "-e", "si",
                       augmented_filename,
                       "tempo", "{:.3f}".format(tempo), "gain", "{:.3f}".format(gain)]
        subprocess.call(sox_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        # The temporary file is removed when the with-block exits.
        y = load_audio(augmented_filename)
        return y


def load_randomly_augmented_audio(path, sample_rate=16000, tempo_range=(0.85, 1.15),
                                  gain_range=(-6, 8)):
    """
    Draws a tempo and a gain uniformly from the given ranges and returns the
    utterance after augment_audio_with_sox applied them.

    :param path: Path of the utterance
    :param sample_rate: Forwarded to augment_audio_with_sox
    :param tempo_range: (low, high) bounds of the tempo factor
    :param gain_range: (low, high) bounds of the gain
    :return: The augmented utterance
    """
    (tempo_low, tempo_high), (gain_low, gain_high) = tempo_range, gain_range
    # Tempo is drawn before gain.
    tempo = np.random.uniform(low=tempo_low, high=tempo_high)
    gain = np.random.uniform(low=gain_low, high=gain_high)
    return augment_audio_with_sox(path=path, sample_rate=sample_rate,
                                  tempo=tempo, gain=gain)


In [29]:
from tqdm import tqdm
import json,os
from torchsummary import summary


# Label alphabet: the JSON content is joined into a single string.
with open('/home/ubuntu/projects/deepspeech.pytorch/labels.json') as label_file:
    labels = str(''.join(json.load(label_file)))

# Audio front-end settings consumed by SpectrogramParser.
# NOTE(review): window_size (0.01) is smaller than window_stride (0.02), so
# the hop length exceeds the FFT window - confirm the two are not swapped.
audio_conf = {
    'sample_rate': 16000,
    'window_size': .01,
    'window_stride': .02,
    'window': 'hamming',
    'noise_dir': "/home/ubuntu/projects/datasets/noise_data_30_sec",
    'noise_prob': .4,
    'noise_levels': (0.0, 0.9),
}


In [30]:
# Audio front-end settings consumed by SpectrogramParser (same values as the
# audio_conf defined in the previous cell).
# NOTE(review): window_size (0.01) is smaller than window_stride (0.02), so
# the hop length exceeds the FFT window - confirm the two are not swapped.
audio_conf = {
    'sample_rate': 16000,
    'window_size': .01,
    'window_stride': .02,
    'window': 'hamming',
    'noise_dir': "/home/ubuntu/projects/datasets/noise_data_30_sec",
    'noise_prob': .4,
    'noise_levels': (0.0, 0.9),
}

In [38]:
# Manifest files: one "audio path,transcript path" pair per line.
train_manifest = "/home/ubuntu/projects/deepspeech.pytorch/data/libri_train_manifest.csv"
val_manifest = "/home/ubuntu/projects/deepspeech.pytorch/data/libri_test_clean_manifest.csv"

# Both datasets normalise their spectrograms; only the training set sets augment.
train_dataset = SpectrogramDataset(
    audio_conf=audio_conf,
    manifest_filepath=train_manifest,
    labels=labels,
    normalize=True,
    augment=True,
)
test_dataset = SpectrogramDataset(
    audio_conf=audio_conf,
    manifest_filepath=val_manifest,
    labels=labels,
    normalize=True,
    augment=False,
)

# Training batches are the sampler's bins of 32 consecutive indices;
# the test loader uses plain batches of 2.
train_sampler = BucketingSampler(train_dataset, batch_size=32)
train_loader = AudioDataLoader(train_dataset, num_workers=4, batch_sampler=train_sampler)
test_loader = AudioDataLoader(test_dataset, batch_size=2, num_workers=4)

# Autoencoder 

In [39]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class autoencoder(nn.Module):
    """
    Convolutional autoencoder over single-channel 2-D inputs.

    The encoder applies two conv / batch-norm / ReLU / 2x2 max-pool stages
    (1 -> 8 -> 16 channels), halving height and width twice. The decoder
    applies two stride-2 transposed convolutions (16 -> 8 -> 1 channels),
    doubling them twice.

    NOTE(review): the decoder ends with ReLU followed by Tanh, so outputs lie
    in [0, 1). The datasets in this notebook are built with normalize=True,
    which produces targets with negative values - confirm this is intended.
    """

    def __init__(self):
        super().__init__()

        encoder_layers = [
            nn.Conv2d(1, 8, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(8),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(8, 16, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(16),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.ReLU(inplace=True),
        ]
        decoder_layers = [
            nn.ConvTranspose2d(16, 8, kernel_size=2, stride=2),
            nn.ReLU(inplace=True),
            nn.ConvTranspose2d(8, 1, kernel_size=2, stride=2),
            nn.ReLU(inplace=True),
            nn.Tanh(),
        ]
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode x and decode the result."""
        return self.decoder(self.encoder(x))

In [40]:
# Scratch computation: evaluates stride * (in - 1) + kernel - out for the
# height and the width of an 80 x 24 input and an 81 x 27 output.
kernel_size = (2, 2)
in_height, in_width = 80, 24
strides = (1, 1)
out_height, out_width = 81, 27

padding_height = [strides[0] * (in_height - 1) + kernel_size[0] - out_height]
padding_width = [strides[1] * (in_width - 1) + kernel_size[1] - out_width]
(padding_height[0], padding_width[0])

(0, -2)

In [41]:
#defining some params
num_epochs = 1 #you can go for more epochs, I am using a mac
# Build the network once, then place it on the GPU when one is available.
model = autoencoder()
model = model.cuda() if torch.cuda.is_available() else model.cpu()


In [46]:
# summary(model, (1, 81, 28))

In [45]:
# Train the autoencoder: mean squared error between the model output for the
# input spectrograms and the target spectrograms, optimised with Adam
# (weight decay 1e-5).
distance = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(),weight_decay=1e-5)
if torch.cuda.is_available():
    print("Use cuda: ")
for epoch in range(num_epochs):
    for data in train_loader:
        # Only inputs and target_spects are used below; the other three
        # values of the batch are ignored.
        inputs,target_spects, targets, input_percentages, target_sizes = data
        #print(inputs.shape)
        
        #print(inputs.shape)
        # Put both tensors on the device the model was placed on.
        if torch.cuda.is_available():
            input_spect = Variable(inputs).cuda()
            target_spect = Variable(target_spects).cuda()
        else:
            input_spect = Variable(inputs).cpu()
            target_spect = Variable(target_spects).cpu()

        # ===================forward=====================
        output = model(input_spect)
        #print("output: ",output.shape,target_spect.shape)
        #print("target_spect: ",target_spect.shape)
        loss = distance(output, target_spect)
        # ===================backward====================
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # ===================log========================
    # The value printed is the loss of the last batch of the epoch, not an average.
    # NOTE(review): `loss` is undefined here if train_loader yields no batch.
    print('epoch [{}/{}], loss:{:.7f}'.format(epoch+1, num_epochs, loss.data.cpu().numpy()))
    # Saves the whole model object (not a state_dict), one file per epoch.
    torch.save(model, "/home/ubuntu/projects/audioNoiseCanceller_Pytorch/saved_model/anc_model"+str(epoch)+".pth")
#     print('epoch [{}]'.format(epoch+1))
#     print('_numepoch [{}]'.format(num_epochs))
#     print('_numepoch [{}]'.format(loss.data.cpu().numpy()))

epoch [1/1], loss:0.0005272


  "type " + obj.__name__ + ". It won't be checked "


In [44]:
# Take the first batch produced by train_loader and stop iterating, so the
# batch tensors can be inspected in the following cells.
for data in train_loader:
    inputs,target_spects, targets, input_percentages, target_sizes = data
    break


In [None]:
# Shapes of the five tensors of the batch unpacked above.
inputs.shape, target_spects.shape, targets.shape,input_percentages.shape, target_sizes.shape

In [None]:
#!pip install torchsummary

In [None]:
# torchsummary report of the model for an input of size (1, 81, 29).
summary(model, (1, 81, 29))

In [None]:
# Displays whether PyTorch can use a CUDA device.
torch.cuda.is_available()

In [None]:
import matplotlib.pyplot as plt
from scipy import signal
from scipy.io import wavfile
# Plot the scipy spectrogram of one training recording.
au_path="/home/ubuntu/projects/deepspeech.pytorch/data/LibriSpeech_dataset/train/wav/4013-182396-0023.wav"
#au_path="/home/ubuntu/projects/deepspeech.pytorch/data/LibriSpeech_dataset/train/wav/2790-142824-0006.wav"
sample_rate, samples = wavfile.read(au_path)
frequencies, times, spectrogram = signal.spectrogram(samples, sample_rate)
print(samples.shape)
print(spectrogram.shape)
# Only pcolormesh is drawn: it places the data on the time/frequency axes that
# the labels below describe. The extra plt.imshow call that used to follow
# drew the same array in pixel coordinates on the same axes, which made the
# "Frequency [Hz]" / "Time [sec]" labels wrong.
plt.pcolormesh(times, frequencies, spectrogram)
plt.ylabel('Frequency [Hz]')
plt.xlabel('Time [sec]')
plt.show()

In [None]:
def load_audio(path):
    """
    Read an audio file with torchaudio and return it as a NumPy array.

    This cell redefines load_audio with the same behaviour as the definition
    in the first cell of the notebook.

    :param path: Path of the audio file to read
    :return: The transposed sample array; when it has more than one dimension
             a single column is squeezed away and several columns are
             averaged (axis 1) into one signal
    """
    sound, _ = torchaudio.load(path, normalization=True)
    sound = sound.numpy().T
    if len(sound.shape) > 1:
        # multiple channels, average
        sound = sound.squeeze() if sound.shape[1] == 1 else sound.mean(axis=1)
    return sound

In [None]:
# Shape of the array load_audio returns for the example recording.
print(load_audio(au_path).shape)

In [None]:
def audio_with_noise(sample_path,noise_path):
    """
    Load a recording and add a noise recording on top of it.

    A noise clip longer than the sample is truncated. A shorter one is
    zero-padded to the sample's length: the previous code padded the sample
    by zero elements instead and then added arrays of different lengths.

    :param sample_path: Path of the clean audio file
    :param noise_path: Path of the noise audio file
    :return: Array of the same length as the sample containing sample + noise
    """
    sample = load_audio(sample_path)
    noise = load_audio(noise_path)
    if len(noise) < len(sample):
        noise = np.pad(noise, (0, len(sample) - len(noise)), "constant")
    return sample + noise[:len(sample)]

# Mix the example recording with one noise file and display the result's shape.
n_data=audio_with_noise(au_path,"/home/ubuntu/projects/datasets/noise_data_30_sec/23318.wav")
n_data.shape

In [None]:
# Step-by-step version of load_audio on the example recording: load,
# transpose, then squeeze; the final shape is displayed.
sound, _ = torchaudio.load(au_path, normalization=True)
sound = sound.numpy().T
# Bare expression in the middle of the cell: evaluated but not displayed.
sound.shape[1]
sound = sound.squeeze()
sound.shape

In [None]:
# Displays the samples read by wavfile.read in the plotting cell.
samples

In [None]:
import IPython.display as ipd
# Audio player for the recording loaded into `sound`.
ipd.Audio(sound, rate=16000) # load a NumPy array


In [None]:
import IPython.display as ipd
# Audio player for the recording with added noise (`n_data`).
ipd.Audio(n_data, rate=16000) # load a NumPy array

In [None]:
# List the audio files of the noise directory, the same way NoiseInjection
# does. The directory is taken from `path` instead of repeating the literal.
path="/home/ubuntu/projects/datasets/noise_data_30_sec"
paths = path is not None and librosa.util.find_files(path)

In [None]:
# Displays the noise file list built in the previous cell.
paths