# Prepare our dataset for training

# Class for feature extraction

In [None]:
%%writefile feature_extractor.py

import librosa
import scipy


class FeatureExtractor:
    """Compute STFT / mel features for one audio clip and invert them back.

    Every transform shares the same periodic Hamming window of
    ``windowLength`` samples, uses ``windowLength`` as the FFT size and passes
    ``overlap`` to librosa as the hop length.
    """

    def __init__(self, audio, *, windowLength, overlap, sample_rate):
        """Store the clip together with the analysis parameters.

        Args:
            audio: Waveform handed to librosa as the signal to analyse.
            windowLength: Window size in samples; also used as ``n_fft``.
            overlap: Value passed to librosa as ``hop_length``.
            sample_rate: Sampling rate used by the mel transforms.
        """
        # Imported here because a bare ``import scipy`` does not guarantee the
        # ``signal`` subpackage is loaded, and ``scipy.signal.hamming`` is no
        # longer available in recent SciPy releases.
        from scipy.signal import get_window

        self.audio = audio
        self.ffT_length = windowLength
        self.window_length = windowLength
        self.overlap = overlap
        self.sample_rate = sample_rate
        # fftbins=True gives the periodic window, i.e. hamming(N, sym=False).
        self.window = get_window('hamming', self.window_length, fftbins=True)

    def get_stft_spectrogram(self):
        """Return the complex STFT of the stored clip."""
        return librosa.stft(self.audio, n_fft=self.ffT_length, win_length=self.window_length,
                            hop_length=self.overlap, window=self.window, center=True)

    def get_audio_from_stft_spectrogram(self, stft_features):
        """Invert a complex STFT back to a waveform with the stored window/hop."""
        return librosa.istft(stft_features, win_length=self.window_length, hop_length=self.overlap,
                             window=self.window, center=True)

    def get_mel_spectrogram(self):
        """Return the power (``power=2.0``) mel spectrogram of the stored clip."""
        # ``y`` is passed by keyword: newer librosa releases reject a
        # positional waveform here, and older ones accept the keyword too.
        return librosa.feature.melspectrogram(y=self.audio, sr=self.sample_rate, power=2.0,
                                              pad_mode='reflect', n_fft=self.ffT_length,
                                              hop_length=self.overlap, win_length=self.window_length,
                                              window=self.window, center=True)

    def get_audio_from_mel_spectrogram(self, M):
        """Approximate a waveform from the mel spectrogram ``M`` (32 iterations)."""
        return librosa.feature.inverse.mel_to_audio(M, sr=self.sample_rate, n_fft=self.ffT_length,
                                                    hop_length=self.overlap,
                                                    win_length=self.window_length, window=self.window,
                                                    center=True, pad_mode='reflect', power=2.0,
                                                    n_iter=32, length=None)

# Our dataset: get clean and noisy files in waveform

In [None]:

%%writefile findataset.py
import os
from torch.utils.data import DataLoader, Dataset
import librosa
import numpy as np
import math
from feature_extractor import FeatureExtractor
from utils import prepare_input_features
import multiprocessing
import os
from utils import get_tf_feature, read_audio
import tensorflow as tf
from sklearn.preprocessing import StandardScaler





class FinDataset(Dataset):
    """Map-style dataset that yields ``(noisy_waveform, clean_waveform)`` pairs.

    Each item reads one clean file, crops a random fixed-length snippet and
    mixes it with a randomly chosen, silence-trimmed noise file. Spectral
    features are not computed here; the dataset returns raw waveforms.

    Expected ``config`` keys: ``fs``, ``overlap``, ``windowLength`` and
    ``audio_max_duration`` (seconds).
    """

    def __init__(self, clean_filenames, noise_filenames, **config):
        self.clean_filenames = clean_filenames  # clean speech files
        self.noise_filenames = noise_filenames  # noise files
        self.sample_rate = config['fs']  # spectrogram / resampling settings
        self.overlap = config['overlap']
        self.window_length = config['windowLength']
        self.audio_max_duration = config['audio_max_duration']

    def _sample_noise_filename(self):
        """Pick one noise file uniformly at random."""
        return np.random.choice(self.noise_filenames)

    def _remove_silent_frames(self, audio):
        """Return ``audio`` with the intervals librosa marks as silent removed.

        Returns an empty array when no non-silent interval is found.
        """
        intervals = librosa.effects.split(audio, hop_length=self.overlap, top_db=20)
        if len(intervals) == 0:
            return np.array([])
        # Concatenating slices avoids extending a Python list sample by sample.
        return np.concatenate([audio[start:end] for start, end in intervals])

    def _phase_aware_scaling(self, clean_spectral_magnitude, clean_phase, noise_phase):
        """Scale the clean magnitude by the cosine of the clean/noisy phase difference."""
        assert clean_phase.shape == noise_phase.shape, "Shapes must match."
        return clean_spectral_magnitude * np.cos(clean_phase - noise_phase)

    def get_noisy_audio(self, *, filename):
        """Read ``filename`` at the dataset sample rate via ``read_audio``."""
        return read_audio(filename, self.sample_rate)

    def _audio_random_crop(self, audio, duration):
        """Cut a random snippet of ``duration`` seconds out of ``audio``.

        Clips that are not longer than the requested duration are zero-padded
        (with 5 spare samples so a random offset still exists) before cropping.
        """
        target_len = math.floor(duration * self.sample_rate)
        audio_len = len(audio)
        if target_len >= audio_len:
            print("Passed duration greater than audio duration of: ", audio_len)
            audio = np.append(audio, np.zeros(target_len - audio_len + 5))
            # Refresh the length: using the pre-padding value made the
            # randint() range empty and raised ValueError for short clips.
            audio_len = len(audio)
        idx = np.random.randint(0, audio_len - target_len)
        return audio[idx: idx + target_len]

    def _add_noise_to_clean_audio(self, clean_audio, noise_signal):
        """Mix a random noise segment into ``clean_audio``.

        The segment is scaled to the power of the clean audio and then
        attenuated by a random factor drawn from ``[0, 1)``.

        Raises:
            ValueError: if ``noise_signal`` is empty.
        """
        clean_audio = np.asarray(clean_audio)
        noise_signal = np.asarray(noise_signal)
        if noise_signal.size == 0:
            # Repeating an empty signal can never reach the target length.
            raise ValueError("noise_signal is empty; cannot mix noise into the clean audio.")

        if noise_signal.size <= clean_audio.size:
            # Repeat the noise until it is strictly longer than the clean clip.
            repeats = clean_audio.size // noise_signal.size + 1
            noise_signal = np.tile(noise_signal, repeats)

        # Extract a noise segment from a random location in the noise signal.
        ind = np.random.randint(0, noise_signal.size - clean_audio.size)
        noise_segment = noise_signal[ind: ind + clean_audio.size]

        speech_power = np.sum(clean_audio ** 2)
        noise_power = np.sum(noise_segment ** 2)
        if noise_power == 0:
            # A silent segment adds nothing; skip the division that would give NaN.
            return clean_audio.copy()
        return clean_audio + (np.sqrt(speech_power / noise_power) * noise_segment * np.random.sample())

    def _parallel_audio_processing(self, clean_filename):
        """Build one ``(noisy, clean)`` waveform pair for ``clean_filename``.

        Reads the clean file and a random noise file, trims silence from the
        noise, crops the clean audio to ``audio_max_duration`` seconds and
        mixes the two.
        """
        clean_audio, _ = read_audio(clean_filename, self.sample_rate)

        noise_filename = self._sample_noise_filename()
        noise_audio, _ = read_audio(noise_filename, self.sample_rate)

        # Keep the untrimmed noise if trimming would leave nothing to mix.
        trimmed_noise = self._remove_silent_frames(noise_audio)
        if trimmed_noise.size:
            noise_audio = trimmed_noise

        # Sample a random fixed-size snippet of the clean audio.
        clean_audio = self._audio_random_crop(clean_audio, duration=self.audio_max_duration)

        noise_input = self._add_noise_to_clean_audio(clean_audio, noise_audio)
        return noise_input, clean_audio

    def __getitem__(self, index):
        """Return the ``(noisy, clean)`` waveform pair for clean file ``index``."""
        return self._parallel_audio_processing(self.clean_filenames[index])

    def __len__(self):
        """Number of clean files (one item per clean file)."""
        return len(self.clean_filenames)
    

# Dataset for clean audio filenames

In [None]:
%%writefile mozilla_commonvoice.py

import pandas as pd
import numpy as np
import os

# np.random.seed(9)

class MozillaCommonVoiceDataset:
    """Resolve clean-speech file lists from a Common Voice style folder.

    The code reads ``<basepath>/<split>/<split>.tsv`` (column ``path``) and
    builds clip paths of the form ``<basepath>/<split>/clips/<path>.wav``.
    """

    def __init__(self, basepath, *, val_dataset_size):
        self.basepath = basepath
        self.val_dataset_size = val_dataset_size

    def get_common_voice_filenames(self, subfolder="train", dataframe_name='train.tsv'):
        """Return the shuffled ``path`` column of the given metadata file."""
        full_file_path = os.path.join(self.basepath, subfolder, dataframe_name)
        print("full path", full_file_path)

        mozilla_metadata = pd.read_csv(full_file_path, sep='\t')
        # permutation() shuffles a copy, so the DataFrame's own (possibly
        # read-only) buffer is never mutated.
        clean_files = np.random.permutation(mozilla_metadata['path'].to_numpy())
        print("Total number of training examples:", len(clean_files))
        return clean_files

    def get_train_val_filenames(self):
        """Return disjoint ``(train_files, val_files)`` lists of clip paths.

        The last ``val_dataset_size`` shuffled files form the validation set;
        everything before them is the training set.
        """
        clean_files = self.get_common_voice_filenames(subfolder="train", dataframe_name='train.tsv')

        # resolve full path
        clean_files = [os.path.join(self.basepath, 'train', 'clips', filename + ".wav")
                       for filename in clean_files]

        # Split on an explicit index: slicing the validation set out of the
        # already truncated training list made the two sets overlap, and a
        # negative-zero slice emptied the training set for val_dataset_size=0.
        split_at = max(len(clean_files) - self.val_dataset_size, 0)
        clean_val_files = clean_files[split_at:]
        clean_files = clean_files[:split_at]
        print("# of Training clean files:", len(clean_files))
        print("# of  Validation clean files:", len(clean_val_files))
        return clean_files, clean_val_files

    def get_test_filenames(self):
        """Return the shuffled list of test clip paths."""
        clean_files = self.get_common_voice_filenames(subfolder="test", dataframe_name='test.tsv')

        # resolve full path
        clean_files = [os.path.join(self.basepath, 'test', 'clips', filename + ".wav")
                       for filename in clean_files]

        print("# of Testing clean files:", len(clean_files))
        return clean_files

# Dataset for noise audio filenames

In [None]:
%%writefile urban_sound_8k.py

import pandas as pd
import numpy as np
import os

#np.random.seed(9)


class UrbanSound8K:
    """Resolve noise file lists from an UrbanSound8K style folder.

    Metadata is read from ``<basepath>/UrbanSound8K.csv`` (columns
    ``slice_file_name``, ``fold``, ``classID``); files are resolved as
    ``<basepath>/fold<fold>/<slice_file_name>``. Fold 10 is the test split.
    """

    def __init__(self, basepath, *, val_dataset_size, class_ids=None):
        self.basepath = basepath
        self.val_dataset_size = val_dataset_size
        self.class_ids = class_ids

    def _get_urban_sound_8K_filenames(self):
        """Load the metadata CSV and return it with its rows shuffled."""
        urbansound_metadata = pd.read_csv(os.path.join(self.basepath, 'UrbanSound8K.csv'))

        # reindex() returns a new frame; the result must be kept, otherwise
        # the shuffle silently does nothing.
        shuffled_index = np.random.permutation(urbansound_metadata.index)
        return urbansound_metadata.reindex(shuffled_index)

    def _get_filenames_by_class_id(self, metadata):
        """Return full paths of all files in ``metadata`` whose class is selected.

        When ``class_ids`` was not given, it is set to every class id present
        in ``metadata`` (and stored on the instance).
        """
        if self.class_ids is None:
            self.class_ids = np.unique(metadata['classID'].values)
            print("Number of classes:", self.class_ids)

        all_files = []
        file_counter = 0
        for c in self.class_ids:
            per_class_rows = metadata[metadata['classID'] == c][['slice_file_name', 'fold']].values
            per_class_files = [os.path.join(self.basepath, 'fold' + str(fold), file_name)
                               for file_name, fold in per_class_rows]
            print("Class c:", str(c), 'has:', len(per_class_files), 'files')
            file_counter += len(per_class_files)
            all_files.extend(per_class_files)

        assert len(all_files) == file_counter
        return all_files

    def get_train_val_filenames(self):
        """Return disjoint ``(train_files, val_files)`` lists from every fold except 10."""
        urbansound_metadata = self._get_urban_sound_8K_filenames()

        # every fold except 10 is used for training/validation
        urbansound_train = urbansound_metadata[urbansound_metadata.fold != 10]

        urbansound_train_filenames = self._get_filenames_by_class_id(urbansound_train)
        np.random.shuffle(urbansound_train_filenames)

        # Split on an explicit index: with val_dataset_size == 0 the negative
        # slices put every file in validation and none in training.
        split_at = max(len(urbansound_train_filenames) - self.val_dataset_size, 0)
        urbansound_val = urbansound_train_filenames[split_at:]
        urbansound_train = urbansound_train_filenames[:split_at]
        print("Noise training:", len(urbansound_train))
        print("Noise validation:", len(urbansound_val))

        return urbansound_train, urbansound_val

    def get_test_filenames(self):
        """Return the shuffled list of noise files from fold 10."""
        urbansound_metadata = self._get_urban_sound_8K_filenames()

        # fold 10 is used for testing only
        urbansound_test = urbansound_metadata[urbansound_metadata.fold == 10]

        urbansound_test_filenames = self._get_filenames_by_class_id(urbansound_test)
        np.random.shuffle(urbansound_test_filenames)

        print("# of Noise testing files:", len(urbansound_test_filenames))
        return urbansound_test_filenames

# Helper functions for our data

In [None]:
%%writefile utils.py 

import numpy as np
import pickle
import librosa
# import sounddevice as sd
from pydub import AudioSegment
import IPython
import tensorflow as tf


def inverse_stft_transform(stft_features, window_length, overlap):
    """Invert an STFT with librosa, using ``overlap`` as the hop length."""
    istft_options = {"win_length": window_length, "hop_length": overlap}
    return librosa.istft(stft_features, **istft_options)


def revert_features_to_audio(features, phase, window_length, overlap, cleanMean=None, cleanStd=None):
    """Rebuild a waveform from magnitude features and a phase matrix.

    Args:
        features: Magnitude features; singleton axes are squeezed away and the
            result must line up with ``phase`` transposed.
        phase: 2-D phase matrix in radians; it is transposed before use.
        window_length: Window length passed to the inverse STFT.
        overlap: Hop length passed to the inverse STFT.
        cleanMean, cleanStd: Optional statistics used to undo a standardisation
            of ``features``; applied only when both are given.

    Returns:
        The waveform produced by ``inverse_stft_transform``.
    """
    # Compare against None explicitly: a truthiness test skips the rescaling
    # when the mean is 0.0 and raises for array-valued statistics.
    if cleanMean is not None and cleanStd is not None:
        features = cleanStd * features + cleanMean

    phase = np.transpose(phase, (1, 0))
    features = np.squeeze(features)
    # Re-attach the phase that was dropped when the magnitude was taken.
    features = features * np.exp(1j * phase)

    features = np.transpose(features, (1, 0))
    return inverse_stft_transform(features, window_length=window_length, overlap=overlap)


def play(audio, sample_rate):
    """Render an inline audio player for ``audio`` in the notebook output.

    Args:
        audio: Samples accepted by ``IPython.display.Audio`` as ``data``.
        sample_rate: Sampling rate of ``audio``.
    """
    from IPython.display import Audio, display

    # Building the Audio object is not enough inside a function: it has to be
    # passed to display() explicitly, otherwise nothing is shown.
    display(Audio(data=audio, rate=sample_rate))


def add_noise_to_clean_audio(clean_audio, noise_signal):
    """Mix a random segment of ``noise_signal`` into ``clean_audio``.

    The noise segment is scaled so that its power equals the power of the
    clean audio before the two are summed.

    Args:
        clean_audio: 1-D clean waveform.
        noise_signal: 1-D noise waveform; repeated when it is too short.

    Returns:
        The noisy waveform, the same length as ``clean_audio``.

    Raises:
        ValueError: if ``noise_signal`` is empty.
    """
    clean_audio = np.asarray(clean_audio)
    noise_signal = np.asarray(noise_signal)
    if noise_signal.size == 0:
        # Repeating an empty signal can never reach the target length.
        raise ValueError("noise_signal is empty; cannot mix noise into the clean audio.")

    if noise_signal.size <= clean_audio.size:
        # Repeat the noise until it is strictly longer than the clean clip.
        repeats = clean_audio.size // noise_signal.size + 1
        noise_signal = np.tile(noise_signal, repeats)

    # Extract a noise segment from a random location in the noise signal.
    ind = np.random.randint(0, noise_signal.size - clean_audio.size)
    noise_segment = noise_signal[ind: ind + clean_audio.size]

    speech_power = np.sum(clean_audio ** 2)
    noise_power = np.sum(noise_segment ** 2)
    if noise_power == 0:
        # A silent segment adds nothing; skip the division that would give NaN.
        return clean_audio.copy()
    return clean_audio + np.sqrt(speech_power / noise_power) * noise_segment

def read_audio(filepath, sample_rate, normalize=True):
    """Load an audio file with librosa and optionally peak-normalise it.

    Args:
        filepath: Path handed to ``librosa.load``.
        sample_rate: Target sampling rate handed to ``librosa.load``.
        normalize: When ``True``, scale the signal so its absolute peak is 1/3.

    Returns:
        Tuple ``(audio, sr)`` as returned by ``librosa.load`` (audio possibly
        rescaled).
    """
    audio, sr = librosa.load(filepath, sr=sample_rate)
    if normalize is True:
        peak = np.max(np.abs(audio)) if audio.size else 0.0
        # Silent or empty audio has no peak to normalise by; leave it as is
        # instead of dividing by zero and filling the signal with NaN/inf.
        if peak > 0:
            div_fac = 1 / peak / 3.0
            audio = audio * div_fac
    return audio, sr


def prepare_input_features(stft_features, numSegments, numFeatures):
    """Slice a spectrogram into overlapping windows of ``numSegments`` frames.

    The first ``numSegments - 1`` frames are prepended to the input so that
    one window is produced per original frame. The result has shape
    ``(numFeatures, numSegments, n_windows)``.
    """
    leading_frames = stft_features[:, :numSegments - 1]
    padded = np.concatenate([leading_frames, stft_features], axis=1)

    n_windows = padded.shape[1] - numSegments + 1
    windows = np.zeros((numFeatures, numSegments, n_windows))
    for start in range(n_windows):
        windows[:, :, start] = padded[:, start:start + numSegments]
    return windows


def get_input_features(predictorsList, numSegments=8, numFeatures=None):
    """Apply ``prepare_input_features`` to every spectrogram in a list.

    Args:
        predictorsList: Iterable of 2-D magnitude spectrograms.
        numSegments: Frames per window. The default of 8 is only a
            placeholder; pass the window size the model expects.
        numFeatures: Rows per spectrogram; taken from each spectrogram's
            first dimension when omitted.

    Returns:
        List with one windowed feature array per input spectrogram.
    """
    # prepare_input_features() needs both sizes; calling it with the
    # spectrogram alone raised TypeError for every input.
    return [
        prepare_input_features(
            noisy_stft_mag_features,
            numSegments,
            noisy_stft_mag_features.shape[0] if numFeatures is None else numFeatures,
        )
        for noisy_stft_mag_features in predictorsList
    ]


def _bytes_feature(value):
    """Wrap a string / bytes value in a ``tf.train.Feature`` bytes list."""
    eager_tensor_type = type(tf.constant(0))
    # BytesList cannot unpack a string held in an EagerTensor, so convert first.
    raw_value = value.numpy() if isinstance(value, eager_tensor_type) else value
    bytes_list = tf.train.BytesList(value=[raw_value])
    return tf.train.Feature(bytes_list=bytes_list)


def _float_feature(value):
    """Wrap a single value in a ``tf.train.Feature`` float list."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)


def _int64_feature(value):
    """Wrap a single value in a ``tf.train.Feature`` int64 list."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)


def get_tf_feature(noise_stft_mag_features, clean_stft_magnitude, noise_stft_phase):
    """Serialise three arrays into one ``tf.train.Example``.

    Each array is cast to float32 and stored as raw bytes under the keys
    ``noise_stft_mag_features``, ``clean_stft_magnitude`` and
    ``noise_stft_phase``.
    """
    # tobytes() produces the same bytes as the deprecated tostring(), which
    # is no longer available in NumPy 2.
    noise_stft_mag_features = noise_stft_mag_features.astype(np.float32).tobytes()
    clean_stft_magnitude = clean_stft_magnitude.astype(np.float32).tobytes()
    noise_stft_phase = noise_stft_phase.astype(np.float32).tobytes()

    example = tf.train.Example(features=tf.train.Features(feature={
        'noise_stft_phase': _bytes_feature(noise_stft_phase),
        'noise_stft_mag_features': _bytes_feature(noise_stft_mag_features),
        'clean_stft_magnitude': _bytes_feature(clean_stft_magnitude)}))
    return example

#  Train Our Models

# Import libraries for training

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import librosa
import pandas as pd
import os
import datetime
import matplotlib.pyplot as plt
import numpy as np
import IPython.display as ipd
import librosa.display
import scipy
import glob
import numpy as np
import math
import warnings
import pickle
from sklearn.utils import shuffle
from feature_extractor import FeatureExtractor
from sklearn.preprocessing import StandardScaler
# Load the TensorBoard notebook extension.
%load_ext tensorboard

In [None]:
class MyModelExpencive1(nn.Module):
    """Encoder/decoder CNN with additive skip connections.

    The encoder halves the height with strided (3, 1) convolutions; the
    decoder restores it with bicubic upsampling followed by a convolution whose
    width stride undoes the width doubling of the upsample, and adds the
    matching encoder output before each activation.

    NOTE(review): shape remarks below assume an input of (N, 1, 129, 16),
    which is what the training loop appears to feed - confirm.

    ``batchnorm1``, ``batchnorm5``, ``batchnorm6_1``, ``batchnorm9``, ``pad3``
    and ``pad4`` are created but never used in ``forward``. They still
    contribute entries to ``state_dict`` (the batch norms), so removing them
    would change the checkpoint keys.
    """

    def __init__(self):
        super(MyModelExpencive1, self).__init__()
        # ZeroPad2d order is (left, right, top, bottom): pad height only.
        self.pad = nn.ZeroPad2d((0, 0, 3, 4))
        self.conv1 = nn.Conv2d(1, 32, kernel_size=(9, 16), stride=(1, 1), padding=(0, 0), bias=False)
        # A single PReLU module is reused for every activation in forward().
        self.relu = nn.PReLU()
        self.batchnorm1 = nn.BatchNorm2d(32)
        # Encoder: stride (2, 1) halves the height at each stage.
        self.conv2 = nn.Conv2d(32, 64, kernel_size=(3, 1), stride=(2, 1), padding=(1, 0), bias=False)
        self.batchnorm2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=(3, 1), stride=(2, 1), padding=(1, 0), bias=False)
        self.batchnorm3 = nn.BatchNorm2d(128)
        self.conv4 = nn.Conv2d(128, 256, kernel_size=(3, 1), stride=(2, 1), padding=(1, 0), bias=False)
        self.batchnorm4 = nn.BatchNorm2d(256)
        self.conv5 = nn.Conv2d(256, 512, kernel_size=(3, 1), stride=(2, 1), padding=(1, 0), bias=False)
        self.batchnorm5 = nn.BatchNorm2d(512)

        # Bottleneck: height stride 8 on the way down, width stride 8 after the x8 upsample.
        self.conv5_1 = nn.Conv2d(512, 1024, kernel_size=(3, 1), stride=(8, 1), padding=(1, 0), bias=False)
        self.batchnorm5_1 = nn.BatchNorm2d(1024)
        self.conv6_1 = nn.Conv2d(1024, 512, kernel_size=(3, 1), stride=(1, 8), padding=(1, 0), bias=False)
        self.batchnorm6_1 = nn.BatchNorm2d(512)

        # Decoder: each conv follows an x2 upsample; stride (1, 2) shrinks the width back.
        self.conv6 = nn.Conv2d(512, 256, kernel_size=(3, 1), stride=(1, 2), padding=(1, 0), bias=False)
        self.batchnorm6 = nn.BatchNorm2d(256)
        self.conv7 = nn.Conv2d(256, 128, kernel_size=(3, 1), stride=(1, 2), padding=(1, 0), bias=False)
        # Alternative that was tried and disabled:
        #self.conv7 = nn.ConvTranspose2d(256, 128, kernel_size=(3, 1), stride=(2, 1), padding=(2, 0), bias=False)
        self.batchnorm7 = nn.BatchNorm2d(128)
        self.conv8 = nn.Conv2d(128, 64, kernel_size=(3, 1), stride=(1, 2), padding=(1, 0), bias=False)
        self.batchnorm8 = nn.BatchNorm2d(64)
        self.conv9 = nn.Conv2d(64, 32, kernel_size=(3, 1), stride=(1, 2), padding=(1, 0), bias=False)
        self.batchnorm9 = nn.BatchNorm2d(32)
        self.spatialdropout = nn.Dropout2d(0.2)
        self.conv10 = nn.Conv2d(32, 1, kernel_size=(3, 1), stride=(1, 1), padding=(1, 0))
        self.upsample = nn.Upsample(scale_factor=2, mode='bicubic')
        self.upsample2 = nn.Upsample(scale_factor=8, mode='bicubic')
        # pad2 adds one row at the top of the output; pad3/pad4 are unused.
        self.pad2 = nn.ZeroPad2d((0, 0, 1, 0))
        self.pad3 = nn.ZeroPad2d((0, 0, 0, 1))
        self.pad4 = nn.ZeroPad2d((0, 0, 1, 0))

    def forward(self, x):
        """Run the network; skipN holds the pre-activation encoder output reused by the decoder."""
        x = self.pad(x)
        # conv1's (9, 16) kernel collapses the width to 1 for a 16-wide input (assumed).
        skip9 = self.conv1(x)
        x = self.relu(skip9)
        # batchnorm1 is intentionally disabled here:
        #x = self.batchnorm1(x)
        skip8 = self.conv2(x)
        x = self.relu(skip8)
        x = self.batchnorm2(x)
        skip7 = self.conv3(x)
        x = self.relu(skip7)
        x = self.batchnorm3(x)
        skip6 = self.conv4(x)
        x = self.relu(skip6)
        x = self.batchnorm4(x)
        skip6_1 = self.conv5(x)
        x = self.relu(skip6_1)
        # batchnorm5 is intentionally disabled here:
        #x = self.batchnorm5(x)

        # Bottleneck: downsample by 8, upsample by 8, then merge with skip6_1.
        x = self.conv5_1(x)
        x = self.relu(x)
        x = self.batchnorm5_1(x)
        x = self.upsample2(x)
        x = self.conv6_1(x)
        # No activation or batch norm is applied between this add and the next upsample.
        x = x + skip6_1



        # Decoder stage: upsample -> conv -> add skip -> PReLU -> batch norm.
        x = self.upsample(x)
        x = self.conv6(x)
        x = x + skip6
        x = self.relu(x)
        x = self.batchnorm6(x)
        x = self.upsample(x)
        x = self.conv7(x)
        x = x + skip7
        x = self.relu(x)
        x = self.batchnorm7(x)
        x = self.upsample(x)
        x = self.conv8(x)
        x = x + skip8
        x = self.relu(x)
        x = self.batchnorm8(x)
        x = self.upsample(x)
        x = self.conv9(x)
        x = x + skip9
        x = self.relu(x)
        # batchnorm9 is intentionally disabled here:
        #x = self.batchnorm9(x)
        x = self.spatialdropout(x)
        x = self.conv10(x)
        # Add one row at the top; presumably to bring the height back to 129 - TODO confirm.
        x = self.pad2(x)
        return x

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class MyModelLight(nn.Module):
    """Lightweight fully convolutional network with two additive skip connections.

    Every convolution after ``conv1`` uses stride 1 with "same" padding along
    the height, so the height set by ``pad`` + ``conv1`` is kept to the
    output. ``conv1`` uses a (9, 16) kernel without width padding, so the
    input must be at least 16 wide.

    Layer attribute names and their creation order are kept as they were so
    existing checkpoints continue to load.
    """

    def __init__(self):
        super().__init__()
        # ZeroPad2d order is (left, right, top, bottom): pad height only, so
        # the 9-tall conv1 kernel preserves the input height.
        self.pad = nn.ZeroPad2d((0, 0, 4, 4))
        self.conv1 = nn.Conv2d(1, 8, kernel_size=(9, 16), stride=(1, 1), padding=(0, 0), bias=False)
        # A single PReLU module is reused for every activation in forward().
        self.relu = nn.PReLU()
        self.batchnorm1 = nn.BatchNorm2d(8)
        self.conv2 = nn.Conv2d(8, 18, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0), bias=False)
        self.batchnorm2 = nn.BatchNorm2d(18)
        self.conv3 = nn.Conv2d(18, 30, kernel_size=(9, 1), stride=(1, 1), padding=(4, 0), bias=False)
        self.batchnorm3 = nn.BatchNorm2d(30)
        self.conv4 = nn.Conv2d(30, 18, kernel_size=(9, 1), stride=(1, 1), padding=(4, 0), bias=False)
        self.batchnorm4 = nn.BatchNorm2d(18)
        self.conv5 = nn.Conv2d(18, 30, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0), bias=False)
        self.batchnorm5 = nn.BatchNorm2d(30)
        self.conv5_1 = nn.Conv2d(30, 120, kernel_size=(9, 1), stride=(1, 1), padding=(4, 0), bias=False)
        self.batchnorm5_1 = nn.BatchNorm2d(120)
        self.conv5_2 = nn.Conv2d(120, 30, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0), bias=False)
        self.batchnorm5_2 = nn.BatchNorm2d(30)
        self.conv6 = nn.Conv2d(30, 18, kernel_size=(9, 1), stride=(1, 1), padding=(4, 0), bias=False)
        self.batchnorm6 = nn.BatchNorm2d(18)
        self.conv7 = nn.Conv2d(18, 30, kernel_size=(9, 1), stride=(1, 1), padding=(4, 0), bias=False)
        self.batchnorm7 = nn.BatchNorm2d(30)
        self.conv8 = nn.Conv2d(30, 18, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0), bias=False)
        self.batchnorm8 = nn.BatchNorm2d(18)
        self.conv9 = nn.Conv2d(18, 8, kernel_size=(9, 1), stride=(1, 1), padding=(4, 0), bias=False)
        self.batchnorm9 = nn.BatchNorm2d(8)
        self.spatialdropout = nn.Dropout2d(0.2)
        self.conv10 = nn.Conv2d(8, 1, kernel_size=(9, 1), stride=(1, 1), padding=(4, 0))

    def forward(self, x):
        """Run the network on ``x`` of shape ``(N, 1, H, W)`` with ``W >= 16``.

        No debug output is produced: printing tensor sizes at every layer
        flooded the output on each forward pass.
        """
        x = self.pad(x)
        x = self.conv1(x)
        x = self.relu(x)
        x = self.batchnorm1(x)

        # skip8 / skip6 are pre-activation outputs that are added back later.
        skip8 = self.conv2(x)
        x = self.relu(skip8)
        x = self.batchnorm2(x)
        x = self.conv3(x)
        x = self.relu(x)
        x = self.batchnorm3(x)
        x = self.conv4(x)
        x = self.relu(x)
        x = self.batchnorm4(x)
        skip6 = self.conv5(x)
        x = self.relu(skip6)
        x = self.batchnorm5(x)

        # Wide middle block (30 -> 120 -> 30 channels), merged with skip6.
        x = self.conv5_1(x)
        x = self.relu(x)
        x = self.batchnorm5_1(x)
        x = self.conv5_2(x)
        x = x + skip6
        x = self.relu(x)
        x = self.batchnorm5_2(x)

        x = self.conv6(x)
        x = self.relu(x)
        x = self.batchnorm6(x)
        x = self.conv7(x)
        x = self.relu(x)
        x = self.batchnorm7(x)
        x = self.conv8(x)
        x = x + skip8
        x = self.relu(x)
        x = self.batchnorm8(x)
        x = self.conv9(x)
        x = self.relu(x)
        x = self.batchnorm9(x)
        x = self.spatialdropout(x)
        x = self.conv10(x)
        return x

# Get device

In [None]:
# Use the first CUDA device when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

print(device)
#print(torch.cuda.get_device_name())

# Install wandb

In [None]:
!pip install wandb

# Set the constants' values

In [None]:
# STFT analysis settings shared by the cells below.
windowLength = 256
# Hop size in samples (a quarter window), so consecutive windows overlap by 75%.
overlap = round(windowLength * 0.25)
ffTLength = windowLength
inputFs = 48e3
fs = 16000
# Number of frequency bins in a one-sided FFT of length ffTLength.
numFeatures = ffTLength // 2 + 1
numSegments = 16

print(f"windowLength: {windowLength}")
print(f"overlap: {overlap}")
print(f"ffTLength: {ffTLength}")
print(f"inputFs: {inputFs}")
print(f"fs: {fs}")
print(f"numFeatures: {numFeatures}")
print(f"numSegments: {numSegments}")

# Functions for feature extraction

In [None]:
def revert_features_to_audio(features, phase, cleanMean=None, cleanStd=None):
    """Rebuild a waveform from magnitude features and a phase matrix.

    Relies on a notebook-global ``noiseAudioFeatureExtractor`` for the
    inverse STFT.

    Args:
        features: Magnitude features; singleton axes are squeezed away and the
            result must line up with ``phase`` transposed.
        phase: 2-D phase matrix in radians; it is transposed before use.
        cleanMean, cleanStd: Optional statistics used to undo a standardisation
            of ``features``; applied only when both are given.
    """
    # Compare against None explicitly: a truthiness test skips the rescaling
    # when the mean is 0.0 and raises for array-valued statistics.
    if cleanMean is not None and cleanStd is not None:
        features = cleanStd * features + cleanMean
    phase = np.transpose(phase, (1, 0))
    features = np.squeeze(features)

    # Re-attach the phase that was dropped when the magnitude was taken.
    features = features * np.exp(1j * phase)

    features = np.transpose(features, (1, 0))
    return noiseAudioFeatureExtractor.get_audio_from_stft_spectrogram(features)

In [None]:
def prepare_input_features(stft_features, numSegments, numFeatures):
    """Turn a spectrogram into one ``numSegments``-frame window per frame.

    Returns an array of shape ``(numFeatures, numSegments, n_windows)``.
    """
    # Important: prepend the first numSegments - 1 frames so that every
    # original frame ends up as the last column of exactly one window.
    history = stft_features[:, 0:numSegments - 1]
    extended = np.concatenate([history, stft_features], axis=1)

    window_count = extended.shape[1] - numSegments + 1
    segments = np.zeros((numFeatures, numSegments, window_count))
    for first_frame in range(window_count):
        last_frame = first_frame + numSegments
        segments[:, :, first_frame] = extended[:, first_frame:last_frame]
    return segments

# Metrics

# Install PESQ and STOI

In [None]:
!pip install pypesq
!pip3 install pystoi

# Define SNR

In [None]:
def getPower(signal):
    """Return the root-mean-square value of ``signal``."""
    squared_total = np.sum(signal ** 2)
    return np.sqrt(squared_total / np.size(signal))

In [None]:
def getSNR(clean, noise):
    """Return the linear (amplitude) signal-to-noise ratio of two signals.

    The ratio is ``getPower(clean) / getPower(noise)``, i.e. a ratio of RMS
    values, which is why ``getSNR_dB`` converts it with ``20 * log10``.

    Args:
        clean: Reference signal.
        noise: Signal treated as the noise term.

    Returns:
        The RMS ratio, or ``inf`` when ``noise`` has zero power.
    """
    speech_power = getPower(clean)
    noise_power = getPower(noise)
    if noise_power == 0:
        return np.inf
    # Return the ratio directly. Rescaling the noise by this same ratio and
    # dividing again cancels out and always yields 1 (0 dB), regardless of
    # the inputs.
    return speech_power / noise_power



def getSNR_dB(snrVal):
    """Convert a linear amplitude ratio to decibels (``20 * log10``)."""
    decibels = np.log10(snrVal) * 20
    return decibels

# TRAINING AND VALIDATION ON EPOCH

# Train on epoch

In [None]:
def revert_features_to_audio(features, phase, noisy_input_fe, cleanMean=None, cleanStd=None):
    """Rebuild a waveform from magnitude features and a phase matrix.

    Args:
        features: Magnitude features; singleton axes are squeezed away and the
            result must line up with ``phase`` transposed.
        phase: 2-D phase matrix in radians; it is transposed before use.
        noisy_input_fe: Object providing ``get_audio_from_stft_spectrogram``,
            used for the inverse STFT.
        cleanMean, cleanStd: Optional statistics used to undo a standardisation
            of ``features``; applied only when both are given.

    Returns:
        Whatever ``noisy_input_fe.get_audio_from_stft_spectrogram`` returns.
    """
    # Compare against None explicitly: a truthiness test skips the rescaling
    # when the mean is 0.0 and raises for array-valued statistics.
    if cleanMean is not None and cleanStd is not None:
        features = cleanStd * features + cleanMean

    phase = np.transpose(phase, (1, 0))
    features = np.squeeze(features)

    # Re-attach the phase that was dropped when the magnitude was taken.
    features = features * np.exp(1j * phase)

    features = np.transpose(features, (1, 0))
    return noisy_input_fe.get_audio_from_stft_spectrogram(features)

In [None]:
from tqdm import tqdm
import soundfile as sf
from pypesq import pesq
from pystoi import stoi

def train_on_epoch(MyModel, loss_fn, optimizer, train_loader, train_dataset, best_diff_stoi=-1,
                   checkpoint_path="/kaggle/working/model_weights/best_expensive.pth"):
    """Train for one pass over ``train_loader`` and checkpoint the best weights.

    For every batch the noisy and clean waveforms are converted to STFT
    magnitudes, the network predicts the clean magnitude, and PESQ / STOI of
    the reconstructed audio are logged to wandb. Whenever the combined
    improvement ``(stoi gain) + 0.65 * (pesq gain)`` beats ``best_diff_stoi``
    the current weights are saved to ``checkpoint_path``.

    Each batch is flattened into a single waveform, so the loader is expected
    to yield one clip per batch.

    Args:
        MyModel: The network to train. If this is not an ``nn.Module``
            instance, the notebook-global ``model`` is used instead, which is
            what this function always did before.
        loss_fn: Loss applied to ``(prediction, clean magnitude)``.
        optimizer: Optimizer stepping the network's parameters.
        train_loader: Iterable of ``(noisy, clean)`` waveform tensors.
        train_dataset: Dataset providing ``window_length``, ``overlap``,
            ``sample_rate`` and ``_phase_aware_scaling``.
        best_diff_stoi: Best combined score seen so far.
        checkpoint_path: Where the best weights are written.

    Returns:
        The best combined score after this epoch.
    """
    # Prefer the network that was passed in; fall back to the global only for
    # callers that passed something else (e.g. the class) as the first argument.
    net = MyModel if isinstance(MyModel, nn.Module) else model
    net.train()

    checkpoint_dir = os.path.dirname(checkpoint_path)
    num_segments = 16  # width of the network's first convolution kernel

    for noise_inp, clean in tqdm(train_loader, desc='Train'):
        noise_inp = torch.flatten(noise_inp).numpy()
        clean = torch.flatten(clean).numpy()

        # STFT magnitude and phase of the noisy input.
        noisy_input_fe = FeatureExtractor(noise_inp, windowLength=train_dataset.window_length,
                                          overlap=train_dataset.overlap,
                                          sample_rate=train_dataset.sample_rate)
        noise_spectrogram = noisy_input_fe.get_stft_spectrogram()
        noise_magnitude = np.abs(noise_spectrogram)
        noise_ph = np.angle(noise_spectrogram)

        # Phase-aware clean magnitude used as the training target.
        clean_audio_fe = FeatureExtractor(clean, windowLength=train_dataset.window_length,
                                          overlap=train_dataset.overlap,
                                          sample_rate=train_dataset.sample_rate)
        clean_spectrogram = clean_audio_fe.get_stft_spectrogram()
        clean_magnitude = train_dataset._phase_aware_scaling(np.abs(clean_spectrogram),
                                                             np.angle(clean_spectrogram), noise_ph)

        # Take the feature count from the spectrogram so other window lengths work too.
        noise_m_f = prepare_input_features(noise_magnitude, numSegments=num_segments,
                                           numFeatures=noise_magnitude.shape[0])
        # (features, segments, frames) -> (frames, 1, features, segments)
        noise_m_f = np.expand_dims(np.transpose(noise_m_f, (2, 0, 1)), axis=1)
        # (features, frames) -> (frames, 1, features, 1)
        clean_magnitude = np.transpose(clean_magnitude, (1, 0))
        clean_magnitude = np.expand_dims(clean_magnitude, axis=2)
        clean_magnitude = np.expand_dims(clean_magnitude, 1)

        x = torch.from_numpy(noise_m_f).to(device)
        y = torch.from_numpy(clean_magnitude).to(device)

        optimizer.zero_grad()
        output = net(x.float())

        loss = loss_fn(output, y.float())
        print("loss: ", loss)
        wandb.log({"loss": loss.item()})

        denoised = revert_features_to_audio(output.cpu().detach().numpy(), noise_ph, noisy_input_fe)

        # The inverse STFT can return fewer samples than the input; the
        # metrics need equally long signals.
        n_samples = min(len(clean), len(noise_inp), len(denoised))
        clean_ref = clean[:n_samples]
        noisy_ref = noise_inp[:n_samples]
        denoised = denoised[:n_samples]

        pesq_noisy = pesq(clean_ref, noisy_ref, train_dataset.sample_rate)
        pesq_denoised = pesq(clean_ref, denoised, train_dataset.sample_rate)
        stoi_noisy = stoi(clean_ref, noisy_ref, train_dataset.sample_rate, extended=False)
        stoi_denoised = stoi(clean_ref, denoised, train_dataset.sample_rate, extended=False)
        wandb.log({"pesq": pesq_denoised})
        wandb.log({"stoi": stoi_denoised})
        wandb.log({"pesq_denoised - pesq_noisy": pesq_denoised - pesq_noisy})
        wandb.log({"stoi_denoised - stoi_noisy": stoi_denoised - stoi_noisy})

        # Compute the score once so the printed value is the one being compared and stored.
        score = (stoi_denoised - stoi_noisy) + (pesq_denoised - pesq_noisy) * 0.65
        if score > best_diff_stoi:
            print(score)
            best_diff_stoi = score
            if checkpoint_dir:
                os.makedirs(checkpoint_dir, exist_ok=True)
            # Saved before the optimizer step: these are the weights that
            # produced the metrics above.
            torch.save(net.state_dict(), checkpoint_path)

        loss.backward()
        optimizer.step()

    return best_diff_stoi

# Valid on epoch

In [None]:
from tqdm import tqdm
import soundfile as sf
from pypesq import pesq
from pystoi import stoi
import gc

def revert_features_to_audio(features, phase, noisy_input_fe, cleanMean=None, cleanStd=None):
    """Rebuild audio from magnitude features and a reference phase.

    Args:
        features: Magnitude array (e.g. the model output or the target).
            Singleton axes are squeezed away before the phase is applied.
        phase: Phase array; it is transposed with ``(1, 0)`` so that it lines
            up with the squeezed ``features``.
        noisy_input_fe: Object exposing ``get_audio_from_stft_spectrogram``,
            used for the final inversion.
        cleanMean: Optional offset used to undo standardisation.
        cleanStd: Optional scale used to undo standardisation.

    Returns:
        Whatever ``noisy_input_fe.get_audio_from_stft_spectrogram`` returns
        for the reconstructed complex spectrogram.
    """
    # Undo the standardisation only when both statistics were supplied.
    # The check is against None on purpose: a mean of 0.0 is a legitimate
    # value, and array-valued statistics have no single truth value.
    if cleanMean is not None and cleanStd is not None:
        features = cleanStd * features + cleanMean

    phase = np.transpose(phase, (1, 0))
    features = np.squeeze(features)

    # Re-attach the phase that was dropped when the magnitude was taken.
    features = features * np.exp(1j * phase)

    features = np.transpose(features, (1, 0))
    return noisy_input_fe.get_audio_from_stft_spectrogram(features)


def evaluate_on_epoch(MyModel, loss_fn, val_loader, val_dataset):
    """Run one validation pass and log averaged PESQ/STOI to wandb.

    For every ``(noisy, clean)`` pair from ``val_loader`` the function builds
    STFT magnitude features, runs the model, reconstructs audio with the noisy
    phase and scores it against the clean waveform.

    Args:
        MyModel: The model instance to evaluate (it is called on the input
            features and switched to eval mode).
        loss_fn: Callable ``loss_fn(output, target)`` returning a tensor.
        val_loader: Iterable yielding ``(noise_inp, clean)`` tensor pairs.
            Each tensor is flattened into a single waveform, so this
            presumably expects batch_size=1 -- TODO confirm.
        val_dataset: Provides ``window_length``, ``overlap``, ``sample_rate``
            and ``_phase_aware_scaling``.

    Returns:
        dict: Per-sample averages under the keys ``mae``, ``pesq_noisy``,
        ``pesq_denoised``, ``stoi_noisy``, ``stoi_denoised``, ``snr_noisy``
        and ``snr_denoised``. Empty when ``val_loader`` yields nothing.

    Side effects:
        Appends the reconstructed denoised / clean audio to the ``list_d`` /
        ``list_c`` lists defined outside this function, logs four values to
        wandb, and triggers ``gc.collect()`` / ``torch.cuda.empty_cache()``.
    """
    # Evaluate the model that was passed in, not whatever global happens to
    # be called ``model``.
    model = MyModel
    model.eval()

    totals = {
        "mae": 0.0,
        "pesq_noisy": 0.0,
        "pesq_denoised": 0.0,
        "stoi_noisy": 0.0,
        "stoi_denoised": 0.0,
        "snr_noisy": 0.0,
        "snr_denoised": 0.0,
    }
    num_samples = 0

    for noise_inp, clean in tqdm(val_loader, desc='Validation'):
        noise_inp = torch.flatten(noise_inp).numpy()
        clean = torch.flatten(clean).numpy()

        # STFT of the noisy input: magnitude feeds the model, phase is reused
        # for the reconstruction.
        noisy_input_fe = FeatureExtractor(noise_inp, windowLength=val_dataset.window_length,
                                          overlap=val_dataset.overlap,
                                          sample_rate=val_dataset.sample_rate)
        noise_spectrogram = noisy_input_fe.get_stft_spectrogram()
        noise_magnitude = np.abs(noise_spectrogram)
        noise_ph = np.angle(noise_spectrogram)

        # STFT of the clean target, rescaled relative to the noisy phase.
        clean_audio_fe = FeatureExtractor(clean, windowLength=val_dataset.window_length,
                                          overlap=val_dataset.overlap,
                                          sample_rate=val_dataset.sample_rate)
        clean_spectrogram = clean_audio_fe.get_stft_spectrogram()
        clean_magnitude = np.abs(clean_spectrogram)
        clean_phase = np.angle(clean_spectrogram)
        clean_magnitude = val_dataset._phase_aware_scaling(clean_magnitude, clean_phase, noise_ph)

        # Arrange the input segments and the target into the layout the model
        # is called with (a singleton axis is inserted at position 1).
        noise_m_f = prepare_input_features(noise_magnitude, numSegments=16, numFeatures=129)
        noise_m_f = np.transpose(noise_m_f, (2, 0, 1))
        noise_m_f = np.expand_dims(noise_m_f, axis=1)
        clean_magnitude = np.transpose(clean_magnitude, (1, 0))
        clean_magnitude = np.expand_dims(clean_magnitude, axis=2)
        clean_magnitude = np.expand_dims(clean_magnitude, 1)

        x = torch.from_numpy(noise_m_f).to(device)
        y = torch.from_numpy(clean_magnitude).to(device)

        # No gradients are needed for validation; skipping them saves memory.
        with torch.no_grad():
            output = model(x.float())
            loss = loss_fn(output, y.float())

        denoisedAudioFullyConvolutional = revert_features_to_audio(
            output.cpu().detach().numpy(), noise_ph, noisy_input_fe)
        inv_clean = revert_features_to_audio(y.cpu().detach().numpy(), noise_ph, noisy_input_fe)
        list_d.append(denoisedAudioFullyConvolutional)
        list_c.append(inv_clean)

        totals["mae"] += loss.item()
        totals["pesq_noisy"] += pesq(clean, noise_inp, val_dataset.sample_rate)
        totals["pesq_denoised"] += pesq(clean, denoisedAudioFullyConvolutional, val_dataset.sample_rate)
        totals["stoi_noisy"] += stoi(clean, noise_inp, val_dataset.sample_rate, extended=False)
        totals["stoi_denoised"] += stoi(clean, denoisedAudioFullyConvolutional,
                                        val_dataset.sample_rate, extended=False)
        totals["snr_noisy"] += getSNR_dB(getSNR(clean, noise_inp))
        totals["snr_denoised"] += getSNR_dB(getSNR(clean, denoisedAudioFullyConvolutional))

        num_samples += 1

    metrics = {}
    # Guard against an empty loader: there is nothing to average or log.
    if num_samples:
        metrics = {key: value / num_samples for key, value in totals.items()}
        pesq_denoised = metrics["pesq_denoised"]
        stoi_denoised = metrics["stoi_denoised"]
        pesq_noisy = metrics["pesq_noisy"]
        stoi_noisy = metrics["stoi_noisy"]
        wandb.log({"val pesq": pesq_denoised})
        wandb.log({"val stoi": stoi_denoised})
        wandb.log({"val pesq_denoised - pesq_noisy": pesq_denoised - pesq_noisy})
        wandb.log({"val stoi_denoised - stoi_noisy": stoi_denoised - stoi_noisy})

    gc.collect()
    torch.cuda.empty_cache()

    return metrics

    

In [None]:
!mkdir -p /kaggle/working/model_weights

# Full training cycle

In [None]:
import torch
import wandb
from mozilla_commonvoice import MozillaCommonVoiceDataset
from urban_sound_8k import UrbanSound8K
# from urban_sound_8K import UrbanSound8K
from findataset import FinDataset
import warnings
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
wandb.init(project='speech_denoising_final_very_cheap')
# Record the learning rate in the wandb run config and reuse the same value
# for the optimizer below.
learning_rate = 1e-3
wandb.config.learning_rate = learning_rate

warnings.filterwarnings(action='ignore')

mozilla_basepath = "../input/commonvoice2/commonvoice"
urbansound_basepath = "../input/urbansound8k"

from torch.optim import Adam

# Keyword arguments forwarded to every FinDataset.
windowLength = 256
config = {'windowLength': windowLength,
          'overlap': round(0.25 * windowLength),
          'fs': 16000,
          'audio_max_duration': 0.8}
num_epochs = 10

loss_fn = torch.nn.L1Loss()
best_diff_stoi = -1

# Build the optimizer once: re-creating Adam every epoch would throw away its
# running moment estimates. `model` is expected to be defined by an earlier
# cell.
optimizer = Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    # NOTE(review): the file lists and datasets are rebuilt every epoch, as in
    # the original code. If the train/val split is deterministic this could be
    # hoisted out of the loop -- confirm before changing.
    mcv = MozillaCommonVoiceDataset(mozilla_basepath, val_dataset_size=100)
    clean_train_filenames, clean_val_filenames = mcv.get_train_val_filenames()

    us8K = UrbanSound8K(urbansound_basepath, val_dataset_size=1000)
    noise_train_filenames, noise_val_filenames = us8K.get_train_val_filenames()

    # Create the training, validation and test datasets.
    val_dataset = FinDataset(clean_val_filenames, noise_val_filenames, **config)
    val_loader = DataLoader(val_dataset, batch_size=1, shuffle=True)
    train_dataset = FinDataset(clean_train_filenames, noise_train_filenames, **config)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    ## Create Test Set
    clean_test_filenames = mcv.get_test_filenames()
    noise_test_filenames = us8K.get_test_filenames()
    test_dataset = FinDataset(clean_test_filenames, noise_test_filenames, **config)

    print("epoch", epoch)
    best_diff_stoi = train_on_epoch(model, loss_fn, optimizer, train_loader, train_dataset, best_diff_stoi)
    # Validation is currently disabled; to enable it:
    # with torch.no_grad():
    #     evaluate_on_epoch(model, loss_fn, val_loader, val_dataset)