In [None]:
! pip install librosa
! pip install glob2
! pip install matplotlib

In [1]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Adam
import librosa
import librosa.display as libd
import IPython.display as ipd
import matplotlib.pylab as plt
import random
from glob import glob
from itertools import cycle


In [2]:
def load_real_audio_files():
    index = {}
    for i in range(1, 61):
        index[i] = glob(f"./MS-SNSD/CleanSpeech_training/clnsp{i}.wav")
    return index
        

In [3]:
def load_noisy_audio_files():
    index= {}
    for i in range(1, 61):
        index[i] = glob(f"./MS-SNSD/NoisySpeech_training/*clnsp{i}.wav")
    return index

In [4]:
# Load real audio and random noise files
real_audio_file_index = load_real_audio_files()
noisy_audio_file_index = load_noisy_audio_files()

In [45]:
import numpy as np
import librosa
import tensorflow as tf
from tensorflow.keras import models, layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Conv1D, BatchNormalization, MaxPooling2D, Dense, Flatten, LSTM


# Define the CNN model for denoising
def build_cnn_model(input_shape):
    '''model = models.Sequential([
        layers.Conv1D(64, 50, padding='same', activation='relu', input_shape=input_shape),
        layers.Conv1D(64, 50, padding='same', activation='relu'),
        layers.Conv1D(1, 50, padding='same', activation='linear')  # Output denoised audio
    ])'''

   model = Sequential()
    # CNN layers suitable for 1D data
    model.add(Conv1D(16, 3, activation='relu', padding='same', input_shape=input_shape))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(2))
    model.add(Conv1D(32, 3, activation='relu', padding='same'))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(2))
    model.add(Conv1D(64, 3, activation='relu', padding='same'))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(2))
    model.add(Flatten())
    model.add(Dense(256, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Adjust the final layer based on your specific need
    return model


# Define the generator network for detail enhancement
def build_generator(input_shape):
    model = models.Sequential([
        layers.Input(shape=input_shape),
        layers.Dense(256, activation='relu'),
        layers.Dense(512, activation='relu'),
        layers.Dense(1024, activation='relu'),
        # layers.Reshape((input_shape[0], 1)),
        layers.Dense(input_shape[0], activation='tanh')  # Output shape matches input shape
    ])
    return model

# Define the discriminator network
def build_discriminator(input_shape):
    model = models.Sequential([
        #layers.Flatten(input_shape=input_shape),  # Flatten input if necessary
        layers.Dense(512, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(256, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(1, activation='sigmoid')
    ])
    return model

def build_gan(generator, discriminator):
    discriminator.trainable = False
    gan_input = tf.keras.Input(shape=(max_length, 1))
    generated_samples = generator(gan_input)
    gan_output = discriminator(generated_samples)
    gan = tf.keras.Model(inputs=gan_input, outputs=gan_output)
    gan.compile(optimizer='adam', loss='binary_crossentropy')
    return gan

# Function to find the maximum length of audio files
def find_max_length(audio_files):
    max_length = 0
    for files in audio_files.values():
        for file_path in files:
            audio, _ = librosa.load(file_path, sr=16000)
            if len(audio) > max_length:
                max_length = len(audio)
    return max_length

# Load audio and pad to the maximum length
def load_and_pad_audio(file_path, max_length, sr=16000):
    audio, _ = librosa.load(file_path, sr=sr)
    if len(audio) < max_length:
        audio = np.pad(audio, (0, max_length - len(audio)), mode='constant')
    return audio[:max_length]

# Prepare the dataset
def prepare_dataset(clean_audio_dict, noisy_audio_dict, max_length, sr=16000):
    X_train = []
    y_train = []

    for key in clean_audio_dict.keys():
        clean_files = clean_audio_dict[key]
        noisy_files_list = noisy_audio_dict[key]

        for clean_file in clean_files:
            clean_audio = load_and_pad_audio(clean_file, max_length, sr)
            for noisy_file in noisy_files_list:
                noisy_audio = load_and_pad_audio(noisy_file, max_length, sr)
                X_train.append(noisy_audio)
                y_train.append(clean_audio)

    return np.array(X_train).reshape(-1, max_length, 1), np.array(y_train).reshape(-1, max_length, 1)


def train_models(cnn_model, X_train, y_train, epochs=10, batch_size=32):
     for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}")
        num_batches = int(np.ceil(len(X_train) / batch_size))

        for i in range(num_batches):
            start_idx = i * batch_size
            end_idx = min((i + 1) * batch_size, len(X_train))
            X_batch = X_train[start_idx:end_idx]
            y_batch = y_train[start_idx:end_idx]

            # Reshape batches to match the input requirements explicitly
            X_batch = np.reshape(X_batch, (len(X_batch), -1, 1))
            y_batch = np.reshape(y_batch, (len(y_batch), -1, 1))

            # Train CNN
            cnn_loss = cnn_model.train_on_batch(X_batch, y_batch)

            # Generate fake examples
            # generated_samples = generator.predict(X_batch)
            # generated_samples_noisy = generated_samples + 0.05 * np.random.normal(loc=0.0, scale=1.0, size=generated_samples.shape)

            # Train discriminator
            # real_labels = np.ones((len(y_batch), 1))
            # fake_labels = np.zeros((len(generated_samples), 1))

            # d_loss_real = discriminator.train_on_batch(y_batch, real_labels)
            # d_loss_fake = discriminator.train_on_batch(generated_samples_noisy, fake_labels)
            # d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

            # Train generator via GAN
            # g_loss = gan.train_on_batch(X_batch, np.ones((len(X_batch), 1)))

            # print(f"Batch {i+1}/{num_batches}, CNN Loss: {cnn_loss}, D Loss: {d_loss}, G Loss: {g_loss}")

        print("-------------------------------")

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 18)

In [46]:
all_audio_files = {**real_audio_file_index, **noisy_audio_file_index}
max_length = find_max_length(all_audio_files)

X_train, y_train = prepare_dataset(real_audio_file_index, noisy_audio_file_index, max_length)


# Define model input shape
input_shape = (max_length, 1)

# Build and compile models
cnn_model = build_cnn_model(input_shape)
cnn_model.compile(optimizer='adam', loss='mse')

# generator = build_generator(input_shape)
# discriminator = build_discriminator(input_shape)
# discriminator.compile(optimizer='adam', loss='binary_crossentropy')

# gan = build_gan(generator, discriminator)
# gan.compile(optimizer='adam', loss='binary_crossentropy')

train_models(cnn_model, X_train, y_train)

ValueError: Input 0 of layer "conv2d_2" is incompatible with the layer: expected min_ndim=4, found ndim=3. Full shape received: (None, 302769, 1)

In [24]:
def process_audio(file_path, cnn_model, max_length, sr=16000):
    # Load and pad the audio file
    audio = load_and_pad_audio(file_path, max_length, sr)
    audio = audio.reshape(1, max_length, 1)  # Reshape for the model input

    # Apply the CNN to reduce noise
    denoised_audio = cnn_model.predict(audio)

    # Use the generator to regenerate details
    # regenerated_audio = generator.predict(denoised_audio)

    return denoised_audio.flatten() #, regenerated_audio.flatten()


In [31]:
# Playing the original denoised audio
ipd.display(ipd.Audio(load_and_pad_audio("./noisy1_SNRdb_0.0_clnsp1.wav", max_length), rate=16000))

# Playing the regenerated audio
ipd.display(ipd.Audio(process_audio("./noisy1_SNRdb_0.0_clnsp1.wav", cnn_model, max_length), rate=16000 ))

