In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# changing the working directory
%cd '/content/drive/MyDrive/project_without_artifact'

/content/drive/MyDrive/project_without_artifact


In [3]:
!ls

feature_extraction_training_and_valdiation_hw.ipynb    Training_MS_PCEN_default
feature_extraction_training_and_valdiation_nohw.ipynb  Validation_Audio_Files
model.ipynb					       Validation_MS_dB
Training_Audio_Files				       Validation_MS_PCEN
Training_MS_dB					       Validation_MS_PCEN_default
Training_MS_PCEN


### MS-PCEN for Training (NoHW)

In [4]:
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

audio_dir = r'/content/drive/MyDrive/project_without_artifact/Training_Audio_Files/NoHW'
spectrogram_dir = r'/content/drive/MyDrive/project_without_artifact/Training_MS_PCEN/NoHW'

os.makedirs(spectrogram_dir, exist_ok=True)

count = 0

for audio_file in os.listdir(audio_dir):
    count = count + 1
    print(f"The data sample {audio_file} is processed: {count}/{len(os.listdir(audio_dir))}")
    audio_path = os.path.join(audio_dir, audio_file)
    audio, sr = librosa.load(audio_path, sr=6000)

    # Mel
    spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=2048, hop_length=512, win_length=2048, n_mels=64, power=1, center=False) # I think he was using 128 as default
    spectrogram = np.concatenate((spectrogram, spectrogram), axis=1)
    mel_pcen = librosa.pcen(spectrogram*(2**31), sr=sr, hop_length=512, gain=9.99686079e-01, bias=3.17143741e+00, power=3.22058271e-01, time_constant=2.51005792e-03, eps=5.09658383e-07, max_size=1)
    mel_pcen = mel_pcen[:, mel_pcen.shape[1]//2:]
    plt.figure(figsize=(8, 4))
    librosa.display.specshow(mel_pcen, sr=sr, x_axis='time', y_axis='mel')
    plt.tight_layout()
    file_name = os.path.splitext(audio_file)[0]
    save_path = os.path.join(spectrogram_dir, f'{file_name}.png')
    plt.axis('off')
    plt.savefig(save_path, bbox_inches='tight', pad_inches=0)
    plt.close()
    # Following 6 lines are to convert CYMK images to RGB images with the grayscale values ranging between 0-255
    spec_image = Image.open(save_path)
    spec_image_rgb = spec_image.convert('RGB')
    rgb_array = np.array(spec_image_rgb)
    rgb_array_norm = 255*((rgb_array-np.min(rgb_array))/(np.max(rgb_array)-np.min(rgb_array)))
    rgb_image_spec = Image.fromarray(rgb_array_norm.astype(np.uint8))
    rgb_image_spec.save(save_path)

The data sample 5347A925_150.wav is processed: 1/310
The data sample 5369F72D_130.wav is processed: 2/310
The data sample 534F59E6_200.wav is processed: 3/310
The data sample 53978F0D_0.wav is processed: 4/310
The data sample 53CAA1A9_200.wav is processed: 5/310
The data sample 53CE1EA1_0.wav is processed: 6/310
The data sample 5385A6AD_230.wav is processed: 7/310
The data sample 5453DD4E_280.wav is processed: 8/310
The data sample 53A0A2E2_0.wav is processed: 9/310
The data sample 53A0A2E2_200.wav is processed: 10/310
The data sample 5439C921_280.wav is processed: 11/310
The data sample 534F59E6_250.wav is processed: 12/310
The data sample 538B8055_30.wav is processed: 13/310
The data sample 5377B2D6_220.wav is processed: 14/310
The data sample 534F59E6_120.wav is processed: 15/310
The data sample 5377B2D6_0.wav is processed: 16/310
The data sample 5350AEEA_120.wav is processed: 17/310
The data sample 53CE1EA1_190.wav is processed: 18/310
The data sample 5367F5E6_200.wav is processed:

### MS-PCEN for Validation (NoHW)

In [5]:
audio_dir = r'/content/drive/MyDrive/project_without_artifact/Validation_Audio_Files/NoHW'
spectrogram_dir = r'/content/drive/MyDrive/project_without_artifact/Validation_MS_PCEN/NoHW'

os.makedirs(spectrogram_dir, exist_ok=True)

count = 0

for audio_file in os.listdir(audio_dir):
    count = count + 1
    print(f"The data sample {audio_file} is processed: {count}/{len(os.listdir(audio_dir))}")
    audio_path = os.path.join(audio_dir, audio_file)
    audio, sr = librosa.load(audio_path, sr=6000)

    # Mel
    spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=2048, hop_length=512, win_length=2048, n_mels=64, power=1, center=False) # I think he was using 128 as default
    spectrogram = np.concatenate((spectrogram, spectrogram), axis=1)
    mel_pcen = librosa.pcen(spectrogram*(2**31), sr=sr, hop_length=512, gain=9.99686079e-01, bias=3.17143741e+00, power=3.22058271e-01, time_constant=2.51005792e-03, eps=5.09658383e-07, max_size=1)
    mel_pcen = mel_pcen[:, mel_pcen.shape[1]//2:]
    plt.figure(figsize=(8, 4))
    librosa.display.specshow(mel_pcen, sr=sr, x_axis='time', y_axis='mel')
    plt.tight_layout()
    file_name = os.path.splitext(audio_file)[0]
    save_path = os.path.join(spectrogram_dir, f'{file_name}.png')
    plt.axis('off')
    plt.savefig(save_path, bbox_inches='tight', pad_inches=0)
    plt.close()
    # Following 6 lines are to convert CYMK images to RGB images with the grayscale values ranging between 0-255
    spec_image = Image.open(save_path)
    spec_image_rgb = spec_image.convert('RGB')
    rgb_array = np.array(spec_image_rgb)
    rgb_array_norm = 255*((rgb_array-np.min(rgb_array))/(np.max(rgb_array)-np.min(rgb_array)))
    rgb_image_spec = Image.fromarray(rgb_array_norm.astype(np.uint8))
    rgb_image_spec.save(save_path)

The data sample 5367F5E6_50.wav is processed: 1/100
The data sample 53D94FDD_130.wav is processed: 2/100
The data sample 53D94FDD_0.wav is processed: 3/100
The data sample 53F2A9B5_120.wav is processed: 4/100
The data sample 53698329_200.wav is processed: 5/100
The data sample 53D42981_100.wav is processed: 6/100
The data sample 5354B881_150.wav is processed: 7/100
The data sample 5367F5E6_0.wav is processed: 8/100
The data sample 53472711_130.wav is processed: 9/100
The data sample 53F2A9B5_200.wav is processed: 10/100
The data sample 54261435_130.wav is processed: 11/100
The data sample 54261435_280.wav is processed: 12/100
The data sample 52ED3E45_140.wav is processed: 13/100
The data sample 53663769_240.wav is processed: 14/100
The data sample 538C05ED_170.wav is processed: 15/100
The data sample 52AB2916_0.wav is processed: 16/100
The data sample 5354B881_40.wav is processed: 17/100
The data sample 53908005_10.wav is processed: 18/100
The data sample 53472711_70.wav is processed: 

### Mel-spectrogram (dB) for Training (NoHW)

In [6]:
audio_dir = r'/content/drive/MyDrive/project_without_artifact/Training_Audio_Files/NoHW'
spectrogram_dir = r'/content/drive/MyDrive/project_without_artifact/Training_MS_dB/NoHW'

os.makedirs(spectrogram_dir, exist_ok=True)

count = 0

for audio_file in os.listdir(audio_dir):
    count = count + 1
    print(f"The data sample {audio_file} is processed: {count}/{len(os.listdir(audio_dir))}")
    audio_path = os.path.join(audio_dir, audio_file)
    audio, sr = librosa.load(audio_path, sr=6000)

    # Mel
    spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=2048, hop_length=512, win_length=2048, n_mels=64, power=1, center=False) # I think he was using 128 as default
    spectrogram = 20 * np.log10(spectrogram + 1e-6)

    plt.figure(figsize=(8, 4))
    librosa.display.specshow(spectrogram, sr=sr, x_axis='time', y_axis='mel')
    plt.tight_layout()
    file_name = os.path.splitext(audio_file)[0]
    save_path = os.path.join(spectrogram_dir, f'{file_name}.png')
    plt.axis('off')
    plt.savefig(save_path, bbox_inches='tight', pad_inches=0)
    plt.close()
    # Following 6 lines are to convert CYMK images to RGB images with the grayscale values ranging between 0-255
    spec_image = Image.open(save_path)
    spec_image_rgb = spec_image.convert('RGB')
    rgb_array = np.array(spec_image_rgb)
    rgb_array_norm = 255*((rgb_array-np.min(rgb_array))/(np.max(rgb_array)-np.min(rgb_array)))
    rgb_image_spec = Image.fromarray(rgb_array_norm.astype(np.uint8))
    rgb_image_spec.save(save_path)

The data sample 5347A925_150.wav is processed: 1/310
The data sample 5369F72D_130.wav is processed: 2/310
The data sample 534F59E6_200.wav is processed: 3/310
The data sample 53978F0D_0.wav is processed: 4/310
The data sample 53CAA1A9_200.wav is processed: 5/310
The data sample 53CE1EA1_0.wav is processed: 6/310
The data sample 5385A6AD_230.wav is processed: 7/310
The data sample 5453DD4E_280.wav is processed: 8/310
The data sample 53A0A2E2_0.wav is processed: 9/310
The data sample 53A0A2E2_200.wav is processed: 10/310
The data sample 5439C921_280.wav is processed: 11/310
The data sample 534F59E6_250.wav is processed: 12/310
The data sample 538B8055_30.wav is processed: 13/310
The data sample 5377B2D6_220.wav is processed: 14/310
The data sample 534F59E6_120.wav is processed: 15/310
The data sample 5377B2D6_0.wav is processed: 16/310
The data sample 5350AEEA_120.wav is processed: 17/310
The data sample 53CE1EA1_190.wav is processed: 18/310
The data sample 5367F5E6_200.wav is processed:

### Mel-spectrogram (dB) for Validation (NoHW)

In [7]:
audio_dir = r'/content/drive/MyDrive/project_without_artifact/Validation_Audio_Files/NoHW'
spectrogram_dir = r'/content/drive/MyDrive/project_without_artifact/Validation_MS_dB/NoHW'

os.makedirs(spectrogram_dir, exist_ok=True)

count = 0

for audio_file in os.listdir(audio_dir):
    count = count + 1
    print(f"The data sample {audio_file} is processed: {count}/{len(os.listdir(audio_dir))}")
    audio_path = os.path.join(audio_dir, audio_file)
    audio, sr = librosa.load(audio_path, sr=6000)

    # Mel
    spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=2048, hop_length=512, win_length=2048, n_mels=64, power=1, center=False) # I think he was using 128 as default
    spectrogram = 20 * np.log10(spectrogram + 1e-6)

    plt.figure(figsize=(8, 4))
    librosa.display.specshow(spectrogram, sr=sr, x_axis='time', y_axis='mel')
    plt.tight_layout()
    file_name = os.path.splitext(audio_file)[0]
    save_path = os.path.join(spectrogram_dir, f'{file_name}.png')
    plt.axis('off')
    plt.savefig(save_path, bbox_inches='tight', pad_inches=0)
    plt.close()
    # Following 6 lines are to convert CYMK images to RGB images with the grayscale values ranging between 0-255
    spec_image = Image.open(save_path)
    spec_image_rgb = spec_image.convert('RGB')
    rgb_array = np.array(spec_image_rgb)
    rgb_array_norm = 255*((rgb_array-np.min(rgb_array))/(np.max(rgb_array)-np.min(rgb_array)))
    rgb_image_spec = Image.fromarray(rgb_array_norm.astype(np.uint8))
    rgb_image_spec.save(save_path)

The data sample 5367F5E6_50.wav is processed: 1/100
The data sample 53D94FDD_130.wav is processed: 2/100
The data sample 53D94FDD_0.wav is processed: 3/100
The data sample 53F2A9B5_120.wav is processed: 4/100
The data sample 53698329_200.wav is processed: 5/100
The data sample 53D42981_100.wav is processed: 6/100
The data sample 5354B881_150.wav is processed: 7/100
The data sample 5367F5E6_0.wav is processed: 8/100
The data sample 53472711_130.wav is processed: 9/100
The data sample 53F2A9B5_200.wav is processed: 10/100
The data sample 54261435_130.wav is processed: 11/100
The data sample 54261435_280.wav is processed: 12/100
The data sample 52ED3E45_140.wav is processed: 13/100
The data sample 53663769_240.wav is processed: 14/100
The data sample 538C05ED_170.wav is processed: 15/100
The data sample 52AB2916_0.wav is processed: 16/100
The data sample 5354B881_40.wav is processed: 17/100
The data sample 53908005_10.wav is processed: 18/100
The data sample 53472711_70.wav is processed: 

### MS-PCEN with default for Training (NoHW)

In [8]:
audio_dir = r'/content/drive/MyDrive/project_without_artifact/Training_Audio_Files/NoHW'
spectrogram_dir = r'/content/drive/MyDrive/project_without_artifact/Training_MS_PCEN_default/NoHW'

os.makedirs(spectrogram_dir, exist_ok=True)

count = 0

for audio_file in os.listdir(audio_dir):
    count = count + 1
    print(f"The data sample {audio_file} is processed: {count}/{len(os.listdir(audio_dir))}")
    audio_path = os.path.join(audio_dir, audio_file)
    audio, sr = librosa.load(audio_path, sr=6000)

    # Mel
    spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=2048, hop_length=512, win_length=2048, n_mels=64, power=1, center=False) # I think he was using 128 as default
    spectrogram = np.concatenate((spectrogram, spectrogram), axis=1)
    mel_pcen = librosa.pcen(spectrogram*(2**31), sr=sr, hop_length=512, gain=0.98, bias=2, power=0.5, time_constant=0.4, eps=1e-06, max_size=1)
    mel_pcen = mel_pcen[:, mel_pcen.shape[1]//2:]
    plt.figure(figsize=(8, 4))
    librosa.display.specshow(mel_pcen, sr=sr, x_axis='time', y_axis='mel')
    plt.tight_layout()
    file_name = os.path.splitext(audio_file)[0]
    save_path = os.path.join(spectrogram_dir, f'{file_name}.png')
    plt.axis('off')
    plt.savefig(save_path, bbox_inches='tight', pad_inches=0)
    plt.close()
    # Following 6 lines are to convert CYMK images to RGB images with the grayscale values ranging between 0-255
    spec_image = Image.open(save_path)
    spec_image_rgb = spec_image.convert('RGB')
    rgb_array = np.array(spec_image_rgb)
    rgb_array_norm = 255*((rgb_array-np.min(rgb_array))/(np.max(rgb_array)-np.min(rgb_array)))
    rgb_image_spec = Image.fromarray(rgb_array_norm.astype(np.uint8))
    rgb_image_spec.save(save_path)

The data sample 5347A925_150.wav is processed: 1/310
The data sample 5369F72D_130.wav is processed: 2/310
The data sample 534F59E6_200.wav is processed: 3/310
The data sample 53978F0D_0.wav is processed: 4/310
The data sample 53CAA1A9_200.wav is processed: 5/310
The data sample 53CE1EA1_0.wav is processed: 6/310
The data sample 5385A6AD_230.wav is processed: 7/310
The data sample 5453DD4E_280.wav is processed: 8/310
The data sample 53A0A2E2_0.wav is processed: 9/310
The data sample 53A0A2E2_200.wav is processed: 10/310
The data sample 5439C921_280.wav is processed: 11/310
The data sample 534F59E6_250.wav is processed: 12/310
The data sample 538B8055_30.wav is processed: 13/310
The data sample 5377B2D6_220.wav is processed: 14/310
The data sample 534F59E6_120.wav is processed: 15/310
The data sample 5377B2D6_0.wav is processed: 16/310
The data sample 5350AEEA_120.wav is processed: 17/310
The data sample 53CE1EA1_190.wav is processed: 18/310
The data sample 5367F5E6_200.wav is processed:

### MS-PCEN with default for Validation (NoHW)

In [9]:
audio_dir = r'/content/drive/MyDrive/project_without_artifact/Validation_Audio_Files/NoHW'
spectrogram_dir = r'/content/drive/MyDrive/project_without_artifact/Validation_MS_PCEN_default/NoHW'

os.makedirs(spectrogram_dir, exist_ok=True)

count = 0

for audio_file in os.listdir(audio_dir):
    count = count + 1
    print(f"The data sample {audio_file} is processed: {count}/{len(os.listdir(audio_dir))}")
    audio_path = os.path.join(audio_dir, audio_file)
    audio, sr = librosa.load(audio_path, sr=6000)

    # Mel
    spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=2048, hop_length=512, win_length=2048, n_mels=64, power=1, center=False) # I think he was using 128 as default
    spectrogram = np.concatenate((spectrogram, spectrogram), axis=1)
    mel_pcen = librosa.pcen(spectrogram*(2**31), sr=sr, hop_length=512, gain=0.98, bias=2, power=0.5, time_constant=0.4, eps=1e-06, max_size=1)
    mel_pcen = mel_pcen[:, mel_pcen.shape[1]//2:]
    plt.figure(figsize=(8, 4))
    librosa.display.specshow(mel_pcen, sr=sr, x_axis='time', y_axis='mel')
    plt.tight_layout()
    file_name = os.path.splitext(audio_file)[0]
    save_path = os.path.join(spectrogram_dir, f'{file_name}.png')
    plt.axis('off')
    plt.savefig(save_path, bbox_inches='tight', pad_inches=0)
    plt.close()
    # Following 6 lines are to convert CYMK images to RGB images with the grayscale values ranging between 0-255
    spec_image = Image.open(save_path)
    spec_image_rgb = spec_image.convert('RGB')
    rgb_array = np.array(spec_image_rgb)
    rgb_array_norm = 255*((rgb_array-np.min(rgb_array))/(np.max(rgb_array)-np.min(rgb_array)))
    rgb_image_spec = Image.fromarray(rgb_array_norm.astype(np.uint8))
    rgb_image_spec.save(save_path)

The data sample 5367F5E6_50.wav is processed: 1/100
The data sample 53D94FDD_130.wav is processed: 2/100
The data sample 53D94FDD_0.wav is processed: 3/100
The data sample 53F2A9B5_120.wav is processed: 4/100
The data sample 53698329_200.wav is processed: 5/100
The data sample 53D42981_100.wav is processed: 6/100
The data sample 5354B881_150.wav is processed: 7/100
The data sample 5367F5E6_0.wav is processed: 8/100
The data sample 53472711_130.wav is processed: 9/100
The data sample 53F2A9B5_200.wav is processed: 10/100
The data sample 54261435_130.wav is processed: 11/100
The data sample 54261435_280.wav is processed: 12/100
The data sample 52ED3E45_140.wav is processed: 13/100
The data sample 53663769_240.wav is processed: 14/100
The data sample 538C05ED_170.wav is processed: 15/100
The data sample 52AB2916_0.wav is processed: 16/100
The data sample 5354B881_40.wav is processed: 17/100
The data sample 53908005_10.wav is processed: 18/100
The data sample 53472711_70.wav is processed: 