In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive; every data
# path used below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Change the working directory to the project folder on the mounted Drive.
%cd '/content/drive/MyDrive/project_without_artifact'

/content/drive/MyDrive/project_without_artifact


In [3]:
# List the current working directory to check which notebooks and data folders are present.
!ls

feature_extraction_training_and_valdiation_hw.ipynb    Training_MS_PCEN_default
feature_extraction_training_and_valdiation_nohw.ipynb  Validation_Audio_Files
model.ipynb					       Validation_MS_dB
Training_Audio_Files				       Validation_MS_PCEN
Training_MS_dB					       Validation_MS_PCEN_default
Training_MS_PCEN


### MS-PCEN for Training (HW)

In [4]:
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

# Render every audio file in `audio_dir` as an MS-PCEN (per-channel energy
# normalised mel spectrogram) image and save it as a PNG in `spectrogram_dir`.
audio_dir = r'/content/drive/MyDrive/project_without_artifact/Training_Audio_Files/HW'
spectrogram_dir = r'/content/drive/MyDrive/project_without_artifact/Training_MS_PCEN/HW'

os.makedirs(spectrogram_dir, exist_ok=True)

# List the folder once up front: the total does not change while the loop
# runs, so there is no reason to rescan the directory for every file.
audio_files = os.listdir(audio_dir)
total_files = len(audio_files)

for count, audio_file in enumerate(audio_files, start=1):
    print(f"The data sample {audio_file} is processed: {count}/{total_files}")
    audio_path = os.path.join(audio_dir, audio_file)
    audio, sr = librosa.load(audio_path, sr=6000)  # resampled to 6 kHz on load

    # Magnitude (power=1) mel spectrogram with 64 mel bands.
    # NOTE(review): librosa's own default is n_mels=128; the reference setup
    # may have relied on that default - TODO confirm that 64 is intended.
    spectrogram = librosa.feature.melspectrogram(
        y=audio, sr=sr, n_fft=2048, hop_length=512, win_length=2048,
        n_mels=64, power=1, center=False,
    )

    # PCEN is run on two back-to-back copies of the spectrogram and only the
    # second copy is kept - presumably so the PCEN smoother has settled before
    # the frames that are retained. The 2**31 factor follows librosa's advice
    # to rescale floating-point input to the 32-bit integer range.
    # NOTE(review): the PCEN parameters look like fitted values - TODO record
    # where they come from.
    spectrogram_doubled = np.concatenate((spectrogram, spectrogram), axis=1)
    mel_pcen = librosa.pcen(
        spectrogram_doubled * (2**31), sr=sr, hop_length=512,
        gain=9.99686079e-01, bias=3.17143741e+00, power=3.22058271e-01,
        time_constant=2.51005792e-03, eps=5.09658383e-07, max_size=1,
    )
    mel_pcen = mel_pcen[:, mel_pcen.shape[1] // 2:]

    file_name = os.path.splitext(audio_file)[0]
    save_path = os.path.join(spectrogram_dir, f'{file_name}.png')

    fig = plt.figure(figsize=(8, 4))
    try:
        librosa.display.specshow(mel_pcen, sr=sr, x_axis='time', y_axis='mel')
        plt.tight_layout()
        plt.axis('off')
        plt.savefig(save_path, bbox_inches='tight', pad_inches=0)
    finally:
        # Always release the figure, even when rendering or saving fails, so
        # a long run does not accumulate open figures.
        plt.close(fig)

    # matplotlib writes an RGBA PNG: drop the alpha channel, then stretch the
    # pixel values so they span the full 0-255 range.
    with Image.open(save_path) as spec_image:
        rgb_array = np.array(spec_image.convert('RGB'))
    pixel_min = rgb_array.min()
    pixel_max = rgb_array.max()
    if pixel_max > pixel_min:
        rgb_array_norm = 255 * ((rgb_array - pixel_min) / (pixel_max - pixel_min))
    else:
        # A constant image has nothing to stretch; dividing by zero here would
        # fill the saved picture with NaN-derived garbage.
        rgb_array_norm = rgb_array
    rgb_image_spec = Image.fromarray(rgb_array_norm.astype(np.uint8))
    rgb_image_spec.save(save_path)

The data sample 54199C05_200.wav is processed: 1/355
The data sample 543CF1B9_270.wav is processed: 2/355
The data sample 5438CF85_280.wav is processed: 3/355
The data sample 54179031_100.wav is processed: 4/355
The data sample 5438A8DA_130.wav is processed: 5/355
The data sample 5428E165_200.wav is processed: 6/355
The data sample 5415764D_100.wav is processed: 7/355
The data sample 5423A5F1_200.wav is processed: 8/355
The data sample 5415764D_150.wav is processed: 9/355
The data sample 54498285_150.wav is processed: 10/355
The data sample 5432CF32_100.wav is processed: 11/355
The data sample 543FFAAD_280.wav is processed: 12/355
The data sample 542E00B9_0.wav is processed: 13/355
The data sample 54178CAE_200.wav is processed: 14/355
The data sample 54199C05_60.wav is processed: 15/355
The data sample 542455B9_250.wav is processed: 16/355
The data sample 543FFAAD_170.wav is processed: 17/355
The data sample 543CF1B9_200.wav is processed: 18/355
The data sample 543FFAAD_50.wav is proce

### MS-PCEN for Validation (HW)

In [5]:
# Render every audio file in `audio_dir` as an MS-PCEN (per-channel energy
# normalised mel spectrogram) image and save it as a PNG in `spectrogram_dir`.
audio_dir = r'/content/drive/MyDrive/project_without_artifact/Validation_Audio_Files/HW'
spectrogram_dir = r'/content/drive/MyDrive/project_without_artifact/Validation_MS_PCEN/HW'

os.makedirs(spectrogram_dir, exist_ok=True)

# List the folder once up front: the total does not change while the loop
# runs, so there is no reason to rescan the directory for every file.
audio_files = os.listdir(audio_dir)
total_files = len(audio_files)

for count, audio_file in enumerate(audio_files, start=1):
    print(f"The data sample {audio_file} is processed: {count}/{total_files}")
    audio_path = os.path.join(audio_dir, audio_file)
    audio, sr = librosa.load(audio_path, sr=6000)  # resampled to 6 kHz on load

    # Magnitude (power=1) mel spectrogram with 64 mel bands.
    # NOTE(review): librosa's own default is n_mels=128; the reference setup
    # may have relied on that default - TODO confirm that 64 is intended.
    spectrogram = librosa.feature.melspectrogram(
        y=audio, sr=sr, n_fft=2048, hop_length=512, win_length=2048,
        n_mels=64, power=1, center=False,
    )

    # PCEN is run on two back-to-back copies of the spectrogram and only the
    # second copy is kept - presumably so the PCEN smoother has settled before
    # the frames that are retained. The 2**31 factor follows librosa's advice
    # to rescale floating-point input to the 32-bit integer range.
    # NOTE(review): the PCEN parameters look like fitted values - TODO record
    # where they come from.
    spectrogram_doubled = np.concatenate((spectrogram, spectrogram), axis=1)
    mel_pcen = librosa.pcen(
        spectrogram_doubled * (2**31), sr=sr, hop_length=512,
        gain=9.99686079e-01, bias=3.17143741e+00, power=3.22058271e-01,
        time_constant=2.51005792e-03, eps=5.09658383e-07, max_size=1,
    )
    mel_pcen = mel_pcen[:, mel_pcen.shape[1] // 2:]

    file_name = os.path.splitext(audio_file)[0]
    save_path = os.path.join(spectrogram_dir, f'{file_name}.png')

    fig = plt.figure(figsize=(8, 4))
    try:
        librosa.display.specshow(mel_pcen, sr=sr, x_axis='time', y_axis='mel')
        plt.tight_layout()
        plt.axis('off')
        plt.savefig(save_path, bbox_inches='tight', pad_inches=0)
    finally:
        # Always release the figure, even when rendering or saving fails, so
        # a long run does not accumulate open figures.
        plt.close(fig)

    # matplotlib writes an RGBA PNG: drop the alpha channel, then stretch the
    # pixel values so they span the full 0-255 range.
    with Image.open(save_path) as spec_image:
        rgb_array = np.array(spec_image.convert('RGB'))
    pixel_min = rgb_array.min()
    pixel_max = rgb_array.max()
    if pixel_max > pixel_min:
        rgb_array_norm = 255 * ((rgb_array - pixel_min) / (pixel_max - pixel_min))
    else:
        # A constant image has nothing to stretch; dividing by zero here would
        # fill the saved picture with NaN-derived garbage.
        rgb_array_norm = rgb_array
    rgb_image_spec = Image.fromarray(rgb_array_norm.astype(np.uint8))
    rgb_image_spec.save(save_path)

The data sample 54498285_200.wav is processed: 1/100
The data sample 54336D64_0.wav is processed: 2/100
The data sample 54363E1A_120.wav is processed: 3/100
The data sample 542D8CB6_160.wav is processed: 4/100
The data sample 542C5AD9_190.wav is processed: 5/100
The data sample 541EF71D_50.wav is processed: 6/100
The data sample 542D8CB6_250.wav is processed: 7/100
The data sample 542D8CB6_0.wav is processed: 8/100
The data sample 541EF71D_140.wav is processed: 9/100
The data sample 542C5AD9_0.wav is processed: 10/100
The data sample 541F598E_50.wav is processed: 11/100
The data sample 54274D1A_60.wav is processed: 12/100
The data sample 53CF14B9_0.wav is processed: 13/100
The data sample 542C5AD9_150.wav is processed: 14/100
The data sample 542C5AD9_50.wav is processed: 15/100
The data sample 54419D09_100.wav is processed: 16/100
The data sample 542C5AD9_100.wav is processed: 17/100
The data sample 541F598E_200.wav is processed: 18/100
The data sample 541EF71D_170.wav is processed: 19

### Mel-spectrogram (dB) for Training (HW)

In [6]:
# Render every audio file in `audio_dir` as a dB-scaled mel spectrogram image
# and save it as a PNG in `spectrogram_dir`.
audio_dir = r'/content/drive/MyDrive/project_without_artifact/Training_Audio_Files/HW'
spectrogram_dir = r'/content/drive/MyDrive/project_without_artifact/Training_MS_dB/HW'

os.makedirs(spectrogram_dir, exist_ok=True)

# List the folder once up front: the total does not change while the loop
# runs, so there is no reason to rescan the directory for every file.
audio_files = os.listdir(audio_dir)
total_files = len(audio_files)

for count, audio_file in enumerate(audio_files, start=1):
    print(f"The data sample {audio_file} is processed: {count}/{total_files}")
    audio_path = os.path.join(audio_dir, audio_file)
    audio, sr = librosa.load(audio_path, sr=6000)  # resampled to 6 kHz on load

    # Magnitude (power=1) mel spectrogram with 64 mel bands.
    # NOTE(review): librosa's own default is n_mels=128; the reference setup
    # may have relied on that default - TODO confirm that 64 is intended.
    spectrogram = librosa.feature.melspectrogram(
        y=audio, sr=sr, n_fft=2048, hop_length=512, win_length=2048,
        n_mels=64, power=1, center=False,
    )
    # Magnitude -> decibels. The 1e-6 offset keeps log10 finite for silent
    # bins, which puts a floor of -120 dB on the result.
    spectrogram = 20 * np.log10(spectrogram + 1e-6)

    file_name = os.path.splitext(audio_file)[0]
    save_path = os.path.join(spectrogram_dir, f'{file_name}.png')

    fig = plt.figure(figsize=(8, 4))
    try:
        librosa.display.specshow(spectrogram, sr=sr, x_axis='time', y_axis='mel')
        plt.tight_layout()
        plt.axis('off')
        plt.savefig(save_path, bbox_inches='tight', pad_inches=0)
    finally:
        # Always release the figure, even when rendering or saving fails, so
        # a long run does not accumulate open figures.
        plt.close(fig)

    # matplotlib writes an RGBA PNG: drop the alpha channel, then stretch the
    # pixel values so they span the full 0-255 range.
    with Image.open(save_path) as spec_image:
        rgb_array = np.array(spec_image.convert('RGB'))
    pixel_min = rgb_array.min()
    pixel_max = rgb_array.max()
    if pixel_max > pixel_min:
        rgb_array_norm = 255 * ((rgb_array - pixel_min) / (pixel_max - pixel_min))
    else:
        # A constant image has nothing to stretch; dividing by zero here would
        # fill the saved picture with NaN-derived garbage.
        rgb_array_norm = rgb_array
    rgb_image_spec = Image.fromarray(rgb_array_norm.astype(np.uint8))
    rgb_image_spec.save(save_path)

The data sample 54199C05_200.wav is processed: 1/355
The data sample 543CF1B9_270.wav is processed: 2/355
The data sample 5438CF85_280.wav is processed: 3/355
The data sample 54179031_100.wav is processed: 4/355
The data sample 5438A8DA_130.wav is processed: 5/355
The data sample 5428E165_200.wav is processed: 6/355
The data sample 5415764D_100.wav is processed: 7/355
The data sample 5423A5F1_200.wav is processed: 8/355
The data sample 5415764D_150.wav is processed: 9/355
The data sample 54498285_150.wav is processed: 10/355
The data sample 5432CF32_100.wav is processed: 11/355
The data sample 543FFAAD_280.wav is processed: 12/355
The data sample 542E00B9_0.wav is processed: 13/355
The data sample 54178CAE_200.wav is processed: 14/355
The data sample 54199C05_60.wav is processed: 15/355
The data sample 542455B9_250.wav is processed: 16/355
The data sample 543FFAAD_170.wav is processed: 17/355
The data sample 543CF1B9_200.wav is processed: 18/355
The data sample 543FFAAD_50.wav is proce

### Mel-spectrogram (dB) for Validation (HW)

In [7]:
# Render every audio file in `audio_dir` as a dB-scaled mel spectrogram image
# and save it as a PNG in `spectrogram_dir`.
audio_dir = r'/content/drive/MyDrive/project_without_artifact/Validation_Audio_Files/HW'
spectrogram_dir = r'/content/drive/MyDrive/project_without_artifact/Validation_MS_dB/HW'

os.makedirs(spectrogram_dir, exist_ok=True)

# List the folder once up front: the total does not change while the loop
# runs, so there is no reason to rescan the directory for every file.
audio_files = os.listdir(audio_dir)
total_files = len(audio_files)

for count, audio_file in enumerate(audio_files, start=1):
    print(f"The data sample {audio_file} is processed: {count}/{total_files}")
    audio_path = os.path.join(audio_dir, audio_file)
    audio, sr = librosa.load(audio_path, sr=6000)  # resampled to 6 kHz on load

    # Magnitude (power=1) mel spectrogram with 64 mel bands.
    # NOTE(review): librosa's own default is n_mels=128; the reference setup
    # may have relied on that default - TODO confirm that 64 is intended.
    spectrogram = librosa.feature.melspectrogram(
        y=audio, sr=sr, n_fft=2048, hop_length=512, win_length=2048,
        n_mels=64, power=1, center=False,
    )
    # Magnitude -> decibels. The 1e-6 offset keeps log10 finite for silent
    # bins, which puts a floor of -120 dB on the result.
    spectrogram = 20 * np.log10(spectrogram + 1e-6)

    file_name = os.path.splitext(audio_file)[0]
    save_path = os.path.join(spectrogram_dir, f'{file_name}.png')

    fig = plt.figure(figsize=(8, 4))
    try:
        librosa.display.specshow(spectrogram, sr=sr, x_axis='time', y_axis='mel')
        plt.tight_layout()
        plt.axis('off')
        plt.savefig(save_path, bbox_inches='tight', pad_inches=0)
    finally:
        # Always release the figure, even when rendering or saving fails, so
        # a long run does not accumulate open figures.
        plt.close(fig)

    # matplotlib writes an RGBA PNG: drop the alpha channel, then stretch the
    # pixel values so they span the full 0-255 range.
    with Image.open(save_path) as spec_image:
        rgb_array = np.array(spec_image.convert('RGB'))
    pixel_min = rgb_array.min()
    pixel_max = rgb_array.max()
    if pixel_max > pixel_min:
        rgb_array_norm = 255 * ((rgb_array - pixel_min) / (pixel_max - pixel_min))
    else:
        # A constant image has nothing to stretch; dividing by zero here would
        # fill the saved picture with NaN-derived garbage.
        rgb_array_norm = rgb_array
    rgb_image_spec = Image.fromarray(rgb_array_norm.astype(np.uint8))
    rgb_image_spec.save(save_path)

The data sample 54498285_200.wav is processed: 1/100
The data sample 54336D64_0.wav is processed: 2/100
The data sample 54363E1A_120.wav is processed: 3/100
The data sample 542D8CB6_160.wav is processed: 4/100
The data sample 542C5AD9_190.wav is processed: 5/100
The data sample 541EF71D_50.wav is processed: 6/100
The data sample 542D8CB6_250.wav is processed: 7/100
The data sample 542D8CB6_0.wav is processed: 8/100
The data sample 541EF71D_140.wav is processed: 9/100
The data sample 542C5AD9_0.wav is processed: 10/100
The data sample 541F598E_50.wav is processed: 11/100
The data sample 54274D1A_60.wav is processed: 12/100
The data sample 53CF14B9_0.wav is processed: 13/100
The data sample 542C5AD9_150.wav is processed: 14/100
The data sample 542C5AD9_50.wav is processed: 15/100
The data sample 54419D09_100.wav is processed: 16/100
The data sample 542C5AD9_100.wav is processed: 17/100
The data sample 541F598E_200.wav is processed: 18/100
The data sample 541EF71D_170.wav is processed: 19

### MS-PCEN with default for Training (HW)

In [8]:
# Render every audio file in `audio_dir` as an MS-PCEN image computed with
# librosa's default PCEN parameters, and save it as a PNG in `spectrogram_dir`.
audio_dir = r'/content/drive/MyDrive/project_without_artifact/Training_Audio_Files/HW'
spectrogram_dir = r'/content/drive/MyDrive/project_without_artifact/Training_MS_PCEN_default/HW'

os.makedirs(spectrogram_dir, exist_ok=True)

# List the folder once up front: the total does not change while the loop
# runs, so there is no reason to rescan the directory for every file.
audio_files = os.listdir(audio_dir)
total_files = len(audio_files)

for count, audio_file in enumerate(audio_files, start=1):
    print(f"The data sample {audio_file} is processed: {count}/{total_files}")
    audio_path = os.path.join(audio_dir, audio_file)
    audio, sr = librosa.load(audio_path, sr=6000)  # resampled to 6 kHz on load

    # Magnitude (power=1) mel spectrogram with 64 mel bands.
    # NOTE(review): librosa's own default is n_mels=128; the reference setup
    # may have relied on that default - TODO confirm that 64 is intended.
    spectrogram = librosa.feature.melspectrogram(
        y=audio, sr=sr, n_fft=2048, hop_length=512, win_length=2048,
        n_mels=64, power=1, center=False,
    )

    # PCEN is run on two back-to-back copies of the spectrogram and only the
    # second copy is kept - presumably so the PCEN smoother has settled before
    # the frames that are retained. The 2**31 factor follows librosa's advice
    # to rescale floating-point input to the 32-bit integer range.
    # gain/bias/power/time_constant/eps/max_size are librosa's documented
    # defaults, spelled out explicitly.
    spectrogram_doubled = np.concatenate((spectrogram, spectrogram), axis=1)
    mel_pcen = librosa.pcen(
        spectrogram_doubled * (2**31), sr=sr, hop_length=512,
        gain=0.98, bias=2, power=0.5, time_constant=0.4, eps=1e-06, max_size=1,
    )
    mel_pcen = mel_pcen[:, mel_pcen.shape[1] // 2:]

    file_name = os.path.splitext(audio_file)[0]
    save_path = os.path.join(spectrogram_dir, f'{file_name}.png')

    fig = plt.figure(figsize=(8, 4))
    try:
        librosa.display.specshow(mel_pcen, sr=sr, x_axis='time', y_axis='mel')
        plt.tight_layout()
        plt.axis('off')
        plt.savefig(save_path, bbox_inches='tight', pad_inches=0)
    finally:
        # Always release the figure, even when rendering or saving fails, so
        # a long run does not accumulate open figures.
        plt.close(fig)

    # matplotlib writes an RGBA PNG: drop the alpha channel, then stretch the
    # pixel values so they span the full 0-255 range.
    with Image.open(save_path) as spec_image:
        rgb_array = np.array(spec_image.convert('RGB'))
    pixel_min = rgb_array.min()
    pixel_max = rgb_array.max()
    if pixel_max > pixel_min:
        rgb_array_norm = 255 * ((rgb_array - pixel_min) / (pixel_max - pixel_min))
    else:
        # A constant image has nothing to stretch; dividing by zero here would
        # fill the saved picture with NaN-derived garbage.
        rgb_array_norm = rgb_array
    rgb_image_spec = Image.fromarray(rgb_array_norm.astype(np.uint8))
    rgb_image_spec.save(save_path)

The data sample 54199C05_200.wav is processed: 1/355
The data sample 543CF1B9_270.wav is processed: 2/355
The data sample 5438CF85_280.wav is processed: 3/355
The data sample 54179031_100.wav is processed: 4/355
The data sample 5438A8DA_130.wav is processed: 5/355
The data sample 5428E165_200.wav is processed: 6/355
The data sample 5415764D_100.wav is processed: 7/355
The data sample 5423A5F1_200.wav is processed: 8/355
The data sample 5415764D_150.wav is processed: 9/355
The data sample 54498285_150.wav is processed: 10/355
The data sample 5432CF32_100.wav is processed: 11/355
The data sample 543FFAAD_280.wav is processed: 12/355
The data sample 542E00B9_0.wav is processed: 13/355
The data sample 54178CAE_200.wav is processed: 14/355
The data sample 54199C05_60.wav is processed: 15/355
The data sample 542455B9_250.wav is processed: 16/355
The data sample 543FFAAD_170.wav is processed: 17/355
The data sample 543CF1B9_200.wav is processed: 18/355
The data sample 543FFAAD_50.wav is proce

### MS-PCEN with default for Validation (HW)

In [9]:
# Render every audio file in `audio_dir` as an MS-PCEN image computed with
# librosa's default PCEN parameters, and save it as a PNG in `spectrogram_dir`.
audio_dir = r'/content/drive/MyDrive/project_without_artifact/Validation_Audio_Files/HW'
spectrogram_dir = r'/content/drive/MyDrive/project_without_artifact/Validation_MS_PCEN_default/HW'

os.makedirs(spectrogram_dir, exist_ok=True)

# List the folder once up front: the total does not change while the loop
# runs, so there is no reason to rescan the directory for every file.
audio_files = os.listdir(audio_dir)
total_files = len(audio_files)

for count, audio_file in enumerate(audio_files, start=1):
    print(f"The data sample {audio_file} is processed: {count}/{total_files}")
    audio_path = os.path.join(audio_dir, audio_file)
    audio, sr = librosa.load(audio_path, sr=6000)  # resampled to 6 kHz on load

    # Magnitude (power=1) mel spectrogram with 64 mel bands.
    # NOTE(review): librosa's own default is n_mels=128; the reference setup
    # may have relied on that default - TODO confirm that 64 is intended.
    spectrogram = librosa.feature.melspectrogram(
        y=audio, sr=sr, n_fft=2048, hop_length=512, win_length=2048,
        n_mels=64, power=1, center=False,
    )

    # PCEN is run on two back-to-back copies of the spectrogram and only the
    # second copy is kept - presumably so the PCEN smoother has settled before
    # the frames that are retained. The 2**31 factor follows librosa's advice
    # to rescale floating-point input to the 32-bit integer range.
    # gain/bias/power/time_constant/eps/max_size are librosa's documented
    # defaults, spelled out explicitly.
    spectrogram_doubled = np.concatenate((spectrogram, spectrogram), axis=1)
    mel_pcen = librosa.pcen(
        spectrogram_doubled * (2**31), sr=sr, hop_length=512,
        gain=0.98, bias=2, power=0.5, time_constant=0.4, eps=1e-06, max_size=1,
    )
    mel_pcen = mel_pcen[:, mel_pcen.shape[1] // 2:]

    file_name = os.path.splitext(audio_file)[0]
    save_path = os.path.join(spectrogram_dir, f'{file_name}.png')

    fig = plt.figure(figsize=(8, 4))
    try:
        librosa.display.specshow(mel_pcen, sr=sr, x_axis='time', y_axis='mel')
        plt.tight_layout()
        plt.axis('off')
        plt.savefig(save_path, bbox_inches='tight', pad_inches=0)
    finally:
        # Always release the figure, even when rendering or saving fails, so
        # a long run does not accumulate open figures.
        plt.close(fig)

    # matplotlib writes an RGBA PNG: drop the alpha channel, then stretch the
    # pixel values so they span the full 0-255 range.
    with Image.open(save_path) as spec_image:
        rgb_array = np.array(spec_image.convert('RGB'))
    pixel_min = rgb_array.min()
    pixel_max = rgb_array.max()
    if pixel_max > pixel_min:
        rgb_array_norm = 255 * ((rgb_array - pixel_min) / (pixel_max - pixel_min))
    else:
        # A constant image has nothing to stretch; dividing by zero here would
        # fill the saved picture with NaN-derived garbage.
        rgb_array_norm = rgb_array
    rgb_image_spec = Image.fromarray(rgb_array_norm.astype(np.uint8))
    rgb_image_spec.save(save_path)

The data sample 54498285_200.wav is processed: 1/100
The data sample 54336D64_0.wav is processed: 2/100
The data sample 54363E1A_120.wav is processed: 3/100
The data sample 542D8CB6_160.wav is processed: 4/100
The data sample 542C5AD9_190.wav is processed: 5/100
The data sample 541EF71D_50.wav is processed: 6/100
The data sample 542D8CB6_250.wav is processed: 7/100
The data sample 542D8CB6_0.wav is processed: 8/100
The data sample 541EF71D_140.wav is processed: 9/100
The data sample 542C5AD9_0.wav is processed: 10/100
The data sample 541F598E_50.wav is processed: 11/100
The data sample 54274D1A_60.wav is processed: 12/100
The data sample 53CF14B9_0.wav is processed: 13/100
The data sample 542C5AD9_150.wav is processed: 14/100
The data sample 542C5AD9_50.wav is processed: 15/100
The data sample 54419D09_100.wav is processed: 16/100
The data sample 542C5AD9_100.wav is processed: 17/100
The data sample 541F598E_200.wav is processed: 18/100
The data sample 541EF71D_170.wav is processed: 19