# Inspect audio files

In [3]:
import os
import numpy as np
import pickle
import json
import random
from tqdm import tqdm
import torchaudio

def print_metadata(metadata, src=None):
    """Print the audio properties stored on ``metadata``, one per line.

    Args:
        metadata: object exposing ``sample_rate``, ``num_channels``,
            ``num_frames``, ``bits_per_sample`` and ``encoding`` attributes.
        src: optional label; when truthy, a "Source:" banner framed by two
            dashed rules is printed before the properties.
    """
    rule = "-" * 10
    if src:
        print(rule)
        print("Source:", src)
        print(rule)
    for field in (
        "sample_rate",
        "num_channels",
        "num_frames",
        "bits_per_sample",
        "encoding",
    ):
        print(f" - {field}:", getattr(metadata, field))
    # Trailing blank line separates consecutive reports.
    print()

def inspect_file(path):
    """Print a short report for the audio file at ``path``.

    The report is a "Source:" banner, the on-disk size in bytes, and the
    metadata that ``torchaudio.info`` returns for the file, rendered by
    ``print_metadata``.
    """
    rule = "-" * 10
    print(rule)
    print("Source:", path)
    print(rule)
    size_in_bytes = os.path.getsize(path)
    print(f" - File size: {size_in_bytes} bytes")
    info = torchaudio.info(path)
    print_metadata(info)


In [13]:
# Report file size and torchaudio metadata for one ESC-50 clip.
inspect_file("/home/yrb/code/speechbrain/data/ESC-50/audio/1-137-A-32.wav")

----------
Source: /home/yrb/code/speechbrain/data/ESC-50/audio/1-137-A-32.wav
----------
 - File size: 160044 bytes
 - sample_rate: 16000
 - num_channels: 1
 - num_frames: 80000
 - bits_per_sample: 16
 - encoding: PCM_S



In [14]:
# Same report for one VoxCeleb1 utterance, for comparison with the ESC-50 clip.
inspect_file("/home/yrb/code/speechbrain/data/voxceleb/vox1/wav/id10001/1zcIwhmdeo4/00001.wav")

----------
Source: /home/yrb/code/speechbrain/data/voxceleb/vox1/wav/id10001/1zcIwhmdeo4/00001.wav
----------
 - File size: 259886 bytes
 - sample_rate: 16000
 - num_channels: 1
 - num_frames: 129921
 - bits_per_sample: 16
 - encoding: PCM_S



# Audio normalization

In [12]:
import pydub
import os


def normalize_audio(path, save_path):
    """
    Re-encode a WAV file and apply pydub's ``normalize()`` to it.

    The audio is converted to 2-byte samples, a single channel and a frame
    rate of 16000 before normalization, then exported as WAV.

    Args:
        path: WAV file to read.
        save_path: destination of the processed WAV. May be the same as
            ``path``; the source is loaded by ``from_wav`` before ``export``
            opens the destination.
    """
    audio = pydub.AudioSegment.from_wav(path)
    audio = audio.set_sample_width(2).set_channels(1).set_frame_rate(16000)
    normalized_audio = audio.normalize()
    # export() hands back the output file object still open. Close it here so
    # a long batch does not rely on garbage collection to release descriptors.
    normalized_audio.export(save_path, format="wav").close()

def find_all_files_with_extension(path, extension):
    """
    Recursively collect the files under ``path`` whose name ends with ``extension``.

    Returns a list of paths, each built by joining the directory reported by
    ``os.walk`` with the file name, in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(folder, name)
        for folder, _, names in os.walk(path)
        for name in names
        if name.endswith(extension)
    ]

def normalize_all_audio(path):
    """
    Normalize every ``.wav`` file found under ``path``, overwriting each in place.

    Files are discovered with ``find_all_files_with_extension`` and each one is
    handed to ``normalize_audio`` as both input and output, with a tqdm
    progress bar over the file list.
    """
    wav_paths = find_all_files_with_extension(path, ".wav")
    progress = tqdm(wav_paths)
    for wav_path in progress:
        normalize_audio(wav_path, wav_path)

# Normalize the ESC-50 clips. normalize_all_audio() writes each result back to
# the path it was read from, so the files under db_path are replaced in place.
print("Normalizing db...")
db_path = "/home/yrb/code/speechbrain/data/ESC-50/audio"
normalize_all_audio(db_path)



Normalizing db...


100%|██████████| 2000/2000 [00:04<00:00, 450.20it/s]


# Break down each large speakerID.npy into one utterance per .npy

In [15]:
import os
import numpy as np
import pickle
import json
import random
from tqdm import tqdm

def break_into_single_npy(db, db_single):
    """Split every ``<name>.npy`` array in ``db`` into one file per first-axis slice.

    Slice ``i`` of ``<name>.npy`` is written to ``db_single/<name>_<i>.npy``
    with its leading axis kept at length 1, so the remaining dimensions are
    unchanged.

    Args:
        db: directory holding the stacked ``.npy`` arrays; entries that do not
            end in ``.npy`` are ignored.
        db_single: output directory; created (with parents) if it does not exist.
    """
    # np.save does not create missing directories, so make sure the target exists.
    os.makedirs(db_single, exist_ok=True)

    npy_names = [name for name in os.listdir(db) if name.endswith(".npy")]
    for npy_name in tqdm(npy_names):
        # File stem (name without ".npy") prefixes every output file.
        spkr_id = npy_name[:-4]
        data = np.load(os.path.join(db, npy_name))
        for utter_i in range(data.shape[0]):
            # Slicing instead of indexing keeps the leading axis (length 1)
            # without hard-coding the number of remaining dimensions.
            cur_utter = data[utter_i:utter_i + 1]
            np.save(os.path.join(db_single, f"{spkr_id}_{utter_i}.npy"), cur_utter)

In [17]:
# The commented-out db/db_single pairs below are the VoxCeleb directories
# (vox1_test, vox1, vox2); only the ESC-50 pair at the bottom is active.
# db = "/home/yrb/code/speechbrain/data/voxceleb/vox1_test/spmel"
# db_single = "/home/yrb/code/speechbrain/data/voxceleb/vox1_test/spmel_single"

# break_into_single_npy(db, db_single)

# db = "/home/yrb/code/speechbrain/data/voxceleb/vox1/spmel"
# db_single = "/home/yrb/code/speechbrain/data/voxceleb/vox1/spmel_single"

# break_into_single_npy(db, db_single)

# db = "/home/yrb/code/speechbrain/data/voxceleb/vox2/spmel"
# db_single = "/home/yrb/code/speechbrain/data/voxceleb/vox2/spmel_single"

# break_into_single_npy(db, db_single)

db = "/home/yrb/code/speechbrain/data/ESC-50/spmel"
db_single = "/home/yrb/code/speechbrain/data/ESC-50/spmel_single"

break_into_single_npy(db, db_single)

100%|██████████| 2000/2000 [00:00<00:00, 4150.25it/s]


# Generate Speaker list

In [1]:
# Directories whose .npy file names (minus the extension) become the keys of
# the speaker-to-index mappings written at the end of this cell.
vox2_db = "/home/yrb/code/speechbrain/data/voxceleb/vox2/spmel"
vox1_db = "/home/yrb/code/speechbrain/data/voxceleb/vox1/spmel"
vox1test_db = "/home/yrb/code/speechbrain/data/voxceleb/vox1_test/spmel"

import os
import json

def get_speaker_list(db):
    """Map each ``.npy`` file stem in ``db`` to its rank in sorted filename order.

    Returns:
        dict: ``{stem: index}`` where ``stem`` is the file name without its
        trailing ``.npy`` and indices count up from 0 over the sorted
        ``.npy`` entries only (other directory entries are skipped).
    """
    npy_names = (name for name in sorted(os.listdir(db)) if name.endswith(".npy"))
    return {name[:-4]: index for index, name in enumerate(npy_names)}

def save_as_json(db, json_path):
    """Write the speaker-to-index mapping for ``db`` to ``json_path`` as JSON.

    The mapping comes from ``get_speaker_list(db)``; ``json_path`` is opened in
    text write mode, so an existing file at that path is replaced.
    """
    mapping = get_speaker_list(db)
    with open(json_path, "w") as out_file:
        out_file.write(json.dumps(mapping))

# Write each mapping to spkr2id.json one level above its spmel directory
# (the "../" component is left in the path and resolved when the file is opened).
save_as_json(vox2_db, os.path.join(vox2_db, "../spkr2id.json"))
save_as_json(vox1_db, os.path.join(vox1_db, "../spkr2id.json"))
save_as_json(vox1test_db, os.path.join(vox1test_db, "../spkr2id.json"))

# Noise Generation

In [28]:
import os
import numpy as np
import pickle
import json
import random
from tqdm import tqdm

# NOTE: vox1_db / vox2_db are bound here to the *spmel_single* directories; the
# "Generate Speaker list" cell binds the same names to the *spmel* directories,
# so which one is in effect depends on the cell that ran last.
vox1_db = "/home/yrb/code/speechbrain/data/voxceleb/vox1/spmel_single"
vox2_db = "/home/yrb/code/speechbrain/data/voxceleb/vox2/spmel_single"
# Chance, in percent, that a file is given a replacement label by the
# generation cells below. The same number is reused as the RNG seed.
noise_level = 75
random.seed(noise_level)
np.random.seed(noise_level)

## Permute

In [7]:
# Files to (possibly) relabel. Only .npy entries are kept because the key
# below is the file name minus its 4-character extension.
npy_list = sorted(name for name in os.listdir(vox2_db) if name.endswith(".npy"))

# Candidate labels: the keys of spkr2id.json. The file is read inside a
# with-block so its handle is closed as soon as the JSON is parsed.
with open(os.path.join(vox2_db, "../spkr2id.json"), "r") as spkr_file:
    spkrID_list = sorted(json.load(spkr_file).keys())

# Permute noise: with probability noise_level/100 a file is mapped to a label
# drawn uniformly from all of spkrID_list (no entry is excluded from the draw).
mislabel_dict = {}
for f in tqdm(npy_list):
    if random.random() <= noise_level/100:
        key = f[:-4]
        mislabel_dict[key] = random.choice(spkrID_list)

100%|██████████| 2918286/2918286 [00:01<00:00, 2085779.61it/s]


In [6]:
# Save the Permute mapping as JSON. mislabel_dict is whatever the most recently
# executed generation cell left in the kernel, so run this directly after the
# Permute cell. open(..., "w") does not create the target directory.
with open(f"/home/yrb/code/speechbrain/data/jsons/Permute/voxceleb2_{noise_level}%_mislabel.json", "w") as f:
    json.dump(mislabel_dict, f)

## Open

In [18]:
# Files to (possibly) relabel. Only .npy entries are kept because the key
# below is the file name minus its 4-character extension.
npy_list = sorted(name for name in os.listdir(vox2_db) if name.endswith(".npy"))

# Keys of spkr2id.json; loaded for parity with the other noise cells but not
# used by this cell's loop. Read inside a with-block so the handle is closed.
with open(os.path.join(vox2_db, "../spkr2id.json"), "r") as spkr_file:
    spkrID_list = sorted(json.load(spkr_file).keys())

# Replacement pool: the .npy file names found in vox1_db.
ood_list = sorted(name for name in os.listdir(vox1_db) if name.endswith(".npy"))

# Open noise: with probability noise_level/100 a file is mapped to a file name
# drawn uniformly from ood_list. The stored value is the name as listed,
# extension included, whereas the key has its extension stripped.
mislabel_dict = {}
for f in tqdm(npy_list):
    if random.random() <= noise_level/100:
        key = f[:-4]
        mislabel_dict[key] = random.choice(ood_list)

100%|██████████| 2918286/2918286 [00:03<00:00, 772566.28it/s]


In [19]:
# Save the Open mapping as JSON. mislabel_dict is whatever the most recently
# executed generation cell left in the kernel, so run this directly after the
# Open cell. open(..., "w") does not create the target directory.
with open(f"/home/yrb/code/speechbrain/data/jsons/Open/voxceleb2_{noise_level}%_mislabel.json", "w") as f:
    json.dump(mislabel_dict, f)

## Mix

In [29]:
# Files to (possibly) relabel. Only .npy entries are kept because the key
# below is the file name minus its 4-character extension.
npy_list = sorted(name for name in os.listdir(vox2_db) if name.endswith(".npy"))

# First replacement pool: the keys of spkr2id.json. Read inside a with-block
# so the file handle is closed as soon as the JSON is parsed.
with open(os.path.join(vox2_db, "../spkr2id.json"), "r") as spkr_file:
    spkrID_list = sorted(json.load(spkr_file).keys())

# Second replacement pool: the .npy file names found in vox1_db.
ood_list = sorted(name for name in os.listdir(vox1_db) if name.endswith(".npy"))

# Mix noise: with probability noise_level/100 a file gets a replacement, and a
# second draw against 0.5 decides which pool the replacement comes from.
mislabel_dict = {}
for f in tqdm(npy_list):
    if random.random() <= noise_level/100:
        key = f[:-4]
        if random.random() <= 0.5:
            # Label drawn uniformly from spkrID_list.
            mislabel_dict[key] = random.choice(spkrID_list)
        else:
            # File name (extension included) drawn uniformly from ood_list.
            mislabel_dict[key] = random.choice(ood_list)

100%|██████████| 2918286/2918286 [00:03<00:00, 772987.97it/s]


In [30]:
# Save the Mix mapping as JSON. mislabel_dict is whatever the most recently
# executed generation cell left in the kernel, so run this directly after the
# Mix cell. open(..., "w") does not create the target directory.
with open(f"/home/yrb/code/speechbrain/data/jsons/Mix/voxceleb2_{noise_level}%_mislabel.json", "w") as f:
    json.dump(mislabel_dict, f)