In [2]:
import os
# Cap the numeric backends at one thread each. These assignments stay ahead of
# the librosa/soundfile imports so the libraries see them when they load
# (presumably to avoid oversubscription once many worker processes run).
os.environ["OMP_NUM_THREADS"] = "1" # export OMP_NUM_THREADS=1
os.environ["OPENBLAS_NUM_THREADS"] = "1" # export OPENBLAS_NUM_THREADS=1
os.environ["MKL_NUM_THREADS"] = "1" # export MKL_NUM_THREADS=1
os.environ["VECLIB_MAXIMUM_THREADS"] = "1" # export VECLIB_MAXIMUM_THREADS=1
os.environ["NUMEXPR_NUM_THREADS"] = "1" # export NUMEXPR_NUM_THREADS=1
import concurrent
# Importing the bare package does not load its submodules; import
# concurrent.futures explicitly instead of relying on the kernel having done so.
import concurrent.futures
from pathlib import Path

import librosa
import soundfile as sf
from tqdm.notebook import tqdm

In [5]:
def resample(from_path, to_path, sr=24000):
    """Resample one audio file and store it as 16-bit PCM.

    Args:
        from_path: Source audio file, handed to ``librosa.load``. Apart from
            ``sr`` the loader runs with its defaults (per the librosa docs
            that includes a downmix to mono -- confirm if stereo matters).
        to_path: ``pathlib.Path`` of the output file. When it already exists
            the call is a no-op, which lets an interrupted batch be rerun.
        sr: Target sample rate in Hz, used for both loading and writing.
            Defaults to 24000.

    Returns:
        None. Failures are reported on stdout rather than raised, so one bad
        file does not abort the surrounding batch.
    """
    if to_path.exists():
        return
    to_path.parent.mkdir(parents=True, exist_ok=True)
    try:
        wav, _ = librosa.load(from_path, sr=sr)
        sf.write(to_path, wav, sr, subtype='PCM_16')
    except Exception as e:
        # Remove any partially written output; otherwise the exists() guard
        # above would treat the broken file as finished on the next run.
        try:
            to_path.unlink()
        except OSError:
            pass
        # Name the offending input: the bare message cannot be traced back to
        # a file when many files are processed in parallel.
        print(f"{from_path}: {e}")

# Jamendo

In [2]:
# Jamendo locations: inputs are read from `from_dir`, resampled copies are
# written under `to_dir` using the `new_ext` file extension.
base_dir = Path('/home/shahn/Datasets/jamendo')
from_dir = base_dir / 'raw_30s'
to_dir = base_dir / 'raw_30s_24khz'
new_ext = '.wav'

In [4]:
# Queue one resample job per MP3 found under from_dir, mirroring the directory
# layout beneath to_dir, then drain the jobs behind a progress bar.
futures = []
with concurrent.futures.ProcessPoolExecutor(max_workers=64) as executor:
    for src in from_dir.rglob('*.mp3'):
        # Same relative location under to_dir, with the extension swapped.
        dst = to_dir.joinpath(src.relative_to(from_dir)).with_suffix(new_ext)
        futures.append(executor.submit(resample, src, dst))
    total_num = len(futures)
    # as_completed yields jobs as they finish, so the bar tracks real progress.
    for future in tqdm(
        concurrent.futures.as_completed(futures),
        total=total_num,
        dynamic_ncols=True,
    ):
        future.result()

  0%|                                                               | 0/55701 [00:00<?, ?it/s]

55701


# DNS4

In [6]:
# DNS-Challenge 4: each listed subset of the fullband tree is mirrored into a
# parallel tree under to_base_dir, one resample job per WAV file.
base_dir = Path('/home/shahn/Datasets/DNS-Challenge4/datasets_fullband')
to_base_dir = Path('/home/shahn/Datasets/DNS-Challenge4/datasets_24khz')
new_ext = '.wav'

futures = []
with concurrent.futures.ProcessPoolExecutor(max_workers=32) as executor:
    for path in ("clean_fullband", "noise_fullband", "dev_testset_fullband"):
        from_dir = base_dir / path
        to_dir = to_base_dir / path
        for src in from_dir.rglob('*.wav'):
            dst = to_dir.joinpath(src.relative_to(from_dir)).with_suffix(new_ext)
            futures.append(executor.submit(resample, src, dst))
            # Lightweight submission counter, refreshed every 1000 files.
            if len(futures) % 1000 == 0:
                print(f"\r{len(futures)}", end="", flush=True)
    total_num = len(futures)
    # Drain the jobs in completion order behind a progress bar.
    for future in tqdm(
        concurrent.futures.as_completed(futures),
        total=total_num,
        dynamic_ncols=True,
    ):
        future.result()

890000Input signal length=0 is too small to resample from 48000->24000
891000Input signal length=0 is too small to resample from 48000->24000
Input signal length=0 is too small to resample from 48000->24000Input signal length=0 is too small to resample from 48000->24000

901000Input signal length=0 is too small to resample from 48000->24000
Input signal length=0 is too small to resample from 48000->24000
1173000

  0%|                                                               | 0/1173372 [00:00<?, ?it/s]

# VCTK

In [10]:
# VCTK: every FLAC under wav48_silence_trimmed gets a resampled WAV at the
# same relative location under wav24_silence_trimmed.
base_dir = Path('/home/shahn/Datasets/VCTK-0.92')
from_dir = base_dir / 'wav48_silence_trimmed'
to_dir = base_dir / 'wav24_silence_trimmed'
new_ext = '.wav'

futures = []
with concurrent.futures.ProcessPoolExecutor(max_workers=32) as executor:
    for src in from_dir.rglob('*.flac'):
        dst = to_dir.joinpath(src.relative_to(from_dir)).with_suffix(new_ext)
        futures.append(executor.submit(resample, src, dst))
        # Lightweight submission counter, refreshed every 5000 files.
        if len(futures) % 5000 == 0:
            print(f"\r{len(futures)}", end="", flush=True)
    total_num = len(futures)
    # Drain the jobs in completion order behind a progress bar.
    for future in tqdm(
        concurrent.futures.as_completed(futures),
        total=total_num,
        dynamic_ncols=True,
    ):
        future.result()

85000

  0%|                                                                 | 0/88328 [00:00<?, ?it/s]