In [12]:
from open_whisper import audio
import multiprocessing
from tqdm import tqdm
import os
import numpy

In [13]:
audio_files = []
for root, dirs, files in os.walk("data/audio"):
    if "segments" in root:
        audio_files.extend([os.path.join(root, f) for f in os.listdir(root)])

In [14]:
len(audio_files)

619382

In [15]:
audio_files[:10]

['data/audio/-Spgba1gDg4/segments/00:01:48.060_00:01:53.910.m4a',
 'data/audio/-Spgba1gDg4/segments/00:00:06.220_00:00:29.750.m4a',
 'data/audio/-Spgba1gDg4/segments/00:01:19.330_00:01:48.060.m4a',
 'data/audio/-Spgba1gDg4/segments/00:00:59.750_00:01:19.330.m4a',
 'data/audio/-Spgba1gDg4/segments/00:00:29.750_00:00:59.750.m4a',
 'data/audio/9OYNyr8enD4/segments/00:04:05.250_00:04:18.950.m4a',
 'data/audio/9OYNyr8enD4/segments/00:00:09.050_00:00:38.640.m4a',
 'data/audio/9OYNyr8enD4/segments/00:01:01.739_00:01:29.880.m4a',
 'data/audio/9OYNyr8enD4/segments/00:03:13.140_00:03:42.239.m4a',
 'data/audio/9OYNyr8enD4/segments/00:02:53.730_00:03:17.190.m4a']

In [16]:
def get_mel(audio_file):
    audio_arr = audio.load_audio(audio_file, sr=16000)
    audio_arr = audio.pad_or_trim(audio_arr)
    mel_spec = audio.log_mel_spectrogram(audio_arr)
    return mel_spec

In [17]:
mel_spec = get_mel(audio_files[0])
mel_spec

tensor([[ 0.1160,  0.1530,  0.4855,  ..., -0.9567, -0.9567, -0.9567],
        [ 0.6158,  0.7153,  0.5615,  ..., -0.9567, -0.9567, -0.9567],
        [ 0.4551,  0.8930,  0.9121,  ..., -0.9567, -0.9567, -0.9567],
        ...,
        [-0.9567, -0.6724, -0.5740,  ..., -0.9567, -0.9567, -0.9567],
        [-0.9451, -0.6921, -0.6158,  ..., -0.9567, -0.9567, -0.9567],
        [-0.9567, -0.8314, -0.6260,  ..., -0.9567, -0.9567, -0.9567]])

In [20]:
mel_arr = mel_spec.numpy()
numpy.save("data/mel_spec.npy", mel_arr)

In [21]:
def get_mel(audio_file):
    audio_arr = audio.load_audio(audio_file, sr=16000)
    audio_arr = audio.pad_or_trim(audio_arr)
    mel_spec = audio.log_mel_spectrogram(audio_arr)
    return (audio_file, mel_spec)

In [24]:
with multiprocessing.Pool() as pool:
    file_mel = list(
        tqdm(
            pool.imap_unordered(get_mel, audio_files[:10]), total=len(audio_files[:10])
        )
    )

  0%|          | 0/10 [01:08<?, ?it/s]


KeyboardInterrupt: 

In [25]:
def get_mel(audio_file):
    audio_arr = audio.load_audio(audio_file, sr=16000)
    audio_arr = audio.pad_or_trim(audio_arr)
    mel_spec = audio.log_mel_spectrogram(audio_arr)
    return mel_spec

In [26]:
with multiprocessing.Pool() as pool:
    file_mel = list(
        tqdm(
            pool.imap_unordered(get_mel, audio_files[:10]), total=len(audio_files[:10])
        )
    )

  0%|          | 0/10 [00:14<?, ?it/s]


KeyboardInterrupt: 

In [27]:
def get_mel(audio_file):
    audio_arr = audio.load_audio(audio_file, sr=16000)
    audio_arr = audio.pad_or_trim(audio_arr)
    return audio_arr

In [28]:
with multiprocessing.Pool() as pool:
    file_mel = list(
        tqdm(
            pool.imap_unordered(get_mel, audio_files[:10]), total=len(audio_files[:10])
        )
    )

100%|██████████| 10/10 [00:00<00:00, 63.60it/s]


In [30]:
import torch

DEVICE = torch.device("cuda:0")

In [31]:
def get_mel(audio_file):
    audio_arr = audio.load_audio(audio_file, sr=16000)
    audio_arr = audio.pad_or_trim(audio_arr)
    mel_spec = audio.log_mel_spectrogram(audio_arr, device=DEVICE)
    return (audio_file, mel_spec)

In [32]:
with multiprocessing.Pool() as pool:
    file_mel = list(
        tqdm(
            pool.imap_unordered(get_mel, audio_files[:10]), total=len(audio_files[:10])
        )
    )

100%|██████████| 10/10 [00:01<00:00,  6.50it/s]


In [36]:
file_mel

[('data/audio/9OYNyr8enD4/segments/00:00:09.050_00:00:38.640.m4a',
  tensor([[-0.4770, -0.4770, -0.4770,  ..., -0.4770, -0.4770, -0.4770],
          [-0.4770, -0.4770, -0.4770,  ..., -0.4770, -0.4770, -0.4770],
          [-0.4770, -0.4770, -0.4770,  ..., -0.4770, -0.4770, -0.4770],
          ...,
          [-0.4770, -0.4770, -0.4770,  ..., -0.4770, -0.4770, -0.4770],
          [-0.4770, -0.4770, -0.4770,  ..., -0.4770, -0.4770, -0.4770],
          [-0.4770, -0.4770, -0.4770,  ..., -0.4770, -0.4770, -0.4770]],
         device='cuda:0')),
 ('data/audio/9OYNyr8enD4/segments/00:04:05.250_00:04:18.950.m4a',
  tensor([[ 0.3252,  0.2922,  0.2629,  ..., -0.5691, -0.5691, -0.5691],
          [ 0.5234,  0.7157,  0.7435,  ..., -0.5691, -0.5691, -0.5691],
          [ 0.6741,  0.7174,  0.9868,  ..., -0.5691, -0.5691, -0.5691],
          ...,
          [-0.2623, -0.0566,  0.0435,  ..., -0.5691, -0.5691, -0.5691],
          [-0.3610, -0.1951, -0.0284,  ..., -0.5691, -0.5691, -0.5691],
          [-0.5

In [34]:
def get_mel(audio_file):
    audio_arr = audio.load_audio(audio_file, sr=16000)
    audio_arr = audio.pad_or_trim(audio_arr)
    mel_spec = audio.log_mel_spectrogram(audio_arr, device=torch.device("cpu"))
    return (audio_file, mel_spec)

In [35]:
with multiprocessing.Pool() as pool:
    file_mel = list(
        tqdm(
            pool.imap_unordered(get_mel, audio_files[:10]), total=len(audio_files[:10])
        )
    )

  0%|          | 0/10 [00:05<?, ?it/s]


KeyboardInterrupt: 

In [37]:
audio_files, mel_specs = zip(*file_mel)

In [38]:
audio_files

('data/audio/9OYNyr8enD4/segments/00:00:09.050_00:00:38.640.m4a',
 'data/audio/9OYNyr8enD4/segments/00:04:05.250_00:04:18.950.m4a',
 'data/audio/9OYNyr8enD4/segments/00:02:53.730_00:03:17.190.m4a',
 'data/audio/-Spgba1gDg4/segments/00:00:29.750_00:00:59.750.m4a',
 'data/audio/-Spgba1gDg4/segments/00:01:19.330_00:01:48.060.m4a',
 'data/audio/-Spgba1gDg4/segments/00:00:06.220_00:00:29.750.m4a',
 'data/audio/-Spgba1gDg4/segments/00:01:48.060_00:01:53.910.m4a',
 'data/audio/9OYNyr8enD4/segments/00:03:13.140_00:03:42.239.m4a',
 'data/audio/9OYNyr8enD4/segments/00:01:01.739_00:01:29.880.m4a',
 'data/audio/-Spgba1gDg4/segments/00:00:59.750_00:01:19.330.m4a')

In [39]:
mel_specs

(tensor([[-0.4770, -0.4770, -0.4770,  ..., -0.4770, -0.4770, -0.4770],
         [-0.4770, -0.4770, -0.4770,  ..., -0.4770, -0.4770, -0.4770],
         [-0.4770, -0.4770, -0.4770,  ..., -0.4770, -0.4770, -0.4770],
         ...,
         [-0.4770, -0.4770, -0.4770,  ..., -0.4770, -0.4770, -0.4770],
         [-0.4770, -0.4770, -0.4770,  ..., -0.4770, -0.4770, -0.4770],
         [-0.4770, -0.4770, -0.4770,  ..., -0.4770, -0.4770, -0.4770]],
        device='cuda:0'),
 tensor([[ 0.3252,  0.2922,  0.2629,  ..., -0.5691, -0.5691, -0.5691],
         [ 0.5234,  0.7157,  0.7435,  ..., -0.5691, -0.5691, -0.5691],
         [ 0.6741,  0.7174,  0.9868,  ..., -0.5691, -0.5691, -0.5691],
         ...,
         [-0.2623, -0.0566,  0.0435,  ..., -0.5691, -0.5691, -0.5691],
         [-0.3610, -0.1951, -0.0284,  ..., -0.5691, -0.5691, -0.5691],
         [-0.5691, -0.4011, -0.3263,  ..., -0.5691, -0.5691, -0.5691]],
        device='cuda:0'),
 tensor([[ 0.1043, -0.2558,  0.2228,  ..., -0.5524, -0.5524, -0.55

In [40]:
list(mel_specs)

[tensor([[-0.4770, -0.4770, -0.4770,  ..., -0.4770, -0.4770, -0.4770],
         [-0.4770, -0.4770, -0.4770,  ..., -0.4770, -0.4770, -0.4770],
         [-0.4770, -0.4770, -0.4770,  ..., -0.4770, -0.4770, -0.4770],
         ...,
         [-0.4770, -0.4770, -0.4770,  ..., -0.4770, -0.4770, -0.4770],
         [-0.4770, -0.4770, -0.4770,  ..., -0.4770, -0.4770, -0.4770],
         [-0.4770, -0.4770, -0.4770,  ..., -0.4770, -0.4770, -0.4770]],
        device='cuda:0'),
 tensor([[ 0.3252,  0.2922,  0.2629,  ..., -0.5691, -0.5691, -0.5691],
         [ 0.5234,  0.7157,  0.7435,  ..., -0.5691, -0.5691, -0.5691],
         [ 0.6741,  0.7174,  0.9868,  ..., -0.5691, -0.5691, -0.5691],
         ...,
         [-0.2623, -0.0566,  0.0435,  ..., -0.5691, -0.5691, -0.5691],
         [-0.3610, -0.1951, -0.0284,  ..., -0.5691, -0.5691, -0.5691],
         [-0.5691, -0.4011, -0.3263,  ..., -0.5691, -0.5691, -0.5691]],
        device='cuda:0'),
 tensor([[ 0.1043, -0.2558,  0.2228,  ..., -0.5524, -0.5524, -0.55