### LibriSpeech Dataset

We will make a toy problem from the LibriSpeech (`train-clean-100`) dataset (speaker recognition with 10 speakers). To choose the speakers, we will manually select 10 IDs from the dataset (5 male and 5 female). From each of those 10 IDs, we will select N audio files.

In [5]:
import os
from tqdm import tqdm
from glob import glob

BASE_DATA_DIR = os.path.expanduser('~/voice_data/LibriSpeech')
TRAIN_100_DIR = os.path.join(BASE_DATA_DIR, 'train-clean-100/')

# sum should be maximum of 137
# this is due to the hand-picked speakers we chose
N_TRAIN = 100
N_DEV = 15
N_TEST = 20

In [6]:
train_100_speakers = os.listdir(TRAIN_100_DIR)
print(f'There are {len(train_100_speakers)} speakers in the train-clean-100 partition.')

map_100 = {}

for i, speaker in enumerate(tqdm(train_100_speakers)):

    # we expect unique speakers
    if speaker in map_100:
        raise NotImplementedError()
    
    speaker_data_dir = os.path.join(TRAIN_100_DIR, speaker)
    map_100[speaker] = len(glob(f'{speaker_data_dir}/**/*.flac', recursive=True))

print(map_100)

# which speakers have the most files?
map_100_sorted = dict(sorted(map_100.items(), key=lambda x: x[1], reverse=True))
print(map_100_sorted)

There are 251 speakers in the train-clean-100 partition.


  0%|          | 0/251 [00:00<?, ?it/s]

100%|██████████| 251/251 [00:00<00:00, 1382.72it/s]

{'1578': 118, '311': 122, '1926': 133, '6272': 115, '5808': 120, '26': 118, '1502': 136, '2007': 123, '2384': 90, '6415': 116, '1355': 98, '8051': 116, '6019': 112, '696': 108, '2989': 155, '8014': 76, '60': 97, '78': 118, '4441': 124, '5514': 87, '4813': 111, '1624': 101, '4297': 111, '8747': 110, '3486': 120, '6563': 92, '8063': 155, '8425': 114, '7367': 119, '7800': 115, '83': 123, '196': 108, '3699': 117, '458': 82, '6147': 117, '7078': 115, '4898': 110, '7148': 115, '332': 89, '2764': 120, '8468': 118, '7447': 114, '374': 113, '1743': 103, '4406': 130, '3879': 117, '2893': 107, '5750': 122, '7635': 122, '5390': 116, '211': 166, '6529': 107, '87': 108, '27': 138, '7505': 115, '7794': 106, '4859': 93, '1867': 137, '5393': 125, '3235': 113, '19': 111, '6880': 116, '6818': 107, '669': 118, '887': 115, '1040': 81, '6000': 88, '1363': 98, '831': 124, '103': 102, '3259': 126, '1723': 112, '4014': 165, '5322': 113, '40': 114, '1098': 92, '289': 86, '7312': 26, '3440': 113, '4788': 107, '7




In [7]:
select_IDs = ['211', '730', '2989', '4195', '125', '4014', '8063', '27', '1867', '118']

train_data = []
dev_data = []
test_data = []

for i, speaker in enumerate(select_IDs):
    speaker_data_dir = os.path.join(TRAIN_100_DIR, speaker)
    all_files = glob(f'{speaker_data_dir}/**/*.flac', recursive=True)

    train_data += [(i, fpath) for fpath in all_files[0:N_TRAIN]]
    dev_data += [(i, fpath) for fpath in all_files[N_TRAIN:(N_TRAIN + N_DEV)]]
    test_data += [(i, fpath) for fpath in all_files[(N_TRAIN + N_DEV):(N_TRAIN + N_DEV + N_TEST)]]

len(test_data), train_data[-3:]

(200,
 [(9,
   '/home/cameron/voice_data/LibriSpeech/train-clean-100/118/47824/118-47824-0063.flac'),
  (9,
   '/home/cameron/voice_data/LibriSpeech/train-clean-100/118/47824/118-47824-0055.flac'),
  (9,
   '/home/cameron/voice_data/LibriSpeech/train-clean-100/118/47824/118-47824-0058.flac')])

In [8]:
def write_data_to_file(save_file_path, data_list):
    with open(save_file_path, 'w') as f:
        for label, fpath in data_list:
            f.write(f'{label} {fpath}\n')

#write_data_to_file('../data_lists/librispeech_dev.txt', dev_data)