# Kaldi-style data directory creation

A Kaldi-style data directory is required to use the dataset for ESPnet2 fine-tuning.

<u>Directory structure:</u>

```
data/
  train/
    - text     # The transcriptions
    - wav.scp  # Wave file paths
    - utt2spk  # A file mapping each utterance-id to its speaker-id
    - spk2utt  # A file mapping each speaker-id to its utterance-ids
    - segments # [Optional] Start and end time of each utterance
  dev/
    ...
  test/
    ...
```

In [2]:
# Imports
from pathlib import Path
from tqdm import tqdm
from more_itertools import ilen
from collections import defaultdict
import math
import os
from os.path import join

### Text transcription

```
uttidA <transcription>
uttidB <transcription>
...
```

In [20]:
# Root of the transcript tree. It is walked recursively with os.walk; the name of the directory
# that directly contains a file is used as the speaker id, and the file name up to its first "."
# as the utterance name.
data_dir = Path("/mnt/U/Datasets/lrs3pretrain/processed/text/")
# Destination of the Kaldi `text` file (one "<utterance-id> <transcription>" line per utterance).
text_transcription_path = Path("/mnt/U/Datasets/lrs3pretrain/processed/data_dir/text")

In [23]:
# Build the contents of the Kaldi `text` file: one "<speaker>_<utt> <transcription>" line
# for every transcript file found under `data_dir`.
lines = []
with tqdm(desc="Generating uttid -> transcription file") as pbar:
    for dirpath, _, files in os.walk(data_dir):
        # The directory that directly contains the transcript files names the speaker.
        speaker_name = os.path.basename(dirpath)
        for file in files:
            # Utterance name = file name up to the first ".".
            utt_name = file.split(".")[0]
            with open(join(dirpath, file), 'r') as f:
                # Only the first line of the file is used. readline() keeps the trailing
                # newline, so strip it; otherwise every entry is followed by a blank line
                # once the "\n" below is appended.
                transcription = f.readline().strip()
            lines.append(f"{speaker_name}_{utt_name} {transcription}\n")
        pbar.update(len(files))

# os.walk yields entries in arbitrary order, but Kaldi requires data-directory files to be
# sorted by their first field (the utterance-id).
lines.sort(key=lambda entry: entry.split(" ", 1)[0])

Generating uttid -> transcription file: 267225it [00:20, 13060.08it/s]


In [22]:
# Make sure the output data directory exists. exist_ok=True makes this safe to re-run and
# avoids the separate exists() check (which can race with another process creating it).
text_transcription_path.parent.mkdir(parents=True, exist_ok=True)

# Write the previously generated `lines` (each already newline-terminated) to the `text` file.
with open(text_transcription_path, 'w') as f:
    f.writelines(lines)

### wav.scp generation

```
uttidA /path/to/uttidA.wav
uttidB /path/to/uttidB.wav
...
```

In [27]:
# Root of the audio tree. It is walked recursively with os.walk; the name of the directory that
# directly contains a file is used as the speaker id, and the file name up to its first "."
# as the utterance name.
wav_dir = Path("/mnt/U/Datasets/lrs3pretrain/processed/audio_segments/pretrain")
# Destination of the Kaldi `wav.scp` file (one "<utterance-id> <audio path>" line per utterance).
wav_scp_path = Path("/mnt/U/Datasets/lrs3pretrain/processed/data_dir/wav.scp")

In [29]:
# Build the contents of the Kaldi `wav.scp` file: one "<speaker>_<utt> <audio path>" line
# for every file found under `wav_dir`.
lines = []
with tqdm(desc="Generating uttid -> wav path file") as pbar:
    for dirpath, _, files in os.walk(wav_dir):
        # The directory that directly contains the audio files names the speaker.
        speaker_name = os.path.basename(dirpath)
        for file in files:
            # Utterance name = file name up to the first ".".
            utt_name = file.split(".")[0]
            lines.append(f"{speaker_name}_{utt_name} {join(dirpath, file)}\n")
        pbar.update(len(files))

# os.walk yields entries in arbitrary order, but Kaldi requires data-directory files to be
# sorted by their first field (the utterance-id).
lines.sort(key=lambda entry: entry.split(" ", 1)[0])

Generating uttid -> wav path file: 260174it [00:00, 308806.20it/s]


In [28]:
# Persist the generated wav.scp entries; every element of `lines` already ends with "\n",
# so they are concatenated and written as-is.
with open(wav_scp_path, 'w') as scp_file:
    scp_file.write("".join(lines))

### utt2spk generation

```
uttidA speakerA
uttidB speakerB
uttidC speakerA
uttidD speakerB
...
```

In [32]:
# Destination of the Kaldi `utt2spk` file (one "<utterance-id> <speaker-id>" line per utterance).
utt2spk_path = Path("/mnt/U/Datasets/lrs3pretrain/processed/data_dir/utt2spk")

In [36]:
# Build the contents of the Kaldi `utt2spk` file: one "<speaker>_<utt> <speaker>" line
# for every file found under `data_dir`.
# NOTE(review): utterance ids are derived from the transcript tree (`data_dir`), not the audio
# tree. The recorded cell outputs report different file counts for the two walks
# (267225 vs 260174), so confirm that the resulting id set matches wav.scp.
lines = []
with tqdm(desc="Generating utt2spk file") as pbar:
    for dirpath, _, files in os.walk(data_dir):
        # The directory that directly contains the files names the speaker.
        speaker_name = os.path.basename(dirpath)
        for file in files:
            # Utterance name = file name up to the first ".".
            utt_name = file.split(".")[0]
            lines.append(f"{speaker_name}_{utt_name} {speaker_name}\n")
        pbar.update(len(files))

# os.walk yields entries in arbitrary order, but Kaldi requires data-directory files to be
# sorted by their first field (the utterance-id).
lines.sort(key=lambda entry: entry.split(" ", 1)[0])

Generating utt2spk file: 267225it [00:00, 405316.48it/s]


In [34]:
# Persist the generated utt2spk entries; every element of `lines` already ends with "\n",
# so they are concatenated and written as-is.
with open(utt2spk_path, 'w') as utt2spk_file:
    utt2spk_file.write("".join(lines))

### spk2utt generation

```
speakerA uttidA uttidC ...
speakerB uttidB uttidD ...
...
```

In [35]:
# Destination of the Kaldi `spk2utt` file (one "<speaker-id> <utterance-id> ..." line per speaker).
# This must be a different file from utt2spk: if both paths name the same file, writing the
# spk2utt entries overwrites the utt2spk mapping and no spk2utt file is produced.
spk2utt_path = Path("/mnt/U/Datasets/lrs3pretrain/processed/data_dir/spk2utt")

In [38]:
# Build the contents of the Kaldi `spk2utt` file: one "<speaker> <utt-id> <utt-id> ..." line
# for every directory under `data_dir` that directly contains files.
lines = []
with tqdm(desc="Generating spk2utt file") as pbar:
    for dirpath, _, files in os.walk(data_dir):
        # Skip directories without files so that no speaker line with an empty utterance
        # list is emitted.
        if files:
            # The directory that directly contains the files names the speaker.
            speaker_name = os.path.basename(dirpath)
            # Utterance id = "<speaker>_<file name up to the first '.'>"; sorted so the
            # per-speaker list is deterministic instead of following directory order.
            utt_ids = sorted(f"{speaker_name}_{file.split('.')[0]}" for file in files)
            lines.append(f"{speaker_name} {' '.join(utt_ids)}\n")
        pbar.update(len(files))

# os.walk yields directories in arbitrary order, but Kaldi requires data-directory files to
# be sorted by their first field (here the speaker-id).
lines.sort(key=lambda entry: entry.split(" ", 1)[0])

Generating spk2utt file: 267225it [00:00, 432371.70it/s]


In [40]:
# Persist the generated spk2utt entries to `spk2utt_path`; every element of `lines` already
# ends with "\n", so they are concatenated and written as-is.
with open(spk2utt_path, 'w') as spk2utt_file:
    spk2utt_file.write("".join(lines))