In [1]:
import os
import glob
from zipfile import ZipFile
from string import ascii_uppercase
import pydub
import pandas as pd
import tqdm

In [2]:
def clean(annotation):
    """Normalise a transcript to the character set used by the TIL data.

    Apostrophes are dropped (the TIL data does not include them), the text is
    upper-cased (the Wav2Vec2 tokenizer requires upper case), and every
    character other than ASCII ``A``-``Z`` and the space is removed.

    Args:
        annotation: Raw transcript text.

    Returns:
        The cleaned transcript. Whitespace is not collapsed or stripped, so
        removing punctuation may leave leading, trailing or doubled spaces.
    """
    allowed = set(ascii_uppercase + ' ')
    upper_text = annotation.replace("'", '').upper()
    # Filter in a single pass instead of rebuilding the whole string with
    # str.replace once per character.
    return ''.join(ch for ch in upper_text if ch in allowed)

In [5]:
def IMDA_part12_to_TIL(PART: int, max_duration_s: float = 10.0):
    """Convert IMDA part 1/2 scripts and zipped WAVs into TIL-style audio and records.

    Every ``*.TXT`` script under ``IMDA/PART{PART}/DATA/CHANNEL0/SCRIPT`` is read
    as pairs of lines: the first tab-separated field of the first line is the
    utterance id and the second line is the transcript. The matching WAV is read
    from the speaker's zip under ``.../WAVE``, exported to ``IMDA_TIL/audio`` and
    recorded in the module-level ``dataset`` list as
    ``{'path': ..., 'annotation': ...}``.

    Utterances are skipped when the audio is longer than ``max_duration_s`` or
    when the cleaned transcript is empty.

    Args:
        PART: IMDA part number; selects the input directory.
        max_duration_s: Maximum utterance length in seconds (default 10).

    Side effects:
        Creates ``IMDA_TIL/audio`` if needed, writes WAV files into it and
        appends to the global ``dataset`` list, which must exist before the call.

    Raises:
        FileNotFoundError: If a speaker zip is missing.
        KeyError: If an utterance listed in a script is absent from the zip.
    """
    part_path = f'IMDA/PART{PART}/DATA/CHANNEL0'
    # IMDA names all part 1 files with a leading 0, so the digit used in
    # speaker names differs from the part number for part 1.
    part_digit = 0 if PART == 1 else PART
    wave_path = os.path.join(part_path, 'WAVE')
    script_path = os.path.join(part_path, 'SCRIPT')

    audio_dir = 'IMDA_TIL/audio'
    os.makedirs(audio_dir, exist_ok=True)  # export fails if the folder is missing

    # len(AudioSegment) is a duration in milliseconds, not a sample count.
    max_duration_ms = max_duration_s * 1000

    script_files = glob.glob(os.path.join(script_path, '*.TXT'))

    for script in tqdm.tqdm(script_files):
        stem = os.path.basename(script).split('.')[0]
        session = stem[-1]
        # Take the (up to) three characters before the session digit so that
        # multi-digit speakers resolve to their own zip; reading a single
        # character mapped e.g. speaker 12 onto speaker 2.
        # NOTE(review): assumes the stem ends with <3-digit speaker><session> - confirm.
        speaker = stem[-4:-1]
        speaker_prefix = f'SPEAKER{part_digit}{speaker.zfill(3)}'
        zip_file = speaker_prefix + '.zip'
        wav_root = os.path.join(speaker_prefix, f'SESSION{session}')

        # utf-8-sig decodes the leading BOM away regardless of the platform's
        # default encoding.
        with open(script, 'r', encoding='utf-8-sig') as f:
            lines = f.readlines()

        processed_lines = []
        # Pair up (id line, transcript line); an unpaired trailing line is
        # ignored instead of raising IndexError.
        for id_line, text_line in zip(lines[0::2], lines[1::2]):
            merged = id_line.split('\t')[0] + ' ' + text_line.replace('\t', ' ').strip()
            # remove any stray \ufeff, which appears at the start of the txt
            processed_lines.append(merged.replace(chr(0xfeff), '').strip())

        with ZipFile(os.path.join(wave_path, zip_file), 'r') as archive:
            for line in processed_lines:
                utt_id, _, raw_annotation = line.partition(' ')
                wav_fname = utt_id + '.WAV'
                # zip member names always use forward slashes
                data = archive.read(os.path.join(wav_root, wav_fname).replace('\\', '/'))
                audio = pydub.AudioSegment(data)
                if len(audio) > max_duration_ms:  # ignore audio longer than the limit
                    continue
                annotation = clean(raw_annotation)
                if len(annotation) == 0:  # ignore all audio with empty annotation
                    continue
                target_path = os.path.join(audio_dir, wav_fname).replace('\\', '/')
                audio.export(target_path, format='wav')
                dataset.append({'path': target_path, 'annotation': annotation})

In [6]:
# Module-level accumulator: IMDA_part12_to_TIL appends one
# {'path', 'annotation'} record per exported utterance to this list.
dataset = list()

# Only part 1 is converted. Part 2 is full of SG names and is not considered
# English, so its call stays disabled:
# IMDA_part12_to_TIL(2)
IMDA_part12_to_TIL(PART=1)

100%|██████████| 4/4 [00:01<00:00,  3.32it/s]
100%|██████████| 4/4 [00:01<00:00,  2.17it/s]


In [7]:
# Build the manifest from the collected records. The columns are pinned so the
# CSV still has its 'path,annotation' header when no utterance passed the filters
# (an empty list would otherwise produce a header-less file).
df = pd.DataFrame(dataset, columns=['path', 'annotation'])
df.to_csv('IMDA_TIL/IMDA_TIL.csv', index=False)