In [1]:
import os
import glob
from zipfile import ZipFile
from string import ascii_uppercase
import pydub
import pandas as pd
import tqdm
import textgrids

In [2]:
def clean(annotation):
    """Normalise a transcript to the TIL character set.

    Apostrophes are dropped (TIL data does not include them), the text is
    upper-cased (required by the Wav2Vec2 tokenizer), every character other
    than A-Z and the space is discarded, and surrounding whitespace is stripped.
    Runs of inner spaces are left untouched.
    """
    permitted = set(ascii_uppercase + ' ')
    upper_text = annotation.replace("'", '').upper()
    # Single filtering pass: keep only characters from the permitted alphabet.
    kept = [ch for ch in upper_text if ch in permitted]
    return ''.join(kept).strip()

In [3]:
def IMDA_part12_to_TIL(part: int):
    """Convert IMDA part 1/2 scripts and zipped wavs into TIL-style clips.

    Every ``*.TXT`` script under ``IMDA/part{part}/DATA/CHANNEL0/SCRIPT`` is read
    as pairs of lines: the first line of a pair supplies the utterance id (text
    before the first tab), the second supplies the transcript. The wav for each
    utterance is read from the speaker's zip under ``.../WAVE``. Clips whose
    cleaned transcript has at least two words and whose duration is strictly
    between 2 and 10 seconds are exported as wav, and a
    ``{'path', 'annotation'}`` record is appended to the module-level
    ``dataset`` list.

    Args:
        part: IMDA part number; 1 is mapped to the digit 0 used in file names.
    """
    # pydub reports len(AudioSegment) in milliseconds, so the limits are in ms.
    MIN_CLIP_MS = 2 * 1000
    MAX_CLIP_MS = 10 * 1000
    MIN_WORDS = 2

    PART_PATH = f'IMDA/part{part}/DATA/CHANNEL0'
    if part == 1:
        part = 0  # IMDA names all part 1 files with leading 0
    WAVE_PATH = os.path.join(PART_PATH, 'WAVE')
    SCRIPT_PATH = os.path.join(PART_PATH, 'SCRIPT')
    # NOTE(review): `part` was rebound above, so part 1 clips are written to
    # IMDA_TIL/audio_part0. Kept as-is so existing output paths do not change.
    out_dir = f'IMDA_TIL/audio_part{part}'
    os.makedirs(out_dir, exist_ok=True)

    script_files = glob.glob(os.path.join(SCRIPT_PATH, '*.TXT'))

    for script in tqdm.tqdm(script_files):
        stem = os.path.basename(script).split('.')[0]
        # Last character is the session; the three characters before it are the
        # speaker number. Taking a single character truncated speaker IDs above
        # 9; single-digit speakers still yield the same prefix as before.
        speaker = stem[-4:-1]
        session = stem[-1]
        speaker_prefix = f'SPEAKER{part}{speaker.zfill(3)}'
        archive_path = os.path.join(WAVE_PATH, speaker_prefix + '.zip')

        # utf-8-sig drops a leading BOM regardless of the platform's default encoding.
        with open(script, 'r', encoding='utf-8-sig') as f:
            lines = f.readlines()

        utterances = []
        # Pair line 0 with 1, 2 with 3, ...; an unpaired trailing line is ignored
        # instead of raising IndexError.
        for id_line, text_line in zip(lines[0::2], lines[1::2]):
            entry = id_line.split('\t')[0] + ' ' + text_line.replace('\t', ' ').strip()
            entry = entry.replace(chr(0xfeff), '').strip()  # remove any remaining \ufeff
            utt_id, _, raw_text = entry.partition(' ')
            utterances.append((utt_id, clean(raw_text)))

        with ZipFile(archive_path, 'r') as archive:
            for utt_id, annotation in utterances:
                if len(annotation.split()) < MIN_WORDS:
                    continue  # single-word (or empty) transcripts are never exported
                wav_fname = utt_id + '.WAV'
                # Zip member names always use forward slashes.
                data = archive.read(f'{speaker_prefix}/SESSION{session}/{wav_fname}')
                audio = pydub.AudioSegment(data)
                # ignore all audio longer than 10 seconds and shorter than 2 seconds
                if MIN_CLIP_MS < len(audio) < MAX_CLIP_MS:
                    target_path = f'{out_dir}/{wav_fname}'
                    audio.export(target_path, format='wav')
                    dataset.append({'path': target_path, 'annotation': annotation})

In [5]:
# Shared accumulator: the conversion functions append one
# {'path': ..., 'annotation': ...} dict per exported clip.
# Re-running this cell discards everything collected so far.
dataset = list()

In [7]:
# Convert IMDA part 1 only.
IMDA_part12_to_TIL(1)
# Part 2 is deliberately skipped: it is full of SG names, which are not considered English.
# IMDA_part12_to_TIL(2)

In [10]:
def TextGridToTIL(wav_path, script_path, part: int, skip_missing=None):
    """Cut TextGrid-annotated recordings into TIL-style clips.

    For every ``*.TextGrid`` in ``script_path`` the wav with the same base name
    is looked up in ``wav_path``. Intervals of the first tier are kept when
    their text is not ``'<Z>'``, the cleaned text has at least two words and the
    interval lasts strictly between 2 and 10 seconds. Each kept interval is
    exported to ``IMDA_TIL/audio_part{part}/<base_name><index>.wav`` and a
    ``{'path', 'annotation'}`` record is appended to the module-level
    ``dataset`` list.

    Args:
        wav_path: directory searched for ``<base_name>.wav``.
        script_path: directory containing the ``*.TextGrid`` files.
        part: IMDA part number; selects the output folder.
        skip_missing: when true, scripts without a wav in ``wav_path`` are
            skipped silently. Defaults to ``part == 6``, because the part 6
            folder structure means not all audio files can be found within the
            wav folder a single call is given.

    Raises:
        FileNotFoundError: a script has no matching wav and ``skip_missing`` is false.
    """
    MIN_CLIP_SECONDS = 2
    MAX_CLIP_SECONDS = 10
    MIN_WORDS = 2

    if skip_missing is None:
        skip_missing = part == 6
    out_dir = f'IMDA_TIL/audio_part{part}'

    script_files = glob.glob(os.path.join(script_path, '*.TextGrid'))
    for script in tqdm.tqdm(script_files):
        base_name = os.path.basename(script).split('.')[0]
        source_wav = os.path.join(wav_path, base_name + '.wav')

        # Check for the audio first so scripts that will be skipped are never parsed.
        if not os.path.isfile(source_wav):
            if skip_missing:
                continue
            raise FileNotFoundError(f'{source_wav} not found')

        grid = textgrids.TextGrid(script)
        # Only the first tier is used; there is only one, but its name changes.
        first_tier_name = next(iter(grid.keys()), None)
        tier = grid[first_tier_name] if first_tier_name is not None else []

        segments = []
        for interval in tier:
            if interval.text == '<Z>':
                continue
            text = clean(interval.text)
            duration = interval.xmax - interval.xmin
            # ignore all audio longer than 10 seconds and shorter than 2 seconds and single word audio
            if len(text.split()) >= MIN_WORDS and MIN_CLIP_SECONDS < duration < MAX_CLIP_SECONDS:
                segments.append((interval.xmin, interval.xmax, text))
        if not segments:
            continue  # nothing to export, so do not decode the audio at all

        audio = pydub.AudioSegment.from_wav(source_wav)
        os.makedirs(out_dir, exist_ok=True)
        # NOTE(review): the index is appended without a separator, so e.g. base
        # name 'a1' clip 11 and base name 'a11' clip 1 map to the same file name.
        for index, (start_s, end_s, text) in enumerate(segments):
            target_path = f'{out_dir}/{base_name}{index}.wav'
            # pydub slices are expressed in milliseconds.
            audio[int(start_s * 1000):int(end_s * 1000)].export(target_path, format='wav')
            dataset.append({'path': target_path, 'annotation': text})

In [None]:
# Parts 3 and 5 (debate): each call pairs one wav folder with the folder holding its TextGrid scripts.
# Both part 3 calls write into the same IMDA_TIL/audio_part3 output folder.
TextGridToTIL('IMDA/PART3/Audio Separate StandingMic', 'IMDA/PART3/Scripts Separate', 3)
TextGridToTIL('IMDA/PART3/Audio Same CloseMic', 'IMDA/PART3/Scripts Same', 3)
TextGridToTIL('IMDA/PART5/Debate Audio', 'IMDA/PART5/Debate Scripts', 5)

In [6]:
# part 5 finance + emotion is a bit different audio folder structure
def IMDA_part5_to_TIL(wav_path, script_path):
    """Run ``TextGridToTIL`` (as part 5) once per sub-directory of ``wav_path``.

    Args:
        wav_path: directory whose sub-directories hold the wav files
            (presumably one sub-directory per speaker; verify against the data).
        script_path: directory of TextGrid scripts, passed unchanged to every call.
    """
    # os.listdir also returns plain files and gives no ordering guarantee, so
    # keep only directories and sort them to make the run order reproducible.
    speaker_folders = sorted(
        entry for entry in os.listdir(wav_path)
        if os.path.isdir(os.path.join(wav_path, entry))
    )
    # NOTE(review): every call receives the same script_path, and TextGridToTIL
    # raises FileNotFoundError for part 5 when a script has no wav in the folder
    # it was given. With more than one sub-directory this presumably only
    # completes if each one holds the wav for every script; confirm against the
    # real folder layout.
    for speaker_folder in tqdm.tqdm(speaker_folders):
        TextGridToTIL(os.path.join(wav_path, speaker_folder), script_path, 5)

IMDA_part5_to_TIL('IMDA/PART5/Finance + Emotion Audio', 'IMDA/PART5/Finance + Emotion Scripts')

  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  9.60it/s][A
100%|██████████| 1/1 [00:00<00:00,  9.13it/s]


In [11]:
# Part 6 uses a different layout: the audio folders sit two levels below each
# "Audio" directory, and the scripts are taken from a "Scripts/" path built
# from everything that precedes the first "Audio" in the folder path.
wav_folders = glob.glob('IMDA/PART6/Call Centre Design */Audio/*/*/')
for wav_folder in tqdm.tqdm(wav_folders):
    wav_folder = wav_folder.replace('\\', '/')  # normalise Windows separators
    script_folder = wav_folder.partition('Audio')[0] + 'Scripts/'
    TextGridToTIL(wav_folder, script_folder, 6)

  0%|          | 0/2 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<00:00, 104.14it/s]

100%|██████████| 2/2 [00:00<00:00, 50.33it/s]
100%|██████████| 2/2 [00:00<00:00, 29.87it/s]


In [12]:
# Write the collected records to CSV.
# The output folder is otherwise only created when a clip is exported, so make
# sure it exists before writing.
os.makedirs('IMDA_TIL', exist_ok=True)
# Fixed columns keep the header present even when no clip was collected.
df = pd.DataFrame(dataset, columns=['path', 'annotation'])
print(len(df))
df.to_csv('IMDA_TIL/IMDA_TIL.csv', index=False)