# LRS3 Split audio and transcriptions into sections of 3.5 to 8.5 seconds

The idea is to read the transcription of each utterance, which lists the start and end time of every word.

Using this information, we will split the audio files and their transcriptions into utterances of more uniform length.

We will cut segments of roughly 5 seconds by always splitting right after the first word whose end time crosses the n\*5 seconds mark.
If the last segment would be 3.5 seconds long or shorter, it is merged into the previous segment.

## Imports

In [None]:
from pathlib import Path
from tqdm import tqdm
from more_itertools import ilen
from collections import defaultdict
import math
from multiprocessing import Pool

In [None]:
# Dataset locations used by the cells below.
# data_dir: one sub-directory per speaker; also holds the per-utterance .txt transcripts.
data_dir = Path("/mnt/U/Datasets/lrs3pretrain/raw/pretrain")
# audio_data_dir: per-speaker directories with the .wav files that get split.
audio_data_dir = Path("/mnt/U/Datasets/lrs3pretrain/processed/audio/pretrain/")
# text_dir: destination of the per-segment transcript files.
text_dir = Path("/mnt/U/Datasets/lrs3pretrain/processed/text/pretrain/")
# segments_dir: destination of the per-segment audio files.
segments_dir = Path("/mnt/U/Datasets/lrs3pretrain/processed/audio_segments/pretrain/")

In [None]:
# Map every speaker name to a pair of parallel lists:
# (wav files of that speaker, transcript path expected for each wav).
speakers = list(data_dir.glob("*"))
utt_per_spk = {}
for speaker in speakers:
    wavs = list((audio_data_dir / speaker.name).glob("*.wav"))
    # The transcript carries the same stem as the wav but lives in the raw data tree.
    txts = [data_dir / speaker.name / f"{wav.stem}.txt" for wav in wavs]
    utt_per_spk[speaker.name] = (wavs, txts)

## Example transcription file:
    Text:  TO SHOW IT TO YOU NOT BECAUSE I WANT TO GIVE YOU THE KIND OF STARBUCKS TOUR OF HISTORIC ENGLAND BUT 
    Conf:  2

    WORD START END ASDSCORE
    TO 0.09 0.15 12.9
    SHOW 0.15 0.37 12.7
    IT 0.37 0.47 8.2
    TO 0.47 0.53 6.6
    YOU 0.53 0.61 6.6
    NOT 0.61 0.80 7.6
    BECAUSE 0.80 1.01 7.1
    I 1.01 1.09 6.3
    WANT 1.09 1.24 6.1
    TO 1.24 1.30 6.3
    GIVE 1.30 1.40 6.0
    YOU 1.40 1.49 4.6
    THE 1.49 1.59 3.7
    KIND 1.59 1.89 3.9
    OF 2.14 2.40 8.8
    STARBUCKS 2.40 3.04 8.4
    TOUR 3.04 3.45 6.5
    OF 3.45 3.66 4.2
    HISTORIC 3.84 4.24 7.3
    ENGLAND 4.24 4.73 6.6
    BUT 4.88 5.37 6.2

In [None]:
def get_word_times(text_path):
    """Read the per-word timings from a transcription file.

    The first four lines of the file are skipped as header (in the example
    file these are the ``Text:`` line, the ``Conf:`` line, an empty line and
    the ``WORD START END ASDSCORE`` column titles). Every following non-blank
    line must hold four whitespace-separated fields.

    Args:
        text_path: Path of the transcription file to read.

    Returns:
        A list of ``(word, start, end)`` tuples in file order. ``start`` and
        ``end`` are returned as the strings found in the file.

    Raises:
        ValueError: If a non-blank data line does not have exactly four fields.
    """
    with open(text_path, "r") as f:
        lines = [line.strip() for line in f]

    out = []
    for word_data in lines[4:]:
        # Blank lines (e.g. a trailing newline at the end of the file) carry
        # no word and must not break the four-field unpacking below.
        if not word_data:
            continue
        # split() without an argument tolerates repeated spaces and tabs.
        word, start, end, _ = word_data.split()
        out.append((word, start, end))
    return out

def get_segments(word_times, min_len=3.5, seg_len=5.0):
    """Group timed words into segments of roughly ``seg_len`` seconds.

    Words are consumed in order. The n-th segment (n = 1, 2, ...) is closed
    right after the first word whose end time is greater than ``n * seg_len``;
    the marks are absolute positions in the utterance, not relative to the
    start of the segment. Words left over once no further mark can be crossed
    form a final segment. If that final segment lasts ``min_len`` seconds or
    less and is not the only one, it is merged into the segment before it.

    Args:
        word_times: Sequence of ``(word, start, end)`` entries; ``start`` and
            ``end`` may be numbers or numeric strings.
        min_len: Longest duration (seconds) at which the trailing segment is
            still merged into its predecessor.
        seg_len: Spacing (seconds) of the split marks.

    Returns:
        A list of ``(text, start, end)`` tuples where ``text`` is the
        space-joined words and ``start``/``end`` are floats. An empty input
        yields an empty list.
    """
    if not word_times:
        return []

    n_words = len(word_times)
    total_time = float(word_times[-1][-1])
    segs = []
    next_idx = 0  # index of the first word not yet placed in a closed segment

    for mark_no in range(1, math.ceil(total_time / seg_len) + 1):
        if next_idx >= n_words:
            break
        boundary = mark_no * seg_len
        # Take the start from the first word explicitly; a truthiness test
        # would mistake a numeric start of 0.0 for "not set yet".
        seg_start = float(word_times[next_idx][1])
        seg_words = []
        for idx in range(next_idx, n_words):
            word, _, end = word_times[idx]
            end_time = float(end)
            seg_words.append(word)
            if end_time > boundary:
                segs.append((seg_words, seg_start, end_time))
                next_idx = idx + 1
                break
        else:
            # Ran out of words before crossing the mark: the remainder becomes
            # the trailing segment. next_idx is deliberately left untouched so
            # the merge check below can tell this case apart.
            segs.append((seg_words, seg_start, end_time))

    if len(segs) > 1:
        last_words, last_start, last_end = segs[-1]
        if (last_end - last_start <= min_len) and (next_idx != n_words):
            segs.pop()
            prev_words, prev_start, _ = segs[-1]
            segs[-1] = (prev_words + last_words, prev_start, last_end)

    return [(" ".join(words), start, end) for words, start, end in segs]

In [None]:
# Flatten the per-speaker mapping into one list of (speaker, wav, transcript) triples.
spk_utts = [
    (spk, utt, text)
    for spk, (utts, texts) in utt_per_spk.items()
    for utt, text in zip(utts, texts)
]

In [None]:
# Plan the cuts: for every utterance compute its segments, write the segment
# transcript and remember (source wav, target wav, start, end) for the cutter.
segments = []
for spk, utt, text in tqdm(spk_utts):
    pieces = get_segments(get_word_times(text))
    for idx, (txt, start, end) in enumerate(pieces):
        aud_path = segments_dir / spk / f"{utt.stem}_{idx}.wav"
        if aud_path.exists():
            # Segment audio is already present: neither text nor audio is redone.
            continue
        txt_path = text_dir / spk / f"{text.stem}_{idx}.txt"
        for out_dir in (aud_path.parent, txt_path.parent):
            if not out_dir.exists():
                out_dir.mkdir(parents=True, exist_ok=True)
        if not txt_path.exists():
            txt_path.write_text(txt)
        segments.append((utt, aud_path, start, end))

In [None]:
def to_time(t):
    """Format a duration in seconds as an ``HH:MM:SS.mmm`` timestamp.

    The value is rounded to the nearest millisecond first and then split into
    fields, so a fraction such as 0.9996 carries into the seconds instead of
    being printed as a four-digit ``.1000`` suffix, and durations of an hour
    or more carry into the hour field instead of overflowing the minutes.

    Args:
        t: Non-negative duration in seconds.

    Returns:
        The timestamp string, e.g. ``"00:01:05.500"`` for ``65.5``.
    """
    total_ms = int(round(t * 1000))
    total_seconds, millis = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{millis:03d}"

def cut_segment(data):
    """Cut one time range out of an audio file by calling ffmpeg.

    Args:
        data: ``(utt, aud_path, start, end)`` tuple holding the source file,
            the destination file and the start/end offsets in seconds.

    The stream is copied without re-encoding (``-codec copy``) and ffmpeg's
    own logging is silenced. The exit status is not checked, so a failed cut
    does not raise.
    """
    # Local import keeps the function usable without touching the import cell.
    import subprocess

    utt, aud_path, start, end = data
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact.
    command = [
        "ffmpeg",
        "-i", str(utt),
        "-ss", to_time(start),
        "-to", to_time(end),
        "-codec", "copy",
        str(aud_path),
        "-loglevel", "quiet",
    ]
    subprocess.run(command, check=False)

In [None]:
# Cut all planned segments in parallel. The context manager shuts the worker
# processes down once every segment has been processed, instead of leaving
# the pool open.
with Pool(32) as p:
    for _ in tqdm(p.imap_unordered(cut_segment, segments), total=len(segments)):
        pass