In [1]:
from datasets import load_dataset, Dataset
from collections import defaultdict

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset("edinburghcstr/ami", "ihm")
ds = dataset["train"]

In [3]:
# Group utterances by meeting
meeting_groups = defaultdict(list)
for row in ds:
    meeting_groups[row["meeting_id"]].append(row)

In [4]:
def create_sequences(
    utterances, max_speakers=2, max_utts_per_speaker=3, max_duration=30.0
):
    sequences = []
    current_seq = []
    speaker_counts = defaultdict(int)
    duration = 0.0

    for utt in utterances:
        speaker = utt["speaker_id"]
        dur = utt["end_time"] - utt["begin_time"]

        if (
            (
                speaker not in speaker_counts.keys()
                or speaker_counts[speaker] < max_utts_per_speaker
            )
            and len(speaker_counts) < max_speakers
            and duration + dur <= max_duration
        ):
            current_seq.append(utt)
            speaker_counts[speaker] += 1
            duration += dur
            continue

        # If we get here, current utt did not fit - store current sequence
        if len(current_seq) > 0:
            sequences.append(current_seq)
        # Start a new sequence with the one that didn't fit
        current_seq = [utt]
        speaker_counts = defaultdict(int)
        speaker_counts[speaker] = 1
        duration = dur

    # Store the last sequence if it has any utterances
    if len(current_seq) > 0:
        sequences.append(current_seq)

    return sequences

In [5]:
all_sequences = []
for meeting_id, utterances in meeting_groups.items():
    utterances.sort(key=lambda x: x["begin_time"])
    sequences = create_sequences(utterances)
    all_sequences.extend(sequences)

In [6]:
for i in all_sequences[1]:
    print(f"Speaker: {i['speaker_id']}, Text: {i['text']}, Start: {i['begin_time']}, End: {i['end_time']}")
    # print(f"sampling rate: {i["audio"]["sampling_rate"]}")

Speaker: MEO069, Text: GOSH, Start: 5.599999904632568, End: 6.010000228881836
Speaker: MEO069, Text: 'KAY, Start: 10.479999542236328, End: 10.880000114440918
Speaker: MEE068, Text: DOES ANYONE WANT TO SEE UH STEVE'S FEEDBACK FROM THE SPECIFICATION, Start: 11.09000015258789, End: 15.529999732971191


In [7]:
import numpy as np

In [8]:
def build_transcript(seq):
    speaker_map = {}
    current_index = 1

    for utt in seq:
        spk = utt["speaker_id"]
        if spk not in speaker_map:
            speaker_map[spk] = f"[SPK{current_index}]"
            current_index += 1

    lines = [f"{speaker_map[utt['speaker_id']]} {utt['text']}" for utt in seq]
    return " ".join(lines)

In [9]:
combined_sequences = []
for i, seq in enumerate(all_sequences):
    audio = []
    sr = seq[0]["audio"]["sampling_rate"] # same for all utterances
    for utt in seq:
        audio.append(utt["audio"]["array"])
    audio = np.concatenate(audio)
    text = build_transcript(seq)
    combined_sequences.append({ "audio": { "array": audio, "sampling_rate": sr }, "text": text })
    # print(audio)
    # print(text)
    # if i == 3:
    #     break

In [10]:
print(combined_sequences[0:3])

[{'audio': {'array': array([0.00012207, 0.00015259, 0.00015259, ..., 0.00033569, 0.00030518,
       0.00030518], shape=(14560,)), 'sampling_rate': 16000}, 'text': "[SPK1] 'KAY [SPK2] OKAY"}, {'audio': {'array': array([-0.00109863, -0.00152588, -0.0012207 , ...,  0.0005188 ,
        0.00057983,  0.00061035], shape=(84000,)), 'sampling_rate': 16000}, 'text': "[SPK1] GOSH [SPK1] 'KAY [SPK2] DOES ANYONE WANT TO SEE UH STEVE'S FEEDBACK FROM THE SPECIFICATION"}, {'audio': {'array': array([-0.00021362, -0.00015259,  0.00012207, ..., -0.0010376 ,
       -0.00140381, -0.00167847], shape=(61120,)), 'sampling_rate': 16000}, 'text': '[SPK1] IS THERE MUCH MORE IN IT THAN HE D [SPK2] I I DRY READ IT THE LAST TIME'}]


In [11]:
import soundfile as sf
import os
import json

In [12]:

output_dir = "data/audio_sequences"
os.makedirs(output_dir, exist_ok=True)

manifest = []

for i, sample in enumerate(combined_sequences):
    audio_array = sample["audio"]["array"]
    sampling_rate = sample["audio"]["sampling_rate"]
    text = sample["text"]

    audio_path = os.path.join(output_dir, f"seq_{i}.wav")
    
    # Save as 16-bit PCM WAV
    sf.write(audio_path, audio_array, samplerate=sampling_rate, subtype='PCM_16')

    # Add to manifest
    manifest.append({
        "audio_path": audio_path,
        "text": text
    })

# Save manifest
with open("data/manifest.jsonl", "w") as f:
    for entry in manifest:
        f.write(json.dumps(entry) + "\n")