In [15]:
import os
import shutil
import pickle as pkl

ROOT = os.getcwd()

output_dir = os.path.join(ROOT, "output")

# One (episode_path, showname, episode) tuple per episode folder found at
# output/<showname>/<episode>/.
# The show-level isdir() test must run BEFORE the inner os.listdir(): otherwise a
# stray file directly under output/ (e.g. a macOS ".DS_Store") makes os.listdir()
# raise NotADirectoryError. Episode entries are likewise restricted to directories
# because the file scan that consumes this list calls os.listdir() on each of them.
output_dirs = [
    (os.path.join(output_dir, show, episode), show, episode)
    for show in os.listdir(output_dir)
    if os.path.isdir(os.path.join(output_dir, show))
    for episode in os.listdir(os.path.join(output_dir, show))
    if ".DS_Store" not in episode
    if os.path.isdir(os.path.join(output_dir, show, episode))
]

# Collect (full_path, filename, showname, episode) for every regular file whose
# name contains ".txt" (and not ".DS_Store") inside each discovered folder.
output_files = [
    (candidate, name, showname, episode)
    for folder, showname, episode in output_dirs
    for name, candidate in (
        (entry, os.path.join(folder, entry)) for entry in os.listdir(folder)
    )
    if os.path.isfile(candidate) and ".txt" in name and ".DS_Store" not in name
]

# Sorted transcript paths per show; each output_files entry is
# (path, filename, showname, episode), so index 0 is the path and index 2 the show.
rotl = sorted(entry[0] for entry in output_files if entry[2] == "rotl")

roadwork = sorted(entry[0] for entry in output_files if entry[2] == "roadwork")


def getMissing(list):
    """Print every number from 1 to the highest one present that is missing from ``list``.

    Numbers are compared in their zero-padded, three-character form ("001",
    "002", ...), so ``list`` is expected to hold strings in that form.

    Args:
        list: Iterable of number strings. Entries that cannot be converted with
            ``int()`` are ignored when determining the upper bound.

    Returns:
        None. Each missing number is printed on its own line; nothing is printed
        when ``list`` is empty or holds no numeric entry.
    """
    # Materialise once so generators work (the builtin ``list`` is shadowed by the parameter).
    entries = tuple(list)

    numbers = []
    for entry in entries:
        try:
            numbers.append(int(entry))
        except (TypeError, ValueError):
            continue

    if not numbers:
        return

    # Use the numeric maximum: a lexicographic sort would rank "999" above "1001".
    highest = max(numbers)
    # Set lookup keeps the scan linear instead of re-walking the sequence per number.
    present = set(entries)

    for num in range(1, highest + 1):
        padded = str(num).rjust(3, "0")
        if padded not in present:
            print(padded)


# NOTE(review): ROOT is already assigned from os.getcwd() earlier in this cell;
# this second assignment looks redundant — confirm before removing.
ROOT = os.getcwd()
# Folder under the working directory that copyTranscripts() writes into.
transcripts_dir = os.path.join(ROOT, "transcripts")


def copyTranscripts():
    """Copy every entry of ``output_files`` into ``transcripts_dir/<showname>/``.

    Reads the module-level ``output_files`` list of
    ``(path, filename, showname, episode)`` tuples and the module-level
    ``transcripts_dir``. The ``roadwork`` and ``rotl`` folders are always
    created; a folder for any other show name is created on demand.

    Returns:
        None.
    """
    # Keep the two known show folders present even when no file targets them.
    for show in ("roadwork", "rotl"):
        os.makedirs(os.path.join(transcripts_dir, show), exist_ok=True)

    for path, filename, showname, _episode in output_files:
        target_dir = os.path.join(transcripts_dir, showname)
        # Without this, a show other than the two above makes shutil.copy fail
        # because its destination folder does not exist.
        os.makedirs(target_dir, exist_ok=True)
        # NOTE(review): the episode is not part of the destination, so two episodes
        # of one show holding the same filename overwrite each other — confirm that
        # filenames are unique per show.
        shutil.copy(path, os.path.join(target_dir, filename))


def getSentences(path):
    """Read a transcript file and return its lines.

    The file is decoded as UTF-8 with an optional byte-order mark
    ("utf-8-sig"). Each pair of consecutive newlines is collapsed to a single
    newline before the text is split into lines.

    Args:
        path: Path of the transcript file to read.

    Returns:
        list[str]: The lines of the transcript without line terminators. Lines
        are returned unchanged, so a leading speaker prefix such as
        "Speaker 1: " stays in place.
    """
    # The context manager closes the handle even if reading or decoding fails.
    with open(path, encoding="utf-8-sig") as handle:
        text = handle.read()

    # To drop the speaker prefix instead, split each line on ": " and keep the remainder.
    return text.replace("\n\n", "\n").splitlines()


# Load the first roadwork transcript (the path list is sorted) and show its last line.
first_roadwork_transcript = roadwork[0]
sentences = getSentences(first_roadwork_transcript)

sentences[-1]

'Speaker 1: Yeah, thank you for your submission. All right, great. Thank you.'

In [219]:
# Inputs live under <cwd>/sample/speaker_outputs; the embeddings sit one level deeper.
segments_dir = os.path.join(ROOT, "sample", "speaker_outputs")
embeddings_dir = os.path.join(segments_dir, "embeddings")

# Full paths of the regular files directly inside embeddings_dir, in os.listdir order
# (anything that is not a regular file is skipped).
embeddings_files = list(
    filter(
        os.path.isfile,
        (os.path.join(embeddings_dir, name) for name in os.listdir(embeddings_dir)),
    )
)

def _load_mono_embeddings(path):
    """Unpickle one embeddings file and return the object stored under its "mono_file" key."""
    # NOTE(review): unpickling can execute arbitrary code; only point this at files
    # produced by a trusted pipeline.
    # The context manager closes the handle as soon as the pickle has been read.
    with open(path, "rb") as handle:
        return pkl.load(handle)["mono_file"]


# Each file is opened and deserialised exactly once, in embeddings_files order.
nested_embeddings = [_load_mono_embeddings(file) for file in embeddings_files]

# Length of the first dimension of every loaded object (presumably the number of
# embedding vectors in that file — confirm against the producer).
sizes = [tensors.size()[0] for tensors in nested_embeddings]

# Flatten one level: iterating a loaded object yields its entries one by one.
embeddings = [tensor for tensors in nested_embeddings for tensor in tensors]

embeddings[0]

tensor([-1.1826e-02, -7.9285e-02,  2.1631e-01, -5.4321e-02,  9.2285e-02,
         3.1319e-03, -1.8835e-01,  4.5013e-02, -1.5332e-01,  5.6274e-02,
        -3.7598e-02,  1.2585e-01,  7.9407e-02, -1.3321e-02, -1.1670e-01,
         2.7649e-02, -1.7792e-02, -2.1069e-01,  1.3763e-02, -3.3936e-02,
        -1.6998e-02, -8.0627e-02,  2.6733e-02,  1.1200e-01, -6.3293e-02,
        -1.3599e-01,  1.3281e-01, -1.1908e-01,  5.6488e-02, -8.1787e-02,
         1.0852e-01, -5.6976e-02,  4.8676e-02, -5.0934e-02, -1.9318e-02,
        -2.8412e-02,  1.0175e-01,  2.4866e-01,  7.1838e-02,  1.2932e-02,
         4.6349e-03, -8.7708e-02,  7.0251e-02, -1.3477e-01, -3.2593e-02,
        -4.1779e-02,  8.6975e-03,  1.5076e-01,  1.0651e-01,  6.9656e-03,
         1.6431e-01, -2.0996e-02, -4.5654e-02, -2.1667e-02,  2.2473e-01,
        -9.9411e-03,  3.1113e-02,  7.9956e-03, -2.0615e-02,  6.4392e-02,
        -1.3123e-01, -9.4116e-02,  7.5928e-02,  1.3220e-01,  1.7532e-02,
         4.6883e-03, -1.7960e-02,  4.6570e-02, -1.0

In [18]:
import os
import json

ROOT = os.getcwd()
segments_dir = os.path.join(ROOT, "sample", "speaker_outputs")

# Full paths of the regular files in segments_dir whose name contains ".json".
segments_files = [
    candidate
    for candidate in (
        os.path.join(segments_dir, name)
        for name in os.listdir(segments_dir)
        if ".json" in name
    )
    if os.path.isfile(candidate)
]

def _read_json_lines(path):
    """Parse a file holding one JSON document per line and return the parsed objects."""
    # The context manager closes the handle; UTF-8 is stated explicitly so the
    # result does not depend on the platform's default encoding.
    with open(path, encoding="utf-8") as handle:
        # Blank lines are skipped because json.loads("") raises.
        return [
            json.loads(line) for line in handle.read().splitlines() if line.strip()
        ]


# All records from every segments file, concatenated in segments_files order.
segments = [record for file in segments_files for record in _read_json_lines(file)]

segments[0]

{'audio_filepath': '/home/ansel/dev/diarization-pipeline/temp_outputs/mono_file.wav',
 'offset': 6.54,
 'duration': 0.13999999999999968,
 'label': 'UNK',
 'uniq_id': None}

In [10]:
labels_file = os.path.join(segments_dir, "subsegments_scale4_cluster.label")

# Each line is split once on single spaces; fields 1 and 2 are parsed as floats and
# field 3 is kept as text (presumably start time, end time and speaker label —
# confirm against the file's producer). Field 0 is ignored.
# The context manager closes the file once it has been read.
with open(labels_file) as label_handle:
    labels = [
        (float(fields[1]), float(fields[2]), fields[3])
        for fields in (line.split(" ") for line in label_handle.read().splitlines())
    ]

labels[0]

(6.539999961853027, 6.679999828338623, 'speaker_1')