In [1]:
from dotenv import load_dotenv
import os
import shutil
from semantic_router.encoders import OpenAIEncoder
from semantic_router.splitters import RollingWindowSplitter
from semantic_router.utils.logger import logger
from transcriptTools import getTranscriptFiles, transcript_to_srt, reindex
from diarize import srt_to_transcript

# Base directory for every input/output folder used below ("labeled",
# "truncated", "chunked"): the current working directory when this cell runs.
ROOT = os.getcwd()

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI API key the encoder
# needs — confirm. Must stay before the encoder is constructed.
load_dotenv()

logger.setLevel("WARNING")  # only warnings and above from semantic_router's logger

# Single embedding encoder shared by every RollingWindowSplitter built below.
encoder = OpenAIEncoder(name="text-embedding-3-small")

In [2]:
# Stage 1: read labeled transcripts from ROOT/labeled and write them to
# ROOT/truncated/<show>/<file>. "roadwork" files are copied unchanged; every
# other show has its first semantic chunk removed before being written.
transcriptDir = os.path.join(ROOT, "labeled")
files = getTranscriptFiles(transcriptDir)

truncatedDir = os.path.join(ROOT, "truncated")
os.makedirs(truncatedDir, exist_ok=True)
os.makedirs(os.path.join(truncatedDir, "rotl"), exist_ok=True)
os.makedirs(os.path.join(truncatedDir, "roadwork"), exist_ok=True)


for file in files:
    filepath, showname, filename = file
    outpath = os.path.join(truncatedDir, showname, filename)
    # Only "rotl" and "roadwork" are pre-created above; make sure any other
    # show returned by getTranscriptFiles has a directory to write into.
    os.makedirs(os.path.dirname(outpath), exist_ok=True)

    if showname == "roadwork":
        shutil.copyfile(filepath, outpath)
        continue

    # Best effort per file: a failure is reported and the loop moves on, so
    # one bad transcript does not abort the whole batch.
    try:
        transcript = srt_to_transcript(filepath)
        # Drop entries without any speech, then renumber the remainder.
        spoken = [
            (idx, start, end, speaker, speech)
            for idx, start, end, speaker, speech in transcript
            if speech != ""
        ]
        reindexed = reindex(spoken)
        # The splitter only sees the speech text (no speaker labels here).
        speeches = [speech for idx, start, end, speaker, speech in reindexed]

        # A new splitter is built for each file, as before.
        splitter = RollingWindowSplitter(
            encoder=encoder,
            dynamic_threshold=True,
            min_split_tokens=30,
            max_split_tokens=500,
            window_size=2,
            plot_splits=False,  # set this to true to visualize chunking
            enable_statistics=False,  # to print chunking stats
        )

        splits = splitter(speeches)
        # Number of transcript entries that make up the first chunk.
        first_chunk_length = len(splits[0].docs)
        # Keep everything after the first chunk.
        # NOTE(review): assumes reindex() numbers entries from 1, so that
        # idx > first_chunk_length skips exactly the first chunk — confirm.
        truncated_transcript = [
            (idx, start, end, speaker, speech)
            for idx, start, end, speaker, speech in reindexed
            if int(idx) > first_chunk_length
        ]
        srt = transcript_to_srt(truncated_transcript)
        # Context manager closes the handle even if the write fails.
        with open(outpath, "w") as out_file:
            out_file.write(srt)
    except Exception as error:
        # Report which file failed AND why, instead of the bare filename.
        print(f"{filename}: {error!r}")

In [16]:
# Stage 2: read every transcript under ROOT/truncated, split it into semantic
# chunks of "speaker: speech" lines, and write the chunks (separated by blank
# lines) to ROOT/chunked/<show>/<name>.txt.
truncatedDir = os.path.join(ROOT, "truncated")
chunkedDir = os.path.join(ROOT, "chunked")

os.makedirs(chunkedDir, exist_ok=True)
os.makedirs(os.path.join(chunkedDir, "rotl"), exist_ok=True)
os.makedirs(os.path.join(chunkedDir, "roadwork"), exist_ok=True)

files = getTranscriptFiles(truncatedDir)

for file in files:
    filepath, showname, filename = file

    # Swap only a trailing ".srt" extension for ".txt"; str.replace would also
    # rewrite ".srt" occurring elsewhere in the name. Other names pass through.
    stem, ext = os.path.splitext(filename)
    out_name = stem + ".txt" if ext == ".srt" else filename
    outpath = os.path.join(chunkedDir, showname, out_name)
    # Only "rotl" and "roadwork" are pre-created above; cover any other show.
    os.makedirs(os.path.dirname(outpath), exist_ok=True)

    transcript = srt_to_transcript(filepath)
    content_with_speaker = [
        f"{speaker}: {speech} " for idx, start, end, speaker, speech in transcript
    ]

    # A new splitter is built for each file, as before.
    splitter = RollingWindowSplitter(
        encoder=encoder,
        dynamic_threshold=True,
        min_split_tokens=100,
        max_split_tokens=500,
        window_size=2,
        plot_splits=False,  # set this to true to visualize chunking
        enable_statistics=False,  # to print chunking stats
    )

    if content_with_speaker:
        splits = splitter(content_with_speaker)
    else:
        # A transcript with no entries is written out as an empty file rather
        # than being handed to the splitter with nothing to split.
        print(f"{filename}: empty transcript, writing empty output")
        splits = []

    # Lines within a chunk are newline-joined; chunks are blank-line separated.
    chunks = ["\n".join(split.docs) for split in splits]
    text = "\n\n".join(chunks)
    # Context manager closes the handle even if the write fails.
    with open(outpath, "w") as out_file:
        out_file.write(text)