In [1]:
from metadata_utils import get_meta
from llm_utils import ask_llm
from transcript_utils import srt_to_text
from chunking_utils import chunker, token_counter
from chonkie import Chunk

In [2]:
from pathlib import Path

# Collect the subtitle transcripts. Matching on the real suffix (instead of a
# substring of the name) skips files such as "x.srt.bak", and sorting makes the
# processing order independent of the arbitrary order iterdir() yields.
files = sorted(
    file for file in Path("../files/rotl").iterdir() if file.suffix == ".srt"
)

# Directory that receives one "<subject>.txt" answer per who_is() call.
results_dir = Path("summaries_test")
results_dir.mkdir(exist_ok=True)

# Directory that receives the transcript chunks used as LLM context.
chunks_dir = Path("chunks")
chunks_dir.mkdir(exist_ok=True)


def who_is(subject, exclude=()):
    """Ask the LLM who ``subject`` is, using transcript chunks that mention them.

    Each transcript in the module-level ``files`` list is converted to text
    with ``srt_to_text``. A transcript is used only if it contains ``subject``
    and contains none of the strings in ``exclude``; the exclusion is applied
    to the whole transcript, not to individual chunks. Qualifying transcripts
    are split with ``chunker`` and every chunk containing ``subject`` is kept.

    Side effects: overwrites ``chunks_dir / "<subject>_chunks.txt"`` with the
    collected chunks and ``results_dir / "<subject>.txt"`` with the answer.

    Args:
        subject: Case-sensitive substring to look for (e.g. a first name).
        exclude: Iterable of strings; a transcript containing any of them is
            skipped entirely. Defaults to an empty tuple.

    Returns:
        The LLM response, stripped, with each period-plus-space replaced by a
        period followed by a line break.
    """
    results = []
    for file in files:
        text = srt_to_text(file)
        if subject not in text or any(name in text for name in exclude):
            continue
        for chunk in chunker(text):
            if isinstance(chunk, Chunk) and subject in chunk.text:
                results.append(chunk.text)

    # Join once and reuse: keeps the f-string below free of nested quotes and
    # backslashes, which are a syntax error on Python versions before 3.12.
    chunks_text = "\n".join(results)
    chunks_path = chunks_dir / f"{subject}_chunks.txt"
    chunks_path.write_text(chunks_text, encoding="utf-8")

    question = f"Who is {subject}?"
    context = f"{chunks_text}\n\n\n{question}"
    raw_answer = ask_llm(context)
    answer = raw_answer.strip().replace(". ", ".\n")

    out_path = results_dir / f"{subject}.txt"
    out_path.write_text(answer, encoding="utf-8")

    return answer

In [None]:
# Full names that share the first name but refer to other people; who_is()
# skips any transcript in which one of these strings appears.
exclude = [
    "Eleanor Rigby",
    "Eleanor Roosevelt",
    "Eleanor Braun",
]
answer = who_is(subject="Eleanor", exclude=exclude)

print(answer)

In [3]:
from gliner import GLiNER

# Load the pretrained GLiNER checkpoint once; get_names() reads this global.
model = GLiNER.from_pretrained(
    "urchade/gliner_medium-v2.1",
    max_length=768,
)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]



In [4]:
from pathlib import Path

# Collect the subtitle transcripts. Matching on the real suffix (instead of a
# substring of the name) skips files such as "x.srt.bak", and sorting makes the
# processing order independent of the arbitrary order iterdir() yields.
files = sorted(
    file for file in Path("../files/rotl").iterdir() if file.suffix == ".srt"
)

# Entity labels handed to model.predict_entities() in get_names().
labels = ["Person"]

def get_names(file):
    """Return every entity text the GLiNER model finds in one transcript.

    The transcript is converted to text with ``srt_to_text``, split with
    ``chunker``, and each chunk is passed to ``model.predict_entities`` with
    the module-level ``labels`` and a threshold of 0.5.

    Args:
        file: Path of the subtitle file to process.

    Returns:
        A list with the ``"text"`` value of each predicted entity, in chunk
        order. Duplicates are kept.
    """
    transcript = srt_to_text(file)
    results = []
    for chunk in chunker(transcript):
        # Same guard who_is() applies to chunker output: ignore anything that
        # is not a Chunk instead of failing on a missing .text attribute.
        if not isinstance(chunk, Chunk):
            continue
        entities = model.predict_entities(chunk.text, labels, threshold=0.5)
        results.extend(entity["text"] for entity in entities)
    return results

In [None]:
# Write one newline-separated list of detected names per transcript.
names_dir = Path("names")
names_dir.mkdir(exist_ok=True)
for file in files:
    names = get_names(file)
    # The output reuses the transcript's file name, extension included.
    names_path = names_dir / file.name
    # Explicit UTF-8: names may contain non-ASCII characters, and the
    # platform-default encoding is not guaranteed to represent them.
    names_path.write_text("\n".join(names), encoding="utf-8")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
