In [16]:
from metadata_utils import get_meta
from llm_utils import ask_llm
from transcript_utils import srt_to_text
from chunking_utils import chunker, token_counter
from chonkie import Chunk

In [17]:
from pathlib import Path

# Every SRT transcript in the source folder. The suffix is compared exactly so
# that names which merely contain ".srt" (e.g. "x.srt.bak") are not picked up,
# and the result is sorted because iterdir() yields entries in arbitrary order.
files = sorted(
    file for file in Path("../files/rotl").iterdir() if file.suffix == ".srt"
)

# Folder that receives one "<subject>.txt" answer file per who_is() call.
results_dir = Path("summaries_test")
results_dir.mkdir(exist_ok=True)

# Folder that receives the chunks handed to the LLM, for later inspection.
chunks_dir = Path("chunks")
chunks_dir.mkdir(exist_ok=True)


def who_is(subject, exclude=()):
    """Ask the LLM who ``subject`` is, using transcript chunks that mention them.

    Each path in the module-level ``files`` is converted to text with
    ``srt_to_text``. A transcript is used only when its text contains
    ``subject`` and none of the strings in ``exclude``. Used transcripts are
    split with ``chunker`` and every ``Chunk`` whose text contains ``subject``
    is collected.

    Side effects:
        Writes the collected chunks (newline-joined) to
        ``chunks_dir / "<subject>_chunks.txt"`` and the formatted answer to
        ``results_dir / "<subject>.txt"``, both as UTF-8.

    Args:
        subject: String to search for (plain, case-sensitive substring match).
        exclude: Iterable of strings; a transcript containing any of them is
            skipped entirely. ``None`` is treated as "exclude nothing".

    Returns:
        The LLM answer, stripped, with every ". " replaced by ".\\n".
    """
    excluded = tuple(exclude or ())

    results = []
    for file in files:
        text = srt_to_text(file)
        if subject not in text:
            continue
        # NOTE(review): exclusion is applied to the whole transcript, so an
        # episode that mentions both the subject and an excluded name
        # contributes no chunks at all - confirm this is intended.
        if any(name in text for name in excluded):
            continue
        results.extend(
            chunk.text
            for chunk in chunker(text)
            if isinstance(chunk, Chunk) and subject in chunk.text
        )

    # Joined once and reused. Building it outside the f-string also avoids
    # nesting double quotes inside a double-quoted f-string, which is only
    # valid syntax from Python 3.12 onwards.
    joined_chunks = "\n".join(results)
    chunks_path = chunks_dir / f"{subject}_chunks.txt"
    chunks_path.write_text(joined_chunks, encoding="utf-8")

    question = f"Who is {subject}?"
    context = f"{joined_chunks}\n\n\n{question}"
    raw_answer = ask_llm(context)
    # One sentence per line makes the saved answer easier to read.
    answer = raw_answer.strip().replace(". ", ".\n")

    out_path = results_dir / f"{subject}.txt"
    out_path.write_text(answer, encoding="utf-8")

    return answer

In [18]:
# Longer names that contain "Eleanor"; they are passed to who_is() as the
# exclusion list for this lookup.
exclude = [
    "Eleanor Rigby",
    "Eleanor Roosevelt",
    "Eleanor Braun",
]
answer = who_is("Eleanor", exclude=exclude)

# print() rather than a bare expression so the line breaks in the answer render.
print(answer)

Eleanor is John Roderick's wife.

In the story, John and Eleanor are on a flight together, and John is trying to navigate the airport using a sign with a map on it.
Eleanor is worried about being late and doesn't want John to draw attention to themselves by trying to navigate.

John compares Eleanor to his mother, who is always punctual and never late.
He also mentions that Eleanor has been on the same flight for eight hours, which may explain her anxiety about being late.

Later in the story, John mentions that Eleanor forbade him from feeding the cats because there were too many of them and they were causing problems.
This suggests that Eleanor is a practical and no-nonsense person who is concerned about taking care of their home and family.

Overall, Eleanor is portrayed as a loving and caring wife who is also practical and concerned about being on time.
She is not afraid to stand up to her husband when she thinks he is being foolish, but she also cares about him and wants to protec

In [19]:
import torch
from gliner import GLiNER

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA instead of failing here.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = GLiNER.from_pretrained("urchade/gliner_medium-v2.1", max_length=768).to(device)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]



In [23]:
from pathlib import Path

# Every SRT transcript in the source folder. The suffix is compared exactly so
# that names which merely contain ".srt" are not picked up, and the result is
# sorted because iterdir() yields entries in arbitrary order.
files = sorted(
    file for file in Path("../files/rotl").iterdir() if file.suffix == ".srt"
)

# Entity labels requested from the GLiNER model.
labels = ["Person"]

# Lower-cased entity texts that get_names() discards.
STOP_WORDS = [
    "merlin",
    "john",
    "i",
    "we",
    "you",
    "he",
    "her",
    "she",
    "it",
    "they",
    "one",
    "who",
    "what",
    "which",
]


def get_names(file):
    """Return the entity texts the model finds in one transcript.

    The transcript is converted with ``srt_to_text`` and split with
    ``chunker``; each chunk's text is passed to ``model.predict_entities``
    with the module-level ``labels`` and a threshold of 0.5. Entity texts
    whose lower-cased form is in ``STOP_WORDS`` are dropped.

    Args:
        file: Transcript path, passed to ``get_meta`` and ``srt_to_text``.

    Returns:
        A list of entity texts in the order they were found; duplicates are
        kept.
    """
    # The metadata is unpacked but none of it is used below.
    _file_name, _episode_number, _episode_date, _episode_title = get_meta(file)

    names = []
    for piece in chunker(srt_to_text(file)):
        found = model.predict_entities(piece.text, labels, threshold=0.5)
        names.extend(
            hit["text"] for hit in found if hit["text"].lower() not in STOP_WORDS
        )
    return names

In [24]:
# Run entity extraction on every transcript and save the result, one entity
# text per line, as names/<transcript stem>.txt.
names_dir = Path("names")
names_dir.mkdir(exist_ok=True)
for file in files:
    names = get_names(file)
    names_path = names_dir / f"{file.stem}.txt"
    with names_path.open("w") as handle:
        handle.write("\n".join(names))

In [25]:
from collections import Counter


def count_occurrences(nested_lists):
    """Count how often each item appears across a collection of iterables.

    Args:
        nested_lists: Iterable of iterables of hashable items.

    Returns:
        A plain ``dict`` mapping each item to its total count, with keys in
        first-seen order.
    """
    tally = Counter()
    for group in nested_lists:
        for entry in group:
            tally[entry] += 1
    return dict(tally)


# Only regular files are read: a sub-directory inside "names" would make the
# read below fail. The list is sorted because iterdir() order is arbitrary and
# first-seen order decides how equal counts are ranked further down.
name_files = sorted(file for file in Path("names").iterdir() if file.is_file())

# One list of entity texts per transcript file.
all_names = [file.read_text().splitlines() for file in name_files]

name_counts = count_occurrences(all_names)

counts_list = list(name_counts.items())

# Most frequent first; the sort is stable, so ties keep first-seen order.
sorted_counts = sorted(counts_list, key=lambda x: x[1], reverse=True)

# One "name|count" line per distinct entity text.
counts_strings = [f"{name}|{count}" for name, count in sorted_counts]

with open("counts.txt", "w") as f:
    f.write("\n".join(counts_strings))

In [None]:
# Load the "name|count" lines back and keep only the names.
with open("counts.txt", "r") as f:
    lines = f.read().splitlines()

# Split on the LAST "|" only: the count never contains one, but a name may,
# and splitting on the first "|" would cut such a name short.
names = [line.rsplit("|", 1)[0] for line in lines]

len(names)

12833