In [1]:
import json
from pathlib import Path

from gliner import GLiNER
from pydantic import BaseModel

from metadata_utils import get_meta
from llm_utils import ask_llm
from transcript_utils import srt_to_text
from chunking_utils import chunker, token_counter

# Load the pretrained GLiNER checkpoint first, then move it onto the CUDA device.
model = GLiNER.from_pretrained("urchade/gliner_medium-v2.1", max_length=768)
model = model.to("cuda")


class Entity(BaseModel):
    """Pydantic schema describing a single named-entity record.

    NOTE(review): the field names look like they mirror the dicts returned by
    ``model.predict_entities`` (the notebook reads ``text`` and ``label`` from
    those dicts); the remaining fields are assumed to match -- confirm.
    This class is not referenced anywhere else in the visible notebook code.
    """

    # presumably the offset where the entity span begins -- TODO confirm
    start: int
    # presumably the offset where the entity span ends -- TODO confirm
    end: int
    # surface text of the entity
    text: str
    # entity type assigned to the span
    label: str
    # presumably the model's confidence for this prediction -- TODO confirm
    score: float

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]



In [None]:
# Entity texts that are discarded after lower-casing. The words are grouped by
# kind for readability; sorting restores the alphabetical list order.
STOP_WORDS = sorted(
    (
        # weekdays
        "monday",
        "tuesday",
        "wednesday",
        "thursday",
        "friday",
        "saturday",
        "sunday",
        # relative days
        "today",
        "tomorrow",
        # pronouns
        "he",
        "her",
        "i",
        "it",
        "she",
        "they",
        "we",
        "you",
        # question words
        "what",
        "which",
        "who",
        # miscellaneous
        "miss",
        "one",
    )
)

# Entity types requested from the NER model, grouped by theme and then sorted
# so the resulting list is in alphabetical order.
LABELS = sorted(
    (
        # people and groups
        "Person",
        "Demographic Group",
        "Nationality, Religious, or Political Group",
        "MusicalGroup",
        "Organization",
        # places
        "Geo-Political Entity",
        "Location",
        # time and happenings
        "Date",
        "Time",
        "Event",
        # things
        "Product",
        "Work of Art",
    )
)

# Subtitle transcripts to consider, in name order. Only regular files whose
# extension is exactly ".srt" are kept: a substring test on the name would
# also pick up things like "episode.srt.bak" or a directory containing ".srt".
# iterdir() is used on purpose so a missing input directory fails loudly.
files = [
    file
    for file in sorted(Path("../files/rotl").iterdir())
    if file.is_file() and file.suffix == ".srt"
]

# Model identifier passed to ask_llm for every question.
LLM_MODEL = "qwen2.5:32b"

# One output directory per LLM model, created next to the notebook.
out = Path(f"test_{LLM_MODEL}")
out.mkdir(exist_ok=True)

# Only this episode is processed; every other transcript is skipped.
TARGET_EPISODE = "398"
# The hosts' names are never kept as tags for an answer.
SPEAKER_NAMES = {"john", "merlin"}


def _safe_filename(name):
    """Return ``name`` made safe to use as a single file-name component.

    Path separators, characters reserved on Windows and control characters
    (including newlines) are replaced with ``_`` so that an entity such as
    "9/11" can neither break the write nor escape the output directory.
    An empty result falls back to ``"_"``.
    """
    cleaned = "".join(
        "_" if ch in '/\\:*?"<>|' or ord(ch) < 32 else ch for ch in name
    ).strip()
    return cleaned or "_"


for file in files:
    file_name, episode_number, episode_date, episode_title = get_meta(file)
    if episode_number != TARGET_EPISODE:
        continue

    transcript = srt_to_text(file)
    all_chunks = [chunk.text for chunk in chunker(transcript)]

    # Pass 1: run NER over every chunk and group the hits case-insensitively.
    # For each entity we remember its spellings, its labels, and the chunk it
    # was found in together with both neighbouring chunk positions.
    results = {}
    for i, chunk in enumerate(all_chunks):
        # Speaker prefixes are stripped for NER only; the LLM context below
        # uses the untouched chunks.
        dialogue_only = chunk.replace("Merlin: ", "").replace("John: ", "")
        for entity in model.predict_entities(dialogue_only, LABELS, threshold=0.5):
            ner_entity = entity["text"]
            entity_key = ner_entity.lower()
            if entity_key in STOP_WORDS:
                continue
            record = results.setdefault(
                entity_key, {"keys": set(), "labels": set(), "indexes": set()}
            )
            record["keys"].add(ner_entity)
            record["labels"].add(entity["label"])
            record["indexes"].update((i - 1, i, i + 1))

    # Pass 2: ask the LLM about each entity and write one text file per entity.
    for key, data in sorted(results.items()):
        main_entity = sorted(data["keys"])[0]
        labels = data["labels"]
        # Clip the neighbour positions to the valid range. The first and the
        # last chunk are valid context and must be kept: only -1 and
        # len(all_chunks) can fall outside the list.
        indexes = sorted(i for i in data["indexes"] if 0 <= i < len(all_chunks))
        chunks = [all_chunks[i] for i in indexes]
        out_path = out / f"{_safe_filename(main_entity)}.txt"
        context = "\n".join(chunks)
        question = f"What are John and Merlin saying about {main_entity}?"
        answer = ask_llm(f"{context}\n\n{question}", model=LLM_MODEL, tokens=500)

        # Tags are the entities found in the answer itself, minus stop words
        # and the two hosts.
        tags = set()
        for entity in model.predict_entities(answer, LABELS, threshold=0.5):
            ner_entity = entity["text"]
            lowered = ner_entity.lower()
            if lowered not in STOP_WORDS and lowered not in SPEAKER_NAMES:
                tags.add(ner_entity)

        # Sets have no stable order; sort so repeated runs write identical files.
        labels_text = ",".join(sorted(labels))
        tags_text = ", ".join(sorted(tags))
        indexes_text = ", ".join(str(i) for i in indexes)
        answer_text = answer.replace("\n\n", "\n")
        out_path.write_text(
            f"{main_entity}\n\n{labels_text}\n\nTags: {tags_text}\n\n{answer_text}\n\nChunks: {indexes_text}",
            encoding="utf-8",
        )

BadRequestError: Error code: 400 - {'error': {'message': 'model is required', 'type': 'api_error', 'param': None, 'code': None}}