In [None]:
import json
from pathlib import Path
import logging
import time

# Echo INFO-and-above records to the notebook's output stream.
stream_handler = logging.StreamHandler()
stream_handler.setLevel(logging.INFO)

# Drop everything below ERROR from the "httpx" logger
# (presumably its per-request INFO lines — confirm).
logging.getLogger("httpx").setLevel(logging.ERROR)

# force=True replaces whatever handlers the root logger already has. Without
# it, basicConfig() is a silent no-op once the root logger is configured
# (e.g. when this cell is re-run), and every re-run would open one more
# FileHandler on logs.txt that is never attached and never closed.
# encoding="utf-8" keeps non-ASCII text loggable regardless of the platform's
# default encoding.
logging.basicConfig(
    level=logging.INFO,
    format="%(message)s",
    handlers=[logging.FileHandler("logs.txt", encoding="utf-8"), stream_handler],
    force=True,
)

import chromadb
from chromadb.utils.embedding_functions.sentence_transformer_embedding_function import (
    SentenceTransformerEmbeddingFunction,
)
from gliner import GLiNER
from pydantic import BaseModel

from chunking_utils import get_chunks
from llm_utils import ask_llm
from metadata_utils import get_meta
from nlp_utils import get_entities, get_tags, get_relevant_chunks
from transcript_utils import srt_to_text


class Entity(BaseModel):
    """A single labelled text span, validated by pydantic.

    NOTE(review): the field names look like the per-entity dicts produced by
    GLiNER — confirm. Nothing in the visible cells instantiates this class.
    """

    start: int  # presumably the span's start offset in the source text — confirm
    end: int  # presumably the span's end offset — confirm inclusive vs. exclusive
    text: str  # the text of the span
    label: str  # the label attached to the span
    score: float  # presumably the model's confidence for the label — confirm


# Identifier of the LLM handed to ask_llm(model=...) when summarising entities.
LLM_MODEL = "qwen2.5:14b"

# Shared GLiNER model, passed to get_entities() and get_tags() by the ingest cell.
# NOTE(review): max_length=768 presumably widens the model's input window — confirm.
model = GLiNER.from_pretrained(
    "urchade/gliner_base",
    max_length=768,
)

In [None]:
# On-disk location of the Chroma store, relative to the notebook's working directory.
chromadb_dir = Path("chromadb")
chromadb_dir.mkdir(exist_ok=True)  # re-runnable: no error if the directory already exists

# Open the persistent store and fetch the "roderick" collection, creating it on first use.
# NOTE(review): SentenceTransformerEmbeddingFunction is imported by this notebook but no
# embedding_function is passed here — confirm that Chroma's default embedder is intended.
chroma_client = chromadb.PersistentClient(path=str(chromadb_dir))
collection = chroma_client.get_or_create_collection(name="roderick")

In [None]:
# Ingest pipeline: transcript -> chunks -> entities -> one LLM summary per
# entity -> one Chroma record per (entity, episode).

# How many transcripts to ingest per run; set to None to ingest all of them.
EPISODE_LIMIT = 1

# Compare the real extension: a substring test on the name would also accept
# files such as "episode.srt.bak".
files = sorted(
    file for file in Path("../files/rotl").iterdir() if file.suffix == ".srt"
)

for file in files[:EPISODE_LIMIT]:
    file_name, episode_number, episode_date, episode_title = get_meta(file)
    transcript = srt_to_text(file)
    chunks = get_chunks(transcript)

    # perf_counter() is monotonic, so the reported durations cannot be skewed
    # by wall-clock adjustments.
    start_entities = time.perf_counter()
    # Maps each entity to a dict with "labels" and "indexes" (see the reads below).
    entities = get_entities(chunks, model)
    logging.info(
        f"rotl {episode_number} entities generated {(time.perf_counter()-start_entities):.1f}s"
    )

    for entity, data in entities.items():
        start_entity = time.perf_counter()
        labels = data["labels"]
        indexes = data["indexes"]

        # Only the chunks selected by `indexes` are given to the LLM as context.
        relevant_chunks = get_relevant_chunks(chunks, indexes)
        context = "\n".join(relevant_chunks)
        question = f"What do John and Merlin say about {entity}?"
        answer = ask_llm(f"{context}\n\n{question}", model=LLM_MODEL, tokens=500)

        tags = get_tags(answer, model, stopwords=["john", "merlin"])

        # Join outside the f-string: reusing the enclosing quote character
        # inside a replacement field is a SyntaxError before Python 3.12.
        label_text = ", ".join(labels)
        tag_text = ", ".join(tags)
        doc = f"{entity}\n\n{label_text}\n\n{tag_text}\n\n{answer}"

        # `doc_id` rather than `id`, so the builtin is not shadowed in the kernel.
        doc_id = f"{entity}_rotl_{episode_number}"
        metadata = {
            "chunks": ",".join(str(i) for i in indexes),
            "show": "Roderick on the Line",
            "episode": episode_number,
            "title": episode_title,
            "subject": entity,
            "category": ",".join(labels),
            "tags": ",".join(tags),
        }

        # upsert keeps the cell re-runnable: add() ignores ids that already
        # exist, which would throw away the freshly generated answer.
        collection.upsert(documents=[doc], ids=[doc_id], metadatas=[metadata])
        logging.info(
            f"rotl {episode_number} {entity} inserted {(time.perf_counter()-start_entity):.1f}s"
        )

In [None]:
# Free-text search string; empty here, presumably a placeholder to fill in before running.
query = ""
# Ask the collection for up to 10 records matching the single query text.
results = collection.query(query_texts=[query], n_results=10)

# "documents" holds one list per query text; [0] picks the list for the only query sent.
docs = results["documents"][0]