In [1]:
from datetime import timedelta
import json
from pathlib import Path
import logging
import time

# Logging setup for the whole notebook: every record at INFO or above is sent
# both to the cell output (stream handler) and to logs.txt (file handler).
stream_handler = logging.StreamHandler()
stream_handler.setLevel(logging.INFO)

# Explicit UTF-8 so log lines containing non-ASCII text are written the same
# way on every platform instead of depending on the locale's default encoding.
file_handler = logging.FileHandler("logs.txt", encoding="utf-8")

# Drop everything below ERROR coming from the "httpx" logger.
logging.getLogger("httpx").setLevel(logging.ERROR)

# force=True replaces whatever handlers are already on the root logger.
# Without it basicConfig() silently does nothing once the root logger has been
# configured (for instance when this cell is re-run in the same kernel), which
# would leave the freshly opened logs.txt handle unused and never closed.
logging.basicConfig(
    level=logging.INFO,
    format="%(message)s",
    handlers=[file_handler, stream_handler],
    force=True,
)

import chromadb
from chromadb.utils.embedding_functions.sentence_transformer_embedding_function import (
    SentenceTransformerEmbeddingFunction,
)
from gliner import GLiNER
from pydantic import BaseModel

from chunking_utils import get_chunks
from llm_utils import ask_llm
from metadata_utils import get_meta
from nlp_utils import get_entities, get_tags, get_relevant_chunks
from transcript_utils import srt_to_text


class Entity(BaseModel):
    """Pydantic schema for one labelled span of text.

    NOTE(review): nothing in the visible cells instantiates this model. The
    field names suggest it describes a single entity prediction (offsets,
    matched text, label, confidence) — confirm against the code that produces
    entities before relying on it.
    """

    start: int  # presumably the span's start offset in the source text — TODO confirm
    end: int  # presumably the span's end offset; inclusive/exclusive is not shown here
    text: str  # text of the span
    label: str  # label attached to the span
    score: float  # presumably a confidence value — range is not shown here


# Name of the LLM handed to ask_llm() in the ingestion cell.
LLM_MODEL = "qwen2.5:14b"

# GLiNER checkpoint used by the entity and tag helpers; loaded once per kernel.
GLINER_CHECKPOINT = "urchade/gliner_base"
model = GLiNER.from_pretrained(GLINER_CHECKPOINT, max_length=768)

Folder does not exist locally, attempting to use huggingface hub.


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]



In [2]:
# Directory (relative to the notebook) that holds the Chroma database files;
# it is created on the first run and reused afterwards.
chromadb_dir = Path("chromadb")
chromadb_dir.mkdir(exist_ok=True)

# Open the on-disk client, then fetch the collection (creating it if missing).
COLLECTION_NAME = "roderick"
chroma_client = chromadb.PersistentClient(path=str(chromadb_dir))
collection = chroma_client.get_or_create_collection(name=COLLECTION_NAME)

Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [None]:
# Ingest podcast transcripts into the Chroma collection.
#
# For every selected .srt file this cell:
#   1. stores each transcript chunk as its own document,
#   2. runs get_entities() over the chunks,
#   3. asks the LLM what the hosts say about each entity, using the chunks
#      returned by get_relevant_chunks() as context, and
#   4. stores one summary document per entity together with its labels/tags.
# Per-entity and per-episode timings are emitted through logging.

SHOW_NAME = "Roderick on the Line"
# How many transcripts to process; set to None to process every file found.
MAX_EPISODES = 1

# Compare the real extension so names that merely contain ".srt"
# (e.g. "episode.srt.bak") are not picked up.
files = sorted(
    file for file in Path("../files/rotl").iterdir() if file.suffix == ".srt"
)

for file in files[:MAX_EPISODES]:
    start_episode = time.time()

    file_name, episode_number, episode_date, episode_title = get_meta(file)
    transcript = srt_to_text(file)
    chunks = get_chunks(transcript)
    if not chunks:
        # Nothing to store or analyse for this episode; move on rather than
        # sending an empty batch to the collection.
        logging.warning(f"rotl {episode_number} produced no chunks, skipping")
        continue

    # One metadata record per chunk; the entity-only fields stay empty so chunk
    # documents and entity documents share the same set of metadata keys.
    metadatas = [
        {
            "chunks": str(i),
            "show": SHOW_NAME,
            "episode": episode_number,
            "title": episode_title,
            "subject": "",
            "category": "",
            "tags": "",
        }
        for i in range(len(chunks))
    ]
    ids = [f"rotl_{episode_number}_{i}" for i in range(len(chunks))]

    # upsert instead of add: the ids are deterministic, so re-running the cell
    # refreshes the stored documents instead of colliding with existing ids.
    collection.upsert(documents=chunks, metadatas=metadatas, ids=ids)

    # Mapping of entity text -> dict with "labels" and "indexes" entries.
    results = get_entities(chunks, model)

    logging.info(
        f"rotl {episode_number} entities generated {(time.time() - start_episode):.1f}s"
    )
    for entity, data in results.items():
        start_entity = time.time()
        labels = data["labels"]
        # presumably the positions in `chunks` where the entity occurs — TODO confirm
        indexes = data["indexes"]
        relevant_chunks = get_relevant_chunks(chunks, indexes)

        context = "\n".join(relevant_chunks)
        question = f"What do John and Merlin say about {entity}?"
        answer = ask_llm(f"{context}\n\n{question}", model=LLM_MODEL, tokens=500)

        # The hosts' names are passed as stopwords for tag extraction.
        tags = get_tags(answer, model, stopwords=["john", "merlin"])

        # Stored document layout: entity, labels, tags, then the LLM answer.
        doc = f"{entity}\n\n{', '.join(labels)}\n\n{', '.join(tags)}\n\n{answer}"

        # Named doc_id (not id) so the builtin id() is not shadowed kernel-wide.
        doc_id = f"{entity}_rotl_{episode_number}"
        metadata = {
            "chunks": ",".join([str(i) for i in indexes]),
            "show": SHOW_NAME,
            "episode": episode_number,
            "title": episode_title,
            "subject": entity,
            "category": ",".join(labels),
            "tags": ",".join(tags),
        }

        collection.upsert(documents=[doc], ids=[doc_id], metadatas=[metadata])
        logging.info(
            f"rotl {episode_number} {entity} {(time.time() - start_entity):.1f}s"
        )

    # total_seconds() rather than .seconds, which drops whole days and would
    # under-report a run lasting longer than 24 hours.
    td = timedelta(seconds=time.time() - start_episode)
    minutes, seconds = divmod(int(td.total_seconds()), 60)
    formatted = f"{minutes}min {seconds}s"
    logging.info(f"rotl {episode_number} {episode_title} {formatted}")

rotl 000 entities generated 8.2s
rotl 000 AK-Forty-Sevens 3.9s
rotl 000 amazon.com 2.2s
rotl 000 America 3.1s
rotl 000 American 3.0s
rotl 000 Americans 2.4s
rotl 000 Arcade Fire 3.7s
rotl 000 Arkansas 1.8s
rotl 000 Arnold Palmer 2.1s
rotl 000 Arnold Palmer drink 3.5s
rotl 000 Arnold Palmer Light 5.0s
rotl 000 Arnold Palmers 3.9s
rotl 000 Artisanal 3.1s
rotl 000 Artisanal bread 2.2s
rotl 000 artisanal ones 2.3s
rotl 000 baby 4.7s
rotl 000 bachelor 2.3s
rotl 000 band 2.8s
rotl 000 bar 3.8s
rotl 000 barrow smith 2.7s
rotl 000 bars 3.0s
rotl 000 bartender 5.3s
rotl 000 battle 4.3s
rotl 000 beer 2.1s
rotl 000 Berlin 2.7s
rotl 000 Bible 1.9s
rotl 000 birthday party 2.1s
rotl 000 black light 2.4s
rotl 000 blend 3.9s
rotl 000 book 2.0s
rotl 000 burlap 2.4s
rotl 000 button-down shirt 3.6s
rotl 000 carbohydrates 3.1s
rotl 000 Carson 1.5s
rotl 000 cartoon character 2.2s
rotl 000 CFLs 3.0s
rotl 000 chamber 2.3s
rotl 000 cheese 2.4s
rotl 000 cherry juice 3.2s
rotl 000 chick magnet trucker cap 2.7s


In [None]:
# Search text to look up in the collection; empty here, fill in before running.
query = ""
# Query with a single text and request 10 results for it.
# NOTE: this rebinds `results`, which the ingestion cell used for its entity mapping.
results = collection.query(query_texts=[query], n_results=10)

# "documents" is indexed per query text; [0] selects the entry for the only query sent.
docs = results["documents"][0]