In [1]:
from datetime import timedelta
import json
from pathlib import Path
import logging
import time

stream_handler = logging.StreamHandler()
stream_handler.setLevel(logging.INFO)

logging.getLogger("httpx").setLevel(logging.ERROR)
logging.basicConfig(
    level=logging.INFO,
    format="%(message)s",
    handlers=[logging.FileHandler("logs.txt"), stream_handler],
)

import chromadb
from chromadb.utils.embedding_functions.sentence_transformer_embedding_function import (
    SentenceTransformerEmbeddingFunction,
)
from gliner import GLiNER
from pydantic import BaseModel

from chunking_utils import get_chunks
from llm_utils import ask_llm
from metadata_utils import get_meta
from nlp_utils import get_entities, get_tags, get_relevant_chunks
from transcript_utils import srt_to_text


class Entity(BaseModel):
    start: int
    end: int
    text: str
    label: str
    score: float


model = GLiNER.from_pretrained("urchade/gliner_base", max_length=768)

LLM_MODEL = "qwen2.5:14b"

Folder does not exist locally, attempting to use huggingface hub.


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]



In [None]:
chromadb_dir = Path("chromadb")
chromadb_dir.mkdir(exist_ok=True)

chroma_client = chromadb.PersistentClient(path=str(chromadb_dir))
collection = chroma_client.get_or_create_collection(name="roderick")

Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [27]:
def exists(id):
    results = collection.get(ids=[id])
    return True if results["documents"] and len(results["documents"]) else False


files = [
    file for file in sorted(Path("../files/rotl").iterdir()) if ".srt" in file.name
]

last_file = "106"
filtered_files = [file for file in files if float(get_meta(file)[1]) > float(last_file)]

for file in filtered_files:
    start_episode = time.time()

    file_name, episode_number, episode_date, episode_title = get_meta(file)
    transcript = srt_to_text(file)

    summary_instructions = f"Summarize this episode of Roderick on the Line"
    summary = ask_llm(
        f"{transcript}\n\n{summary_instructions}", model=LLM_MODEL, tokens=500
    )
    summary_metadata = {
        "chunks": "",
        "show": "Roderick on the Line",
        "episode": episode_number,
        "title": episode_title,
        "subject": f"Roderick on the Line, episode {episode_number}, {episode_title}",
        "category": "Summary",
        "tags": f"Roderick on the Line, episode {episode_number}, {episode_title}",
    }
    summary_id = f"rotl_{episode_number}_summary"
    summary_doc = f"Roderick on the Line\n\nEpisode {episode_number}\n\n{episode_title}\n\nSummary of Roderick on the Line, episode {episode_number}, {episode_title}, {episode_date}\n\n{summary}"

    collection.add(
        documents=[summary_doc], metadatas=[summary_metadata], ids=[summary_id]
    )

    chunks = get_chunks(transcript)
    metadatas = [
        {
            "chunks": str(i),
            "show": "Roderick on the Line",
            "episode": episode_number,
            "title": episode_title,
            "subject": "",
            "category": "",
            "tags": "",
        }
        for i in range(len(chunks))
    ]
    ids = [f"rotl_{episode_number}_{str(i)}" for i in range(len(chunks))]

    collection.add(documents=chunks, metadatas=metadatas, ids=ids)

    results = get_entities(chunks, model)

    logging.info(
        f"rotl {episode_number} entities generated {(time.time() - start_episode):.1f}s"
    )
    for entity, data in results.items():
        id = f"{entity}_rotl_{episode_number}"
        if not exists(id):
            start_entity = time.time()
            labels = data["labels"]
            indexes = data["indexes"]
            relevant_chunks = get_relevant_chunks(chunks, indexes)

            context = "\n".join(relevant_chunks)
            question = f"What do John and Merlin say about {entity}?"
            start_answer = time.time()
            answer = ask_llm(f"{context}\n\n{question}", model=LLM_MODEL, tokens=500)

            tags = get_tags(answer, model, stopwords=["john", "merlin"])

            doc = f"{entity}\n\n{', '.join(labels)}\n\n{', '.join(tags)}\n\n{answer}"

            metadata = {
                "chunks": ",".join([str(i) for i in indexes]),
                "show": "Roderick on the Line",
                "episode": episode_number,
                "title": episode_title,
                "subject": entity,
                "category": ",".join(labels),
                "tags": ",".join(tags),
            }

            collection.add(documents=[doc], ids=[id], metadatas=[metadata])
            logging.info(
                f"rotl {episode_number} {entity} {(time.time() - start_entity):.1f}s"
            )
    td = timedelta(seconds=time.time() - start_episode)
    formatted = f"{td.seconds // 60}min {td.seconds % 60}s"
    logging.info(f"rotl {episode_number} {episode_title} {formatted}")

Add of existing embedding ID: rotl_107_summary
Insert of existing embedding ID: rotl_107_summary
Add of existing embedding ID: rotl_107_0
Add of existing embedding ID: rotl_107_1
Add of existing embedding ID: rotl_107_2
Add of existing embedding ID: rotl_107_3
Add of existing embedding ID: rotl_107_4
Add of existing embedding ID: rotl_107_5
Add of existing embedding ID: rotl_107_6
Add of existing embedding ID: rotl_107_7
Add of existing embedding ID: rotl_107_8
Add of existing embedding ID: rotl_107_9
Add of existing embedding ID: rotl_107_10
Add of existing embedding ID: rotl_107_11
Add of existing embedding ID: rotl_107_12
Add of existing embedding ID: rotl_107_13
Add of existing embedding ID: rotl_107_14
Add of existing embedding ID: rotl_107_15
Add of existing embedding ID: rotl_107_16
Add of existing embedding ID: rotl_107_17
Add of existing embedding ID: rotl_107_18
Add of existing embedding ID: rotl_107_19
Add of existing embedding ID: rotl_107_20
Add of existing embedding ID: r

In [38]:
query = "What is John's daughter's name?"
results = collection.query(query_texts=[query], n_results=20)

docs = results["documents"][0]

context = "\n".join(docs)

answer = ask_llm(f"{context}\n\n{query}", model=LLM_MODEL, tokens=500)

print(answer)

with open("docs.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(docs))

From the context provided, it seems that John and Merlin have agreed not to use specific names for their children out of respect for privacy. Therefore, John’s daughter's name has not been mentioned directly in the conversation. They refer to her simply as "my daughter" or similar terms to protect her identity.
