In [1]:
from pathlib import Path
import re
import sys

from openai import OpenAI

# Put the parent of the current working directory at the front of the import
# path; presumably this is what lets the `utils` / `app.utils` imports below
# resolve when the notebook is run from its own sub-directory.
parent_dir = str(Path.cwd().resolve().parents[0])
sys.path.insert(0, parent_dir)

from utils import get_meta, chunker, num_tokens
from app.utils import srt_to_lines, srt_to_text


def tokenize_text(text):
    """Split ``text`` into a flat list of string tokens.

    A token is either a run of word characters, optionally continued by
    ``-`` or ``_`` followed by more word characters (so ``well-known`` stays
    one token), or any single non-whitespace character. Whitespace is dropped.
    """
    token_pattern = re.compile(r"\w+(?:[-_]\w+)*|\S")
    return [match.group(0) for match in token_pattern.finditer(text)]


# Endpoint and model used by every LLM call in this notebook.
# NOTE(review): the URL path suggests an OpenAI-compatible vLLM server — confirm.
BASE_URL = "https://mlkyway.anselbrandt.net/vllm/v1"
model = "Qwen/Qwen2.5-3B-Instruct"

# The key is a literal placeholder; presumably the server does not check it.
openai_api_base = BASE_URL
openai_api_key = "EMPTY"

# Shared client instance; `ask_llm` below sends its requests through it.
client = OpenAI(base_url=openai_api_base, api_key=openai_api_key)


def ask_llm(context, model=model, max_tokens=1000, temperature=0):
    """Send ``context`` as a single user message and return the reply text.

    Args:
        context: Prompt text, sent as the content of one ``user`` message.
        model: Model identifier passed with the request. The default is the
            value the module-level ``model`` had when this cell was executed;
            rebinding ``model`` afterwards does not change the default.
        max_tokens: Cap on the number of generated tokens (defaults to the
            previously hard-coded 1000).
        temperature: Sampling temperature (defaults to the previously
            hard-coded 0).

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    response = client.chat.completions.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        messages=[{"role": "user", "content": context}],
    )
    return response.choices[0].message.content

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]



In [2]:
from gliner import GLiNER

# Load the pretrained GLiNER checkpoint; the name-extraction cell further down
# calls `gliner_model.predict_entities` on it.
# NOTE(review): the progress bar in this cell's output suggests the checkpoint
# files are fetched when this runs — confirm.
gliner_model = GLiNER.from_pretrained("urchade/gliner_base")

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
# Collect the .srt transcripts and write the `srt_to_text` output of every
# transcript that mentions "Ariella" into ./ariella (same file name).
# `srt_dir` replaces the former `dir` variable, which shadowed the builtin.
srt_dir = Path("../files/bulk/rotl")
# Match on the real extension rather than a ".srt" substring so names such as
# "x.srt.bak" and directories are skipped; sort for a reproducible order.
files = sorted(
    path for path in srt_dir.iterdir() if path.is_file() and path.suffix == ".srt"
)

out_dir = Path() / "ariella"
out_dir.mkdir(exist_ok=True)

for file in files:
    text = srt_to_text(file)
    if "Ariella" in text:
        out_path = out_dir / file.name
        with open(out_path, "w") as f:
            f.write(text)

In [6]:
# For every file in ./ariella, run GLiNER "Person" extraction line by line and
# write the sorted, de-duplicated entity texts to ./names/<same file name>.
# `names_src_dir` replaces the former `dir` variable, which shadowed the builtin.
names_src_dir = Path("ariella")
files = sorted(path for path in names_src_dir.iterdir() if path.is_file())

out_dir = Path() / "names"
out_dir.mkdir(exist_ok=True)

for file in files:
    # NOTE(review): none of these metadata values are used in this cell; the
    # call is kept unchanged in case get_meta validates the file name.
    file_name, episode_number, episode_date, episode_title = get_meta(file)
    # read_text() closes the file handle, unlike a bare open(...).read().
    lines = file.read_text().splitlines()
    names = set()
    for line in lines:
        # Split on the first ": " only: an utterance may itself contain ": ",
        # and a line without any separator is treated as all utterance.
        speaker, sep, utterance = line.partition(": ")
        if not sep:
            utterance = line
        if not utterance.strip():
            continue  # nothing to run entity extraction on
        entities = gliner_model.predict_entities(
            utterance, ["Person"], threshold=0.5
        )
        names.update(entity["text"] for entity in entities)
    out_path = out_dir / file.name
    with open(out_path, "w") as f:
        f.write("\n".join(sorted(names)))

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
