In [1]:
from pathlib import Path
import sys
import re

from gliner import GLiNER

# Load the pretrained GLiNER checkpoint once, at cell execution time;
# `get_entities` below reuses this module-level instance for every call.
model = GLiNER.from_pretrained("urchade/gliner_medium-v2.1")
from ollama import Client
import tiktoken
from chonkie import SDPMChunker

# Prepend the parent of the current working directory to sys.path so the
# `app` package imported just below can be resolved. Presumably the notebook
# is run from a folder one level under the project root — confirm if moved.
parent_dir = str(Path().resolve().parents[0])
sys.path.insert(0, parent_dir)

from app.utils import ShowMetadataList, ShowMetadata, metadata_to_dict

# One row per show: (show key, dates file, titles file).
_SHOW_FILES = (
    (
        "rotl",
        "../files/meta/rotl_dates.txt",
        "../files/meta/rotl_titles.txt",
    ),
    (
        "roadwork",
        "../files/meta/roadwork_dates.txt",
        "../files/meta/roadwork_titles.txt",
    ),
)

# Wrap every row in a ShowMetadata record and bundle them into one list object.
data = ShowMetadataList(
    shows=[
        ShowMetadata(show, Path(dates_path), Path(titles_path))
        for show, dates_path, titles_path in _SHOW_FILES
    ]
)

# Later cells index this as dates_titles[show]["dates" | "titles"][episode].
dates_titles = metadata_to_dict(data)

# Directory holding the transcript files to process.
# NOTE(review): `dir` shadows the built-in of the same name; the name is kept
# because other cells may already rely on it.
dir = Path("../files/ariella/")

# Keep regular files only, and sort them: `iterdir()` yields entries in
# arbitrary, filesystem-dependent order, so `files[:1]` in the later cells
# would otherwise select a different transcript from one run to the next, and
# a stray sub-directory would make the subsequent `open()` fail.
files = sorted(file for file in dir.iterdir() if file.is_file())

# Intermediate artefacts (chunks, names) are written here, relative to the
# current working directory.
out_dir = Path() / "output"
out_dir.mkdir(exist_ok=True)


def get_entities(transcript):
    """Collect the distinct "Person" entities GLiNER finds in a transcript.

    Each line is expected to look like ``"<speaker>: <text>"``. Only the part
    after the first ``": "`` is sent to the model. Lines without that prefix
    are sent whole, and blank lines are skipped.

    Args:
        transcript: Transcript text, one utterance per line.

    Returns:
        A sorted list of the unique entity strings the module-level ``model``
        returned for the ``"Person"`` label at threshold 0.5.
    """
    names = set()
    for line in transcript.splitlines():
        # partition() splits on the first ": " only and never raises, unlike
        # two-target unpacking of split(": "), which failed on blank lines,
        # lines with no speaker prefix, and utterances containing ": ".
        _, separator, text = line.partition(": ")
        if not separator:
            # No speaker prefix: treat the whole line as spoken text.
            text = line
        if not text.strip():
            continue  # nothing to tag on an empty utterance
        for entity in model.predict_entities(text, ["Person"], threshold=0.5):
            names.add(entity["text"])
    return sorted(names)


# Shared Ollama client used by `ask_llm`; every chat request is sent to this
# hard-coded remote host.
client = Client(host="https://mlkyway.anselbrandt.net/ollama")


def ask_llm(context):
    """Send a single user message to the chat model and return its reply text.

    Args:
        context: Prompt text, sent verbatim as the content of one user message.

    Returns:
        The value found at ``response["message"]["content"]``.
    """
    user_message = {"role": "user", "content": context}
    reply = client.chat(model="gemma2:27b", messages=[user_message])
    return reply["message"]["content"]


def clean(text):
    """Pull the answer paragraph out of an LLM reply and drop asterisks.

    The reply is split on blank lines. When it has at least two paragraphs the
    second one is used (the first is treated as a lead-in sentence). When
    there is only one paragraph, that paragraph is used instead of raising
    ``IndexError``.

    Args:
        text: Raw reply text from the model.

    Returns:
        The selected paragraph with every ``*`` character removed. Whitespace
        that followed a bullet marker is left in place.
    """
    paragraphs = text.split("\n\n")
    # str.split always returns at least one element, so index 0 is safe.
    response = paragraphs[1] if len(paragraphs) > 1 else paragraphs[0]
    return response.replace("*", "")


def get_names(text):
    """Turn an LLM reply into a list of names, one per line of the answer.

    Args:
        text: Raw reply text from the model; passed through ``clean`` first.

    Returns:
        The non-blank lines of the cleaned reply with surrounding whitespace
        removed.
    """
    names = []
    for line in clean(text).splitlines():
        # Removing "*" bullet markers leaves a leading space on each entry;
        # strip it so later substring matching and file names use the bare name.
        name = line.strip()
        if name:
            names.append(name)
    return names


# Module-level chunker instance; the "Chunk" cell calls it on a whole
# transcript and reads `.text` from each returned chunk.
# NOTE(review): the parameter notes below are inferred from the argument
# names — confirm against the chonkie documentation.
chunker = SDPMChunker(
    embedding_model="minishlab/potion-base-8M",
    threshold=0.5,
    chunk_size=512,  # presumably an upper bound on chunk size in tokens
    min_sentences=1,
    skip_window=1,
    delim="\n",  # presumably makes each transcript line a sentence boundary
)


def num_tokens(string: str) -> int:
    """Count the tokens of ``string`` under tiktoken's ``cl100k_base`` encoding.

    Args:
        string: Text to encode.

    Returns:
        Length of the sequence produced by encoding ``string``.
    """
    tokens = tiktoken.get_encoding("cl100k_base").encode(string)
    return len(tokens)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]



In [None]:
# Chunk
#
# Run the chunker over the selected transcript(s) and store the chunk texts in
# output/chunks.txt for the following cells.

for file in files[:1]:
    file_name = file.name
    # The episode id is the part of the file name before the first "_-_".
    episode = file_name.split("_-_")[0]
    # NOTE(review): `date` and `title` are not used in this cell; the lookups
    # are kept so an episode without "rotl" metadata still fails here.
    date = dates_titles["rotl"]["dates"][episode]
    title = dates_titles["rotl"]["titles"][episode]
    # read_text() opens and closes the file itself, so no handle is leaked.
    transcript = file.read_text()
    chunks = chunker(transcript)
    chunk_text = [chunk.text for chunk in chunks]
    # Mode "w" overwrites the file on every iteration, so this only keeps the
    # last transcript's chunks if the slice above is widened.
    with open(out_dir / "chunks.txt", "w") as f:
        f.write("\n".join(chunk_text))

In [None]:
# Extract names
#
# Tag person entities in the selected transcript(s), ask the LLM to keep only
# real people's names, and store the result in output/names.txt.

for file in files[:1]:
    file_name = file.name
    # The episode id is the part of the file name before the first "_-_".
    episode = file_name.split("_-_")[0]
    # NOTE(review): `date`, `title` and `chunks` are not used in this cell.
    date = dates_titles["rotl"]["dates"][episode]
    title = dates_titles["rotl"]["titles"][episode]
    # read_text() opens and closes the file itself, so no handle is leaked.
    transcript = file.read_text()
    # Read via `out_dir` so the location matches where the Chunk cell wrote it.
    chunks = (out_dir / "chunks.txt").read_text().split("\n\n")
    named_entities = get_entities(transcript)
    entities_query = "Which of these are people's names? Only return results if they are people's names."
    # The candidate list is interpolated as its Python list repr.
    entities_context = f"{entities_query}\n\n{named_entities}"
    entities_response = ask_llm(entities_context)
    names = get_names(entities_response)
    with open(out_dir / "names.txt", "w") as f:
        f.write("\n".join(names))

In [6]:
def clean_name_response(text):
    """Tidy an LLM answer: collapse blank lines and drop the sign-off line.

    Args:
        text: Raw reply text from the model.

    Returns:
        The reply with runs of newlines collapsed to one, and with every line
        containing the stock closing sentence removed, joined by ``"\\n"``.
    """
    boilerplate = "Let me know if you have any other questions"
    collapsed = re.sub(r"\n+", "\n", text)
    kept = []
    for line in collapsed.splitlines():
        if boilerplate in line:
            continue
        kept.append(line)
    return "\n".join(kept)

In [7]:
# Find Chunk
#
# For every extracted name, gather the chunks that mention it, ask the LLM who
# that person is, and save the tidied answer to names/<name>.txt.

name_dir = Path() / "names"
name_dir.mkdir(exist_ok=True)

for file in files[:1]:
    file_name = file.name
    # The episode id is the part of the file name before the first "_-_".
    episode = file_name.split("_-_")[0]
    # NOTE(review): `date`, `title` and `transcript` are not used in this cell.
    date = dates_titles["rotl"]["dates"][episode]
    title = dates_titles["rotl"]["titles"][episode]
    # read_text() opens and closes each file itself, so no handle is leaked.
    transcript = file.read_text()
    # Read via `out_dir` so the locations match where earlier cells wrote them.
    chunks = (out_dir / "chunks.txt").read_text().split("\n\n")
    names = (out_dir / "names.txt").read_text().splitlines()
    for name in names:
        name = name.strip()
        if not name:
            # An empty string is a substring of every chunk: it would send the
            # whole transcript to the LLM and write a file called ".txt".
            continue
        relevant_chunks = [chunk for chunk in chunks if name in chunk]
        matching_chunks = "\n".join(relevant_chunks)
        query = f"Who is {name}?"
        context = f"{matching_chunks}\n\n{query}"
        name_response = ask_llm(context)
        response = clean_name_response(name_response)
        # Names come from LLM output; replace path separators so the name
        # cannot point outside `name_dir` or at a missing sub-directory.
        safe_name = re.sub(r"[\\/]", "_", name)
        with open(name_dir / f"{safe_name}.txt", "w") as f:
            f.write(response)