In [None]:
import os
import re
from collections.abc import Iterable, Iterator
from os import linesep, path, DirEntry

from IPython.display import display

import pandas as pd
import spacy as sp
from pandas import DataFrame
from spacy.tokens import Doc

In [None]:
# Shell escape: download the `en_core_web_trf` spaCy pipeline that the next cell loads.
# NOTE(review): spaCy's documented invocation is `python -m spacy download <name>`;
# this goes through `ipython -m` instead — confirm it installs into the kernel's environment.
!ipython -m spacy download en_core_web_trf

In [None]:
# Load the `en_core_web_trf` spaCy pipeline once; `nlp` is the shared pipeline object
# that later cells call on each book's text.
nlp = sp.load("en_core_web_trf")

In [None]:
# Character table: semicolon-separated UTF-8 CSV whose first row is the header and
# whose first column becomes the index. Later cells read its "FirstName" and
# "FullName" columns.
chars_df = pd.read_csv(
    path.join("csv", "characters.csv"),
    sep=";",
    header=0,
    index_col=0,
    encoding="utf-8",
)
chars_df

In [None]:
def trim_content(entry: DirEntry) -> str:
    """Return the text of ``entry`` with empty and whitespace-only lines removed.

    Args:
        entry: Directory entry of the text file to read. It is handed straight to
            ``open``, so the file is decoded with ``open``'s default encoding.

    Returns:
        The remaining lines joined with ``os.linesep``.
    """
    # Context manager: the handle is closed as soon as the text is read rather than
    # whenever the file object happens to be garbage-collected.
    with open(entry) as handle:
        text = handle.read()
    return linesep.join(line for line in text.splitlines() if line.strip())


def entries_to_dicts() -> list[dict[str, str | Doc]]:
    """Run the spaCy pipeline on every ``.txt`` file found in the ``texts`` directory.

    Returns:
        One dict per file, ordered by file name, with the keys

        * ``"File"``: the file name,
        * ``"Book"``: the file name with every ``"<digit or capital letter> - "``
          marker and every ``".txt"`` removed,
        * ``"Doc"``: the result of calling ``nlp`` on the trimmed file content.
    """
    # os.scandir yields entries in arbitrary order; sorting by name makes the order of
    # the books (and of every row derived from them) reproducible between runs.
    # The ``with`` block releases the directory handle deterministically.
    with os.scandir("texts") as scanned:
        text_files = sorted(
            (entry for entry in scanned if entry.name.endswith(".txt")),
            key=lambda entry: entry.name,
        )
    # NOTE(review): the marker pattern is unanchored and matches a single digit, so a
    # prefix such as "10 - " would leave the "1" behind — confirm against the real file names.
    return [{"File": entry.name,
             "Book": re.sub(r"(?:\d|[A-Z])\s-\s", "", entry.name).replace(".txt", ""),
             "Doc": nlp(trim_content(entry))} for entry in text_files]


# Parse all books; this calls the `nlp` pipeline once per text file.
book_entries = entries_to_dicts()
# Persist the parsed result with IPython's %store magic so it can be restored later
# with `%store -r book_entries`.
%store book_entries

In [None]:
def contents_to_dicts(_dicts: list[dict[str, str | Doc]]) -> list[dict[str, str]]:
    book: str
    doc: Doc
    for book, doc in zip([d["Book"] for d in book_entries], [d["Doc"] for d in book_entries]):
        for sentence in doc.sents:
            entities = [entity.text for entity in sentence.ents]
            yield dict(Book=book, Sentence=sentence.as_doc().text, Entities=entities)

In [None]:
def filter_entities(entities: list[str], *args: Iterable[str]) -> list[str]:
    """Keep only the entities that occur in at least one of the given collections.

    Args:
        entities: Candidate entity strings.
        *args: Iterables of accepted names. The notebook passes single DataFrame
            columns (Series), whose iteration yields the column's values; note that
            iterating a whole DataFrame would yield its column labels instead.

    Returns:
        The entities contained in the union of ``args``, in their original order
        and with duplicates preserved.
    """
    # Build the lookup set once instead of once per entity: linear instead of
    # quadratic work, and one-shot iterables (generators) are consumed only once.
    accepted = set().union(*args)
    return [ent for ent in entities if ent in accepted]


roman_numeral = "M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})"

sents_df = DataFrame.from_records(contents_to_dicts(book_entries))
sents_df["CharacterEntities"] = sents_df["Entities"] \
    .apply(lambda l: filter_entities(l, chars_df["FirstName"], chars_df["FullName"])) \
    .apply(lambda l: [re.sub(f"\s(?!{roman_numeral}$).*", "", s) for s in l])
sents_df["Sentence"] = sents_df["Sentence"] \
    .apply(lambda x: x.replace("\r\n", "")) \
    .apply(lambda x: x.strip())
filtered_sents_df = sents_df[sents_df["CharacterEntities"].map(len) > 0]
%store sents_df
%store filtered_sents_df

In [None]:
# Restore both frames from IPython's %store storage (written by the previous cell),
# then render them with rich display.
%store -r sents_df
%store -r filtered_sents_df
display(sents_df, filtered_sents_df)