In [15]:
import requests

import spacy
from spacy import Language

In [16]:
nlp: Language = spacy.load("en_core_web_md")

In [30]:
CHR_SPACE: str = " "
CHR_APOST: str = "â\x80\x99"  # '
CHR_SDQUOT: str = "â\x80\x9c" # "
CHR_DDQUOT: str = "â\x80\x9d" # ""
CHR_MISC: str = "â\x80\x94"   # Not sure what this is but it gets replaced by a space

def get_data(url: str) -> str:
    text_output: str = requests.get(url=url).text
    return text_output

def clean_data(text_input: str, start_text: str, end_text: str="THE END") -> str:
    index_start: int = text_input.index(start_text)
    index_end: int = text_input.rindex(end_text)
    text_output: str = (
        text_input[index_start:index_end]
        .replace("\r", CHR_SPACE)
        .replace("\n", CHR_SPACE)
        .replace(CHR_APOST, "'")
        .replace(CHR_SDQUOT, '"') # lol
        .replace(CHR_DDQUOT, '"') # The book replaces this char with "" so that thoughts and dialog plus "he said" etc. get captured as one sentence
        .replace(CHR_MISC, CHR_SPACE)
    )

    return text_output

def remove_char_from_entity(text_entity: str, char_to_remove: str) -> str:
    if char_to_remove in text_entity:
        start_index: int = text_entity.index(char_to_remove)
        text_entity_cleaned: str = text_entity[:start_index]
        return text_entity_cleaned

    return text_entity

def clean_entity(text_entity: str) -> str:
    text_output: str = text_entity.strip()
    text_output = remove_char_from_entity(text_entity=text_output, char_to_remove="'s")

    return text_output

def extract_entities_from_sentence(sentence: spacy.tokens.span.Span, desired_tags: list[str]) -> list[str]:
    sentence_doc = nlp(sentence.text)

    entities: list[str] = [clean_entity(next_entity.text) for next_entity in sentence_doc.ents if next_entity.label_ in desired_tags]
    entities = list(filter(lambda x: x != "", entities))

    return list(set(entities))

def extract_entities(sentences_input: list[str], desired_tags: list[str]) -> list[str]:
    # Filtered on desired_tag
    
    entities: list[str] = [extract_entities_from_sentence(sentence=next_sentence, desired_tags=desired_tags) for next_sentence in sentences_input]
    
    entities = list(filter(lambda x: len(x) > 1, entities))

    return entities

def get_book_entities(url_book: str, start_text: str, end_text: str="THE END") -> list[str]:
    # Step 1: Get book data
    text_book: str = get_data(url=url_book)
    text_cleaned: str = clean_data(text_input=text_book, start_text=start_text, end_text=end_text)

    doc = nlp(text=text_cleaned)

    # Step 2: Get tags
    sentences: list[str] = list(doc.sents)

    entities: list[list[str]] = extract_entities(sentences_input=sentences, desired_tags=["PERSON", "ORG", "GPE"])
    
    return entities

In [18]:
url_metamorphosis: str = "https://www.gutenberg.org/files/5200/5200-0.txt"
entities_metamorphosis: list[list[str]] = get_book_entities(url_book=url_metamorphosis, start_text="One moring", end_text="*** END OF THE PROJECT GUTENBERG EBOOK METAMORPHOSIS ***")

In [19]:
entities_metamorphosis

[['Grete', 'Gregor'],
 ['Grete', 'Gregor'],
 ['fro', 'Gregor'],
 ['Grete', 'Gregor'],
 ['Grete', 'Gregor'],
 ['Grete', 'Gregor'],
 ['Grete', 'Gregor'],
 ['Grete', 'Gregor'],
 ['Gregor', 'jaws'],
 ['Gregor', 'Samsa'],
 ['Gregor', 'Samsa'],
 ['Grete', 'Samsa'],
 ['Grete', 'Samsa'],
 ['Grete', 'Samsa']]

In [35]:
def extract_entities_dk(text: str):
    doc = nlp(text)
    sentences = list(doc.sents)
    entities = []
    for sentence in sentences:
        sentence_entities = []
        sent_doc = nlp(sentence.text)
        for ent in sent_doc.ents:
            if ent.label_ in ['PERSON', 'ORG', 'GPE']:
                entity = ent.text.strip()
                if "'s" in entity:
                    cutoff = entity.index("'s")
                    entity = entity[:cutoff]
                if entity != '':
                    sentence_entities.append(entity)
        sentence_entities = list(set(sentence_entities))
        if len(sentence_entities) > 1:
            entities.append(sentence_entities)
    return entities

def get_book_entities_dk(url_book: str, start_text: str, end_text: str) -> list[str]:
    # Step 1: Get book data
    text_book: str = get_data(url=url_book)
    text_cleaned: str = clean_data(text_input=text_book, start_text=start_text, end_text=end_text)

    entities: list[list[str]] = extract_entities_dk(text=text_cleaned)
    
    return entities

In [36]:
# Sanity check against the book - the book lists different output to what the copy-pasted code does, but same as the code written above
# This is likely a difference in spaCy lib/model versions.
entities_dk: list[list[str]] = get_book_entities_dk(url_book=url_metamorphosis, start_text="One morning", end_text="*** END OF THE PROJECT GUTENBERG EBOOK METAMORPHOSIS ***")

In [37]:
entities_dk

[['Grete', 'Gregor'],
 ['Grete', 'Gregor'],
 ['fro', 'Gregor'],
 ['Grete', 'Gregor'],
 ['Grete', 'Gregor'],
 ['Grete', 'Gregor'],
 ['Grete', 'Gregor'],
 ['Grete', 'Gregor'],
 ['Gregor', 'jaws'],
 ['Gregor', 'Samsa'],
 ['Gregor', 'Samsa'],
 ['Grete', 'Samsa'],
 ['Grete', 'Samsa'],
 ['Grete', 'Samsa']]

In [33]:
def test_ajp_vs_dk_entities(ent_ajp: list[list[str]], ent_dk: list[list[str]]):
    overall_match: bool = True

    assert len(ent_ajp) == len(ent_dk)
    for next_index in range(len(ent_ajp)):
        print(f"{ent_ajp[next_index]=}")
        print(f"{ent_dk[next_index]=}")
        current_match = ent_dk[next_index] == ent_ajp[next_index]
        print(f"Match: {current_match}")
        overall_match = overall_match and current_match

    print(f"Overall match: {overall_match}")

In [38]:
test_ajp_vs_dk_entities(entities_metamorphosis, entities_dk)

ent_ajp[next_index]=['Grete', 'Gregor']
ent_dk[next_index]=['Grete', 'Gregor']
Match: True
ent_ajp[next_index]=['Grete', 'Gregor']
ent_dk[next_index]=['Grete', 'Gregor']
Match: True
ent_ajp[next_index]=['fro', 'Gregor']
ent_dk[next_index]=['fro', 'Gregor']
Match: True
ent_ajp[next_index]=['Grete', 'Gregor']
ent_dk[next_index]=['Grete', 'Gregor']
Match: True
ent_ajp[next_index]=['Grete', 'Gregor']
ent_dk[next_index]=['Grete', 'Gregor']
Match: True
ent_ajp[next_index]=['Grete', 'Gregor']
ent_dk[next_index]=['Grete', 'Gregor']
Match: True
ent_ajp[next_index]=['Grete', 'Gregor']
ent_dk[next_index]=['Grete', 'Gregor']
Match: True
ent_ajp[next_index]=['Grete', 'Gregor']
ent_dk[next_index]=['Grete', 'Gregor']
Match: True
ent_ajp[next_index]=['Gregor', 'jaws']
ent_dk[next_index]=['Gregor', 'jaws']
Match: True
ent_ajp[next_index]=['Gregor', 'Samsa']
ent_dk[next_index]=['Gregor', 'Samsa']
Match: True
ent_ajp[next_index]=['Gregor', 'Samsa']
ent_dk[next_index]=['Gregor', 'Samsa']
Match: True
ent_a

In [24]:
url_alice: str = "https://www.gutenberg.org/files/11/11-0.txt"

In [32]:
entities_alice: list[list[str]] = get_book_entities(url_book=url_alice, start_text="Alice was beginning", end_text="THE END")
entities_alice

[['New Zealand', 'Australia'],
 ['Hearthrug', 'Alice'],
 ['Ada', 'Mabel'],
 ['Rome', 'Paris', 'London'],
 ['Alice', 'Mabel'],
 ['William the Conqueror', 'Alice'],
 ['Mouse', 'Alice'],
 ['Mouse', 'Alice'],
 ['Mouse', 'Alice'],
 ['Northumbria', 'Mercia', 'Edwin', 'Morcar'],
 ['Morcar', 'Mercia', 'Canterbury', 'Northumbria', 'Stigand'],
 ['Mouse', 'â\x80\x98it'],
 ['Mouse', 'Edgar', 'William'],
 ['Normans', 'Alice'],
 ['Mouse', 'Alice'],
 ['Mouse', 'Alice'],
 ['Mouse', 'Fury', 'â\x80\x98Let'],
 ['Mouse', 'Alice'],
 ['Mouse', 'Alice'],
 ['Dinah', 'Alice'],
 ['Mary Ann', 'Alice'],
 ['Bill', 'Alice'],
 ['Bill', 'Alice'],
 ['Caterpillar', 'Alice'],
 ['Caterpillar', 'Alice'],
 ['Caterpillar', 'Alice'],
 ['Caterpillar', 'William_'],
 ['William', 'Alice'],
 ['Caterpillar', 'Alice'],
 ['Alice', 'Pigeon'],
 ['Fish-Footman', 'Alice'],
 ['Alice', 'Cheshire'],
 ['CHORUS', 'Alice'],
 ['Duchess', 'Alice'],
 ['Hare', 'Hatter'],
 ['Hare', 'Alice'],
 ['Hare', 'Hatter', 'Dormouse'],
 ['Alice', 'Dormouse'],

In [39]:
entities_alice_dk: list[list[str]] = get_book_entities_dk(url_book=url_alice, start_text="Alice was beginning", end_text="THE END")
entities_alice_dk

[['New Zealand', 'Australia'],
 ['Hearthrug', 'Alice'],
 ['Ada', 'Mabel'],
 ['Rome', 'Paris', 'London'],
 ['Alice', 'Mabel'],
 ['William the Conqueror', 'Alice'],
 ['Mouse', 'Alice'],
 ['Mouse', 'Alice'],
 ['Mouse', 'Alice'],
 ['Northumbria', 'Mercia', 'Edwin', 'Morcar'],
 ['Morcar', 'Mercia', 'Canterbury', 'Northumbria', 'Stigand'],
 ['Mouse', 'â\x80\x98it'],
 ['Mouse', 'Edgar', 'William'],
 ['Normans', 'Alice'],
 ['Mouse', 'Alice'],
 ['Mouse', 'Alice'],
 ['Mouse', 'Fury', 'â\x80\x98Let'],
 ['Mouse', 'Alice'],
 ['Mouse', 'Alice'],
 ['Dinah', 'Alice'],
 ['Mary Ann', 'Alice'],
 ['Bill', 'Alice'],
 ['Bill', 'Alice'],
 ['Caterpillar', 'Alice'],
 ['Caterpillar', 'Alice'],
 ['Caterpillar', 'Alice'],
 ['Caterpillar', 'William_'],
 ['William', 'Alice'],
 ['Caterpillar', 'Alice'],
 ['Alice', 'Pigeon'],
 ['Fish-Footman', 'Alice'],
 ['Alice', 'Cheshire'],
 ['CHORUS', 'Alice'],
 ['Duchess', 'Alice'],
 ['Hare', 'Hatter'],
 ['Hare', 'Alice'],
 ['Hare', 'Hatter', 'Dormouse'],
 ['Alice', 'Dormouse'],

In [40]:
test_ajp_vs_dk_entities(entities_alice, entities_alice_dk)

ent_ajp[next_index]=['New Zealand', 'Australia']
ent_dk[next_index]=['New Zealand', 'Australia']
Match: True
ent_ajp[next_index]=['Hearthrug', 'Alice']
ent_dk[next_index]=['Hearthrug', 'Alice']
Match: True
ent_ajp[next_index]=['Ada', 'Mabel']
ent_dk[next_index]=['Ada', 'Mabel']
Match: True
ent_ajp[next_index]=['Rome', 'Paris', 'London']
ent_dk[next_index]=['Rome', 'Paris', 'London']
Match: True
ent_ajp[next_index]=['Alice', 'Mabel']
ent_dk[next_index]=['Alice', 'Mabel']
Match: True
ent_ajp[next_index]=['William the Conqueror', 'Alice']
ent_dk[next_index]=['William the Conqueror', 'Alice']
Match: True
ent_ajp[next_index]=['Mouse', 'Alice']
ent_dk[next_index]=['Mouse', 'Alice']
Match: True
ent_ajp[next_index]=['Mouse', 'Alice']
ent_dk[next_index]=['Mouse', 'Alice']
Match: True
ent_ajp[next_index]=['Mouse', 'Alice']
ent_dk[next_index]=['Mouse', 'Alice']
Match: True
ent_ajp[next_index]=['Northumbria', 'Mercia', 'Edwin', 'Morcar']
ent_dk[next_index]=['Northumbria', 'Mercia', 'Edwin', 'Morca