In [50]:
import nltk
import pandas as pd
import requests

from string import punctuation
from typing import Any, Tuple
from nltk.tokenize import casual_tokenize, sent_tokenize

In [62]:
CHR_SPACE: str = " "
CHR_APOST: str = "â\x80\x99"  # '
CHR_SDQUOT: str = "â\x80\x9c" # "
CHR_DDQUOT: str = "â\x80\x9d" # ""
CHR_MISC: str = "â\x80\x94"   # Not sure what this is but it gets replaced by a space

def get_data(url: str) -> str:
    text_output: str = requests.get(url=url).text
    return text_output

def clean_data(text_input: str) -> str:
    index_start: int = text_input.index("One morning")
    index_end: int = text_input.rindex("*** END OF THE PROJECT GUTENBERG EBOOK METAMORPHOSIS ***")
    text_output: str = (
        text_input[index_start:index_end]
        .replace("\r", CHR_SPACE)
        .replace("\n", CHR_SPACE)
        .replace(CHR_APOST, "'")
        .replace(CHR_SDQUOT, '"') # lol
        .replace(CHR_DDQUOT, '"') # The book replaces this char with "" so that thoughts and dialog plus "he said" etc. get captured as one sentence
        .replace(CHR_MISC, CHR_SPACE)
    )

    return text_output

def remove_char_from_entity(text_entity: str, char_to_remove: str) -> str:
    if char_to_remove in text_entity:
        start_index: int = text_entity.index(char_to_remove)
        text_entity_cleaned: str = text_entity[:start_index]
        return text_entity_cleaned

    return text_entity

def remove_punctuation(text_entity: str) -> str:
    text_output: str = text_entity

    for next_punc in punctuation:
        text_output = remove_char_from_entity(text_entity=text_output, char_to_remove=next_punc)

    return text_output

def clean_entity(text_entity: str) -> str:
    text_output: str = remove_char_from_entity(text_entity=text_entity, char_to_remove="'")
    text_output = remove_punctuation(text_entity=text_output)

    return text_output

def extract_entities(text_input: str, desired_tag: str) -> list[str]:
    tokens = casual_tokenize(text=text_input)

    # nltk.pos_tag gives pairs of token (0) + tag (1)
    # Filtered on desired_tag
    tags_output: list[str] = [clean_entity(row[0]) for row in nltk.pos_tag(tokens) if row[1] == desired_tag]
    tags_output = [row for row in tags_output if len(row) > 1]

    return tags_output

def get_book_entities(url_book: str) -> list[str]:
    # Step 1: Get book data
    text_book: str = get_data(url=url_book)
    text_cleaned: str = clean_data(text_input=text_book)

    # Step 2: Get tags
    sentences: list[str] = sent_tokenize(text=text_cleaned)

    # Step 3: Get NNP tagged entities only, with punctuation etc. removed
    entities: list[list[str]] = [extract_entities(text_input=next_sentence, desired_tag="NNP") for next_sentence in sentences]
    entities_cleaned: list[list[str]] = [x if len(x) > 0 else None for x in entities]    

    # Step 4: Compile to dataframe
    df_entities: pd.DataFrame = (
        pd.DataFrame(dict(sentence=sentences, entities=entities_cleaned))
        .dropna()
    )
    df_entities = (
        df_entities[df_entities.entities.apply(len) > 1]
    )

    # Step 5: Just get a list
    list_entities: list[str] = df_entities.entities.to_list()
    
    return list_entities

In [27]:
url_metamorphosis: str = "https://www.gutenberg.org/files/5200/5200-0.txt"
text_metamorphosis: str = get_data(url=url_metamorphosis)

In [28]:
text_cleaned: str = clean_data(text_input=text_metamorphosis)
text_cleaned



In [29]:
# nltk processing
sentences: list[str] = sent_tokenize(text_cleaned)
sentences[0:10]

['One morning, when Gregor Samsa woke from troubled dreams, he found  himself transformed in his bed into a horrible vermin.',
 'He lay on his  armour-like back, and if he lifted his head a little he could see his  brown belly, slightly domed and divided by arches into stiff sections.',
 'The bedding was hardly able to cover it and seemed ready to slide off  any moment.',
 'His many legs, pitifully thin compared with the size of the  rest of him, waved about helplessly as he looked.',
 '"What\'s happened to me?"',
 'he thought.',
 "It wasn't a dream.",
 'His room, a  proper human room although a little too small, lay peacefully between  its four familiar walls.',
 'A collection of textile samples lay spread out  on the table Samsa was a travelling salesman and above it there hung a  picture that he had recently cut out of an illustrated magazine and  housed in a nice, gilded frame.',
 'It showed a lady fitted out with a fur  hat and fur boa who sat upright, raising a heavy fur muff tha

In [30]:
sentences_tagged: list[Tuple[Any, str]] = nltk.pos_tag(tokens=sentences)
sentences_tagged

[('One morning, when Gregor Samsa woke from troubled dreams, he found  himself transformed in his bed into a horrible vermin.',
  'NNP'),
 ('He lay on his  armour-like back, and if he lifted his head a little he could see his  brown belly, slightly domed and divided by arches into stiff sections.',
  'NNP'),
 ('The bedding was hardly able to cover it and seemed ready to slide off  any moment.',
  'NNP'),
 ('His many legs, pitifully thin compared with the size of the  rest of him, waved about helplessly as he looked.',
  'NNP'),
 ('"What\'s happened to me?"', 'NNP'),
 ('he thought.', 'NN'),
 ("It wasn't a dream.", 'NNP'),
 ('His room, a  proper human room although a little too small, lay peacefully between  its four familiar walls.',
  'NNP'),
 ('A collection of textile samples lay spread out  on the table Samsa was a travelling salesman and above it there hung a  picture that he had recently cut out of an illustrated magazine and  housed in a nice, gilded frame.',
  'NNP'),
 ('It showe

In [32]:
# The above isn't actually the desired output - need to use casual_tokenize on individual sentences.
test_sentence: str = sentences[0]
extract_entities(text_input=test_sentence, desired_tag="NNP")

['Gregor', 'Samsa']

In [60]:
# Do this for the whole text
entities: list[list[str]] = [extract_entities(text_input=next_sentence, desired_tag="NNP") for next_sentence in sentences]
entities_cleaned: list[list[str]] = [x if len(x) > 0 else None for x in entities]
entities_cleaned

[['Gregor', 'Samsa'],
 None,
 None,
 None,
 ['What'],
 None,
 None,
 None,
 ['Samsa'],
 None,
 ['Gregor'],
 None,
 None,
 None,
 None,
 ['Oh', 'God'],
 None,
 None,
 ['Hell'],
 None,
 None,
 None,
 ['You'],
 None,
 None,
 None,
 None,
 None,
 ['He'],
 None,
 None,
 None,
 ['First'],
 None,
 ['God', 'Heaven'],
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 ['Gregor'],
 None,
 None,
 ['Gregor'],
 None,
 ['Gregor'],
 None,
 None,
 ['Gregor'],
 ['Didn'],
 None,
 ['Gregor'],
 None,
 ['Gregor'],
 ['Gregor'],
 ['Gregor'],
 ['Gregor', 'Gregor'],
 ['Gregor'],
 None,
 ['Gregor'],
 ['Aren'],
 None,
 ['Gregor'],
 ['Gregor'],
 ['Gregor'],
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 ['Gregor'],
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 ['Seven'],
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 ['Gregor'],
 None,
 None,
 None,
 None,
 None,
 None,
 ['That'],
 None,
 ['They', 'Gregor'],
 None,
 ['Gregor'],
 ['Gregor'],
 None,
 None,
 None,
 None,
 

In [61]:
# Now compile to dataframe
# TODO: Look into a way to do this in one statement
df_entities: pd.DataFrame = (
    pd.DataFrame(dict(sentence=sentences, entities=entities_cleaned))
    .dropna()
)
df_entities = (
    df_entities[df_entities.entities.apply(len) > 1]
)

list_entities: list[str] = df_entities.entities.to_list()
list_entities

[['Gregor', 'Samsa'],
 ['Oh', 'God'],
 ['God', 'Heaven'],
 ['Gregor', 'Gregor'],
 ['They', 'Gregor'],
 ['Gregor', 'Gregor'],
 ['Yes', 'Gregor'],
 ['Mr', 'Samsa'],
 ['He', 'Gregor'],
 ['Well', 'Mrs', 'Samsa'],
 ['No', 'Gregor'],
 ['Mr', 'Samsa'],
 ['Mr', 'Samsa'],
 ['Sir', 'Gregor'],
 ['Oh', 'God'],
 ['Gregor', 'Quick'],
 ['Did', 'Gregor'],
 ['Gregor', 'Well', 'Gregor'],
 ['Help', 'Help'],
 ['Mother', 'Gregor'],
 ['Gregor', 'Gregor', 'Gregor'],
 ['Gregor', 'Gregor', 'Gregor'],
 ['Gregor', 'Gregor'],
 ['Gregor', 'Gregor'],
 ['Gregor', 'Gregor'],
 ['Gregor', 'Christmas'],
 ['Gregor', 'Gregor'],
 ['Grete', 'Gregor'],
 ['Grete', 'Gregor'],
 ['Gregor', 'Gregor'],
 ['Gregor', 'Gregor'],
 ['Gregor', 'Grete'],
 ['Grete', 'Gregor', 'Oh', 'God', 'God'],
 ['Gregor', 'Gregor', 'Grete'],
 ['Gregor', 'Sunday', 'Gregor'],
 ['III', 'Gregor'],
 ['Gregor', 'Gregor'],
 ['Gregor', 'Gregor'],
 ['Gregor', 'Gregor', 'Grete'],
 ['Gregor', 'Gregor'],
 ['Gregor', 'Gregor', 'Gregor', 'Gregor', 'Gregor'],
 ['Grego

In [63]:
get_book_entities(url_book=url_metamorphosis)

[['Gregor', 'Samsa'],
 ['Oh', 'God'],
 ['God', 'Heaven'],
 ['Gregor', 'Gregor'],
 ['They', 'Gregor'],
 ['Gregor', 'Gregor'],
 ['Yes', 'Gregor'],
 ['Mr', 'Samsa'],
 ['He', 'Gregor'],
 ['Well', 'Mrs', 'Samsa'],
 ['No', 'Gregor'],
 ['Mr', 'Samsa'],
 ['Mr', 'Samsa'],
 ['Sir', 'Gregor'],
 ['Oh', 'God'],
 ['Gregor', 'Quick'],
 ['Did', 'Gregor'],
 ['Gregor', 'Well', 'Gregor'],
 ['Help', 'Help'],
 ['Mother', 'Gregor'],
 ['Gregor', 'Gregor', 'Gregor'],
 ['Gregor', 'Gregor', 'Gregor'],
 ['Gregor', 'Gregor'],
 ['Gregor', 'Gregor'],
 ['Gregor', 'Gregor'],
 ['Gregor', 'Christmas'],
 ['Gregor', 'Gregor'],
 ['Grete', 'Gregor'],
 ['Grete', 'Gregor'],
 ['Gregor', 'Gregor'],
 ['Gregor', 'Gregor'],
 ['Gregor', 'Grete'],
 ['Grete', 'Gregor', 'Oh', 'God', 'God'],
 ['Gregor', 'Gregor', 'Grete'],
 ['Gregor', 'Sunday', 'Gregor'],
 ['III', 'Gregor'],
 ['Gregor', 'Gregor'],
 ['Gregor', 'Gregor'],
 ['Gregor', 'Gregor', 'Grete'],
 ['Gregor', 'Gregor'],
 ['Gregor', 'Gregor', 'Gregor', 'Gregor', 'Gregor'],
 ['Grego