In [1]:
import sys
import os

import spacy
from spacy.tokens import Doc
from spacy import displacy

import re 
import pandas as pd

In [2]:
# One-time environment setup (uncomment to run in a fresh environment):
# pip install -U pip setuptools wheel
# !python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
                                              0.0/12.8 MB ? eta -:--:--
                                              0.1/12.8 MB 3.6 MB/s eta 0:00:04
     -                                        0.4/12.8 MB 5.5 MB/s eta 0:00:03
     --                                       0.9/12.8 MB 7.1 MB/s eta 0:00:02
     ----                                     1.4/12.8 MB 8.3 MB/s eta 0:00:02
     ------                                   2.0/12.8 MB 8.9 MB/s eta 0:00:02
     -------                                  2.5/12.8 MB 9.4 MB/s eta 0:00:02
     ---------                                3.0/12.8 MB 9.7 MB/s eta 0:00:02
     -----------                              3.6/12.8 MB 9.9 MB/s eta 0:00:01
     ------------                             4.1/12.8 MB 10.5 MB/s eta 0:00:01
     --------------                     

In [2]:
# Add the project's helper directory to the import search path so that
# `functions` (lib/utils/functions.py) can be imported below.
# NOTE(review): the path is relative, so this presumably only works when the
# notebook's working directory is the repository root — confirm.
sys.path.append('lib/utils/')

# Project helpers used in the cells below
from functions import ner, get_ne_list_per_sentence, filter_entity, create_relationships

## Load Books

In [3]:

# Collect the entries of the 'books' directory whose name ends in '.pdf',
# ordered by file name
all_books = sorted(
    (entry for entry in os.scandir('books') if entry.name.endswith('.pdf')),
    key=lambda entry: entry.name,
)
all_books

[<DirEntry '0 - Hunger Games.pdf'>,
 <DirEntry '1 - Catching Fire.pdf'>,
 <DirEntry '2 - Mocking Jay.pdf'>]

In [4]:
# Load the spaCy pipeline that is handed to the `ner` helper
nlp = spacy.load("en_core_web_sm")

# Starting page passed to `ner` for each known file name; any other file
# (Mockingjay in the current data set) starts at page 12
start_page_by_name = {
    '0 - Hunger Games.pdf': 3,   # Hunger Games
    '1 - Catching Fire.pdf': 3,  # Catching Fire
}

# Run `ner` on every book, keeping the results in file-name order
book_docs = []
for book in all_books:
    print(book.path)
    start_page = start_page_by_name.get(book.name, 12)
    book_doc = ner(book.path, start_page, nlp)
    book_docs.append(book_doc)

# Merge the per-book documents into one Doc
combined_doc = Doc.from_docs(book_docs)

books\0 - Hunger Games.pdf
books\1 - Catching Fire.pdf
books\2 - Mocking Jay.pdf


In [7]:
# Show entity highlighting for the leading slice [0:2000] of the combined document
preview_span = combined_doc[0:2000]
displacy.render(preview_span, style="ent", jupyter=True)

## Load Characters

In [13]:
# Read the character list from characters.csv
character_df = pd.read_csv("characters.csv")

# Remove parenthesised text (brackets included) from each name.
# The pattern is a raw string so that `\(` / `\)` reach the regex engine as
# written instead of being invalid string escapes, and the result is stripped
# so that a name like "Name (note)" does not keep the space that preceded
# the bracket.
character_df['character'] = (
    character_df['character']
    .str.replace(r"\(.*?\)", "", regex=True)
    .str.strip()
)

# First name = everything before the first space of the cleaned name
character_df['character_firstname'] = (
    character_df['character'].str.split(' ', n=1).str[0]
)

# Lift the row limit so the whole character table is displayed
pd.set_option('display.max_rows', None)
character_df

Unnamed: 0,book,character,character_firstname
0,Category:Catching Fire characters,Haymitch Abernathy,Haymitch
1,Category:Catching Fire characters,Atala,Atala
2,Category:Catching Fire characters,Beetee Latier,Beetee
3,Category:Catching Fire characters,Blight,Blight
4,Category:Catching Fire characters,Bonnie,Bonnie
5,Category:Catching Fire characters,Bristel,Bristel
6,Category:Catching Fire characters,Brutus,Brutus
7,Category:Catching Fire characters,Buttercup,Buttercup
8,Category:Catching Fire characters,Cashmere,Cashmere
9,Category:Catching Fire characters,Cecelia,Cecelia


## Get named entity list per sentence


In [8]:
# Call the `get_ne_list_per_sentence` helper (imported from `functions`) on the
# combined document and display what it returns.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt traceback,
# i.e. the call did not finish in the recorded run — confirm whether this cell
# is still meant to be executed.
sent_entity_df = get_ne_list_per_sentence(combined_doc)
sent_entity_df

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\Users\camil\Documents\github\0the-hunger-games-network\venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3508, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\camil\AppData\Local\Temp\ipykernel_21928\2356371432.py", line 1, in <module>
    sent_entity_df = get_ne_list_per_sentence(combined_doc)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\camil\Documents\github\0the-hunger-games-network\lib/utils\functions.py", line 48, in get_ne_list_per_sentence
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\camil\Documents\github\0the-hunger-games-network\venv\Lib\site-packages\IPython\core\interactiveshell.py", line 2105, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\camil\Documents\github\0the-

In [11]:

# One row per sentence of the combined document: the sentence itself and the
# text of every named entity found inside it
sent_entity_df = pd.DataFrame(
    [
        {"sentence": sentence, "entities": [entity.text for entity in sentence.ents]}
        for sentence in combined_doc.sents
    ]
)

In [10]:
# Total number of named entities in the combined document
len(combined_doc.ents)

12403

## Filter entities

In [15]:
def _character_entities(entities):
    """Pass one sentence's entity list through `filter_entity` with the character table."""
    return filter_entity(entities, character_df)


# Add, for every sentence, the result of filtering its entities against the character table
sent_entity_df['character_entities'] = sent_entity_df['entities'].apply(_character_entities)

# Keep only the sentences whose filtered entity list is non-empty
has_character = sent_entity_df['character_entities'].map(len) > 0
sent_entity_df_filtered = sent_entity_df.loc[has_character]
sent_entity_df_filtered.head(10)

Unnamed: 0,sentence,entities,character_entities
