In [None]:
!pip install EbookLib

Collecting EbookLib
  Downloading EbookLib-0.18.tar.gz (115 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/115.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.5/115.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: EbookLib
  Building wheel for EbookLib (setup.py) ... [?25l[?25hdone
  Created wheel for EbookLib: filename=EbookLib-0.18-py3-none-any.whl size=38778 sha256=c2188f09b63a0dac6b42b3aca872b0eaf5c707c9f3e05c72a6a6a4bfaff31b24
  Stored in directory: /root/.cache/pip/wheels/0f/38/cc/a3728bb72a315d9d8766fb71d362136372066fc25ad838f8fa
Successfully built EbookLib
Installing collected packages: EbookLib
Successfully installed EbookLib-0.18


In [None]:
import urllib.request
import ebooklib
from ebooklib import epub

In [None]:
# Mount Google Drive at /content/drive (requires the Colab runtime).
# Every data path used in this notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Path to the EPUB file in Google Drive.
# Exactly one `epub_path` assignment should be active; the commented-out lines
# are the other books in the same TORI_dataset folder (uncomment one to switch).
# NOTE: the text-export path and the BookNLP input/output paths further down are
# hard-coded to "blue_castle" -- update them as well when switching books.
#epub_path = '/content/drive/My Drive/TORI_dataset/A_roomtour.epub'
#epub_path = '/content/drive/My Drive/TORI_dataset/alice_wonderland.epub'
#epub_path = '/content/drive/My Drive/TORI_dataset/enchanted_april.epub'
#epub_path = '/content/drive/My Drive/TORI_dataset/Gatsby.epub'
epub_path = '/content/drive/My Drive/TORI_dataset/blue_castle.epub'
#epub_path = '/content/drive/My Drive/TORI_dataset/little_woman.epub'

# Read the EPUB file into the `book` object that the following cells operate on.
book = epub.read_epub(epub_path)

In [None]:
def is_valid(epub_book):
    """Validates the structure of an EPUB book.

    The checks run in order and stop at the first failure, printing the reason:
    the table of contents is non-empty, the spine is non-empty, at least one
    stylesheet item exists, and no HTML document referenced by the spine has
    empty (or whitespace-only) content.

    Args:
        epub_book: The book object returned by ``epub.read_epub``.

    Returns:
        True when every check passes, False otherwise.
    """
    if not epub_book.toc:
        print('Table of content is missing')
        return False
    if not epub_book.spine:
        print('Spine is missing')
        return False

    # Check for stylesheets
    if not list(epub_book.get_items_of_type(ebooklib.ITEM_STYLE)):
        print('No stylesheets')
        return False

    # Ensure chapters are not empty.
    # Spine entries are item ids or (id, linear) tuples rather than item
    # objects, so each one has to be resolved to its item before it can be
    # inspected; testing the raw entry with isinstance() never matches.
    for spine_entry in epub_book.spine:
        ref = spine_entry[0] if isinstance(spine_entry, tuple) else spine_entry
        if isinstance(ref, epub.EpubHtml):
            item = ref
        else:
            item = epub_book.get_item_with_id(ref)

        if isinstance(item, epub.EpubHtml) and not (item.content or '').strip():
            print(f"Invalid book: Chapter '{item.get_id()}' is empty.")
            return False

    return True

In [None]:
# Run the structural validation once and report the verdict.
print("The EPUB book is valid." if is_valid(book) else "The EPUB book is not valid.")

The EPUB book is valid.


In [None]:
def chapter_to_str(chapter):
    """Converts a chapter object into a string, filtering out non-textual elements.

    Only the text of ``<p>`` elements is kept. Within each paragraph, runs of
    whitespace (including line breaks from the markup) are collapsed to single
    spaces, and paragraphs that contain no text are dropped.

    Args:
        chapter: An item exposing ``get_body_content()`` that returns the
            chapter's HTML body.

    Returns:
        The paragraph texts joined by single spaces; an empty string when the
        chapter has no non-empty ``<p>`` elements.
    """
    soup = BeautifulSoup(chapter.get_body_content(), 'html.parser')
    # Filtering out script and style elements
    for script_or_style in soup(["script", "style"]):
        script_or_style.decompose()

    paragraphs = []
    for para in soup.find_all('p'):
        # Normalise whitespace on the paragraph's full text instead of using
        # get_text(strip=True): that variant strips every text node before
        # concatenating them, which glues words together across inline tags
        # (e.g. "Hello <i>world</i>" would become "Helloworld").
        para_text = ' '.join(para.get_text().split())
        if para_text:
            paragraphs.append(para_text)
    return ' '.join(paragraphs)

In [None]:
def is_wrapper(id):
    """
    Tell whether an item id looks like the Gutenberg cover wrapper.

    The id counts as a wrapper when it contains "coverpage" or "wrapper"
    anywhere, compared case-insensitively.
    """
    for marker in (r'coverpage', r'wrapper'):
        if re.search(marker, id, re.IGNORECASE) is not None:
            return True
    return False

In [None]:
def is_gutenberg_intro(item_content):
    """
    Tell whether a piece of content looks like Project Gutenberg front matter.

    Returns True as soon as any of the known phrases occurs in the text
    (case-insensitive), False when none of them does.
    """
    # Phrases that commonly appear in the Gutenberg introduction
    gutenberg_phrases = (
        r'Project Gutenberg',
        r'\bEBook\b',
        r'\blicense\b',
        r'eBook or online at',
        r'This eBook is for the use of',
    )
    # A single alternation matches exactly when at least one phrase matches.
    combined = '|'.join('(?:%s)' % phrase for phrase in gutenberg_phrases)
    return re.search(combined, item_content, re.IGNORECASE) is not None

In [None]:
import re
from bs4 import BeautifulSoup
# Preview pass: walk the spine in reading order and show the first 500
# characters of every HTML document that is not the cover wrapper.
for spine_item in book.spine:
    # A spine entry is either a bare id or a tuple whose first element is the id
    item_id = spine_item if not isinstance(spine_item, tuple) else spine_item[0]
    item = book.get_item_with_id(item_id)

    if not isinstance(item, epub.EpubHtml) or is_wrapper(item.get_id()):
        continue

    chapter_text = chapter_to_str(item)
    if not chapter_text.strip():  # nothing but whitespace was extracted
        print(f"Chapter ID: {item.get_id()} contains no readable content.")
    else:
        print(f"Chapter ID: {item.get_id()}\nText:\n{chapter_text[:500]}...\n")

In [None]:
# Collect the text of every readable chapter, in spine (reading) order.
filtered_chapters = []

for spine_item in book.spine:
    # A spine entry is either a bare id or a tuple whose first element is the id
    item_id = spine_item[0] if isinstance(spine_item, tuple) else spine_item
    item = book.get_item_with_id(item_id)

    if not isinstance(item, epub.EpubHtml) or is_wrapper(item.get_id()):
        # `item` is None when the spine id has no matching item, so report the
        # spine id itself; calling item.get_id() here would raise AttributeError.
        print(f"Item ID: {item_id} skipped as non-content.")
        continue

    chapter_text = chapter_to_str(item)
    if chapter_text.strip():
        filtered_chapters.append(chapter_text)
        print(f"Content from Item ID: {item.get_id()} added.")
    else:
        print(f"Item ID: {item.get_id()} contains no readable content.")


In [None]:
# Sanity check: how many chapters were kept, plus a peek at the first three.
print("Number of filtered chapters:", len(filtered_chapters))
for chapter in filtered_chapters[:3]:
    preview = chapter[:500]  # the first 500 characters are enough for a review
    print(preview, "...")


In [None]:
# Merge all kept chapters into one text, with a blank line between chapters.
chapter_separator = "\n\n"
combined_text = chapter_separator.join(filtered_chapters)

In [None]:
# Save the merged book text to Drive as UTF-8 (an existing file is overwritten).
output_path = '/content/drive/My Drive/TORI_dataset/TORI_Book_blue_castle.txt'

with open(output_path, mode='w', encoding='utf-8') as file:
    file.write(combined_text)

## BookNLP

In [None]:
pip install booknlp

In [None]:
# Download spaCy's small English model (en_core_web_sm).
# NOTE(review): presumably required by the BookNLP pipeline run below -- confirm.
!python -m spacy download en_core_web_sm

In [None]:
from booknlp.booknlp import BookNLP

# Plain-text book to analyse, the folder that receives the results, and the
# identifier used to name the result files.
input_file = "/content/drive/My Drive/TORI_dataset/TORI_Book_blue_castle.txt"
output_directory = "/content/drive/My Drive/booknlp_output/TORI_blue_castle_BNLP"
book_id = "TORI_blue_castle"

# Pipeline stages to run and the model size to use
model_params = dict(
    pipeline="entity,quote,supersense,event,coref",
    model="big",
)

# Build the English BookNLP pipeline and process the book
booknlp = BookNLP("en", model_params)
booknlp.process(input_file, output_directory, book_id)

In [None]:
import json
from collections import Counter

In [None]:
def proc(filename, base_dir='/content/drive/My Drive'):
    """Load a JSON file stored under a base directory.

    Args:
        filename: Path of the JSON file relative to ``base_dir``.
        base_dir: Directory the relative path is resolved against. Defaults to
            the Google Drive folder used throughout this notebook.

    Returns:
        The parsed JSON content.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    full_path = f"{base_dir.rstrip('/')}/{filename}"

    # Read explicitly as UTF-8 so non-ASCII text in the file decodes the same
    # way regardless of the platform's default encoding.
    with open(full_path, encoding='utf-8') as file:
        return json.load(file)

In [None]:
# Load the .book JSON that BookNLP produced for this book. The folder and file
# prefix must match the output directory and book id passed to
# booknlp.process() ("TORI_blue_castle_BNLP" / "TORI_blue_castle").
data = proc("booknlp_output/TORI_blue_castle_BNLP/TORI_blue_castle.book")

In [None]:
def get_counter_from_dependency_list(dep_list):
    """Count how often each term occurs in a list of token records.

    Args:
        dep_list: Iterable of dicts, each holding the term under the key "w".
            Other keys are ignored.

    Returns:
        A ``Counter`` mapping each term to its number of occurrences (empty
        when ``dep_list`` is empty).
    """
    return Counter(token["w"] for token in dep_list)

In [None]:
# Number of most frequent terms to print per dependency relation
printTop=20

# Print every character that has at least one proper-name mention, followed by
# the most frequent terms attached to it under each dependency relation.
for character in data["characters"]:
    character_id=character["id"]
    count=character["count"]

    # Reset for every character. The printed name must be initialised here:
    # otherwise a character with unknown gender raises NameError (first
    # iteration) or shows the gender of the previous character.
    referential_gender_distribution=referential_gender="unknown"

    if character["g"] is not None and character["g"] != "unknown":
        referential_gender_distribution=character["g"]["inference"]
        referential_gender=character["g"]["argmax"]

    proper_mentions=character["mentions"]["proper"]

    # Characters without any proper name are not reported
    if len(proper_mentions) == 0:
        continue

    # First entry of the proper-mention list
    # (presumably the most frequent name -- verify against the BookNLP output format)
    max_proper_mention=proper_mentions[0]["n"]

    print(character_id, count, max_proper_mention, referential_gender)
    print()

    # The relation name is both the key in the character record and the label
    # printed in front of each term.
    for relation in ("poss", "agent", "patient", "mod"):
        for k, v in get_counter_from_dependency_list(character[relation]).most_common(printTop):
            print("\t%s\t%s %s" % (relation, v, k))
        print()