# Literary Universe Mapping for the Foundation Trilogy by Isaac Asimov

This project aims to develop a web tool allowing users to see the rela- tionships between the characters, locations, events, and sentiments in Isaac Asimov’s Foundation Trilogy.

This notebook performs the preliminary natural language processes in order to find the necessary information to visualise the trilogy.

## Imports

In [1]:
!pip install pymupdf
!pip install python-crfsuite
!pip install sklearn-crfsuite
!pip install num2words
!pip install svgling
!pip install accelerate -U
!pip install datasets

Collecting pymupdf
  Downloading PyMuPDF-1.24.4-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.24.3 (from pymupdf)
  Downloading PyMuPDFb-1.24.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m65.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, pymupdf
Successfully installed PyMuPDFb-1.24.3 pymupdf-1.24.4
Collecting python-crfsuite
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite
Successfully installed python-crfsuite-0.9.10
Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.3.

In [2]:
# Imports
import fitz # PyMuPDF
import re
import os
import math
from datasets import Dataset
from num2words import num2words
import pycrfsuite
from transformers import pipeline, BertTokenizer, BertModel, AutoModelForTokenClassification, AutoTokenizer
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer, sent_tokenize
from nltk.corpus import words, stopwords, treebank, wordnet
from nltk.tag import CRFTagger, StanfordNERTagger
import string
from collections import Counter, defaultdict
from nltk import RegexpParser
from nltk.chunk import ne_chunk
import torch
from nltk.draw.tree import TreeView
from IPython.display import Image
import svgling
nltk.download()
nltk.download('words')
nltk.download('maxent_ne_chunker')
nltk_words = set(words.words())


NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> all


    Downloading collection 'all'
       | 
       | Downloading package abc to /root/nltk_data...
       |   Unzipping corpora/abc.zip.
       | Downloading package alpino to /root/nltk_data...
       |   Unzipping corpora/alpino.zip.
       | Downloading package averaged_perceptron_tagger to
       |     /root/nltk_data...
       |   Unzipping taggers/averaged_perceptron_tagger.zip.
       | Downloading package averaged_perceptron_tagger_ru to
       |     /root/nltk_data...
       |   Unzipping taggers/averaged_perceptron_tagger_ru.zip.
       | Downloading package basque_grammars to /root/nltk_data...
       |   Unzipping grammars/basque_grammars.zip.
       | Downloading package bcp47 to /root/nltk_data...
       | Downloading package biocreative_ppi to /root/nltk_data...
       |   Unzipping corpora/biocreative_ppi.zip.
       | Downloading package bllip_wsj_no_aux to /root/nltk_data...
       |   Unzipping models/bllip_wsj_no_aux.zip.
       | Downloading package book_grammars to


---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


## Preprocessing

This section cleans the text and prepares the trilogy for natural language processing. The books are sperated into chapters, they are tokenized and POS taggged, numbers are converted into words, stop words are removed, and they are lemmatized.

### Helper Functions

In [3]:
def extract_pages_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    pages = []
    for page in doc:
        pages.append(page.get_text())
    doc.close()
    return pages

def extract_chapters(text, headers):
    # Dictionary to store results
    results = {}

    # Loop through the list of headers, stopping at the second last item
    for i in range(len(headers) - 1):
        sub1 = headers[i]
        sub2 = headers[i + 1]
        try:
            # Extracting the portion of the string after the first occurrence of 'sub1'
            after_sub1 = text.split(sub1, 1)[1]

            # Extracting the portion of the string before the first occurrence of 'sub2' in the substring
            result = after_sub1.split(sub2, 1)[0]

            # Removing any leading/trailing whitespaces and storing the result in the dictionary with sub1 as the key
            results[sub1] = convert_numbers_to_words(result.strip())

        except IndexError:
            # Handling the case where the splits do not work as expected (e.g., 'sub1' or 'sub2' not found)
            results[sub1] = None  # Optionally use an empty string '' or a specific message like "No valid text found."

    # Capture the remaining text after the last header if it exists in the text
    if headers and headers[-1] in text:
        last_header_index = text.index(headers[-1]) + len(headers[-1])
        remaining_text = text[last_header_index:]
        results[headers[-1]] = convert_numbers_to_words(remaining_text.strip())

    return results

def remove_substring(original_string, substring_to_remove):
    # Replace the specified substring with an empty string
    modified_string = original_string.replace(substring_to_remove, "")
    return modified_string

def remove_escape_chr(text):
    cleaned_text = re.sub(r'\n+', ' ', text)  # Remove escape characters
    return cleaned_text

def convert_numbers_to_words(text):
    # Function to replace each match
    def replace_with_words(match):
        number = int(match.group())
        return num2words(number)

    # Replacing all occurrences of numbers in the text with their word equivalents
    cleaned_text = re.sub(r'\b\d+\b', replace_with_words, text)
    return cleaned_text

def remove_punct(text):
    clean_text = re.sub(r"[^0-9A-Za-z ]", "", text) # Remove punctuation including apostrophes
    return clean_text

def convert_to_lowercase(text):
    lowercased_text = text.lower()

    return lowercased_text

def join_pages(text):
    return ' '.join(text)

def treebank_tokenize(text):
    return TreebankWordTokenizer().tokenize(text)

def get_pos_tags(crftagger, tokenized_text):
    return crftagger.tag(tokenized_text)

def lemmatize(tokens_with_tags):

    def get_wordnet_pos(treebank_tag):
        # Mapping from CRF/Penn Treebank tags to WordNet tags
        tag_dict = {
            'NN': wordnet.NOUN, 'NNS': wordnet.NOUN, 'NNP': wordnet.NOUN, 'NNPS': wordnet.NOUN,
            'VB': wordnet.VERB, 'VBD': wordnet.VERB, 'VBG': wordnet.VERB, 'VBN': wordnet.VERB, 'VBP': wordnet.VERB, 'VBZ': wordnet.VERB,
            'JJ': wordnet.ADJ, 'JJR': wordnet.ADJ, 'JJS': wordnet.ADJ,
            'RB': wordnet.ADV, 'RBR': wordnet.ADV, 'RBS': wordnet.ADV,
        }
        # Default to NOUN if no match found
        return tag_dict.get(treebank_tag[:2], wordnet.NOUN)

    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = []

    for word, tag in tokens_with_tags:
      wordnet_tag = get_wordnet_pos(tag)
      lemmatized_token = lemmatizer.lemmatize(word, pos=wordnet_tag)
      lemmatized_tokens.append(lemmatized_token)

    return lemmatized_tokens


### Load POS tagger model

In [4]:
taggerCRF = CRFTagger(verbose=True)
try:
    taggerCRF.set_model_file('model.crf.tagger')
except:
    train_data = treebank.tagged_sents()
    taggerCRF.train(train_data, 'model.crf.tagger')

### Process trilogy

In [5]:
# Extract the pdf by pages
trilogy_pages = extract_pages_from_pdf("the_foundation.pdf")

# Remove escape characters
for i, page in enumerate(trilogy_pages):
    trilogy_pages[i] = remove_escape_chr(page)

# Split up the book into chapters and join the pages
# It is known that the cover page of each book start at page 8, 168, and 331 respectively
books = []
books.append(join_pages(trilogy_pages[8:167]))
books.append(join_pages(trilogy_pages[167:331]))
books.append(join_pages(trilogy_pages[331:]))

# Now remove the contents of each book for easier processing
books[0] = remove_substring(books[0], 'Contents Introduction Part I The Psychohistorians Part II The Encyclopedists Part III The Mayors Part IV The Traders Part V The Merchant Princes ',)
books[1] = remove_substring(books[1], 'Contents PROLOGUE  PART I THE GENERAL 1. SEARCH FOR MAGICIANS 2. THE MAGICIANS 3. THE DEAD HAND 4. THE EMPEROR 5. THE WAR BEGINS 6. THE FAVORITE 7. BRIBERY 8. TO TRANTOR 9. ON TRANTOR 10. THE WAR ENDS PART II THE MULE 11. BRIDE AND GROOM 12. CAPTAIN AND MAYOR 13. LIEUTENANT AND CLOWN 14. THE MUTANT 15. THE PSYCHOLOGIST 16. CONFERENCE 17. THE VISI-SONOR 18. FALL OF THE FOUNDATION 19. START OF THE SEARCH 20. CONSPIRATOR 21. INTERLUDE IN SPACE 22. DEATH ON NEOTRANTOR 23. THE RUINS OF TRANTOR  24. CONVERT 25. DEATH OF A PSYCHOLOGIST 26. END OF THE SEARCH ')
books[2] = remove_substring(books[2], 'Contents PROLOGUE PART I SEARCH BY THE MULE 1. TWO MEN AND THE MULE First Interlude 2. TWO MEN WITHOUT THE MULE Second Interlude 3. TWO MEN AND A PEASANT Third Interlude 4. TWO MEN AND THE ELDERS Fourth Interlude 5. ONE MAN AND THE MULE 6. ONE MAN, THE MULE – AND ANOTHER Last Interlude PART II SEARCH BY THE FOUNDATION 7. ARCADIA 8. SELDON\'S PLAN 9. THE CONSPIRATORS 10. APPROACHING CRISIS 11. STOWAWAY 12. LORD  13. LADY 14. ANXIETY 15. THROUGH THE GRID 16. BEGINNING OF WAR 17. WAR 18. GHOST OF A WORLD 19. END OF WAR 20. "I KNOW ..." 21. THE ANSWER THAT SATISFIED 22. THE ANSWER THAT WAS TRUE ')

# List for storing chapter headers
chapter_headers = [
                ['THE STORY BEHIND THE "FOUNDATION" By ISAAC ASIMOV','PART I THE PSYCHOHISTORIANS', 'PART II THE ENCYCLOPEDISTS',
                    'PART III THE MAYORS', 'PART IV THE TRADERS', 'PART V THE MERCHANT PRINCES', 'ABOUT THE AUTHOR'],
                ['PROLOGUE', '1. SEARCH FOR MAGICIANS', '2. THE MAGICIANS','3. THE DEAD HAND', '4. THE EMPEROR', '5. THE WAR BEGINS',
                 '6. THE FAVORITE', '7. BRIBERY', '8. TO TRANTOR', '9. ON TRANTOR', '10. THE WAR ENDS', '11. BRIDE AND GROOM',
                 '12. CAPTAIN AND MAYOR', '13. LIEUTENANT AND CLOWN', '14. THE MUTANT', '15. THE PSYCHOLOGIST', '16. CONFERENCE',
                 '17. THE VISI-SONOR', '18. FALL OF THE FOUNDATION', '19. START OF THE SEARCH', '20. CONSPIRATOR', '21. INTERLUDE IN SPACE',
                 '22. DEATH ON NEOTRANTOR', '23. THE RUINS OF TRANTOR', '24. CONVERT', '25. DEATH OF A PSYCHOLOGIST', '26. END OF THE SEARCH'],
                ['Prologue', '1 Two Men and the Mule', '2 Two Men without the Mule', '3 Two Men and a Peasant','4  Two Men and the Elders',
                 '5 One Man and the Mule', '6 One Man, the Mule – and Another', '7 Arcadia', '8 Seldon\'s Plan', '9 The Conspirators',
                 '10 Approaching Crisis', '11 Stowaway', '12 Lord', '13 Lady', '14  Anxiety', '15 Through the Grid', '16  Beginning of War',
                 '17 War', '18 Ghost of a World', '19 End of War', '20  "I Know ..."', '21 The Answer That Satisfied','22 The Answer That Was True']
                   ]

# Now split the book up into chapters.
for i, book in enumerate(books):
    books[i] = extract_chapters(book, chapter_headers[i]) # Extract chapters per book

stop_words = set(stopwords.words('english')) # Stop words
tokenized_books = [[] for _ in range(len(books))]
clean_tokenized_books = [[] for _ in range(len(books))]
pos_tokenized_books = [[] for _ in range(len(books))]
lemmatized_books = [[] for _ in range(len(books))]
clean_lemmatized_books = [[] for _ in range(len(books))]

for i, book in enumerate(books):
    for chapter in book.values():
        t = treebank_tokenize(chapter) # Tokenize the chapter
        t_pos = get_pos_tags(taggerCRF,t) # Get the POS tags
        t_punct = treebank_tokenize(remove_punct(convert_to_lowercase(chapter))) # Tokenize convert to lowercase and remove punctuation of chapter
        t_stwd = [i for i in t_punct if i.lower() not in stop_words] # Remove stop words
        t_stwd_pos = get_pos_tags(taggerCRF,t_stwd)

        # Store them in lists
        tokenized_books[i].append(t) # Unclean tokenized chapters
        clean_tokenized_books[i].append(t_stwd) # Cleaned tokenized chapters
        pos_tokenized_books[i].append(t_pos) # Tokens with POS tags
        lemmatized_books[i].append(lemmatize(t_pos)) # Lemmatized unlceaned chapters
        clean_lemmatized_books[i].append(lemmatize(t_stwd_pos)) # Lemmatized cleaned chapters

In [6]:
# Create a Vocabulary
vocabulary = []
for book in clean_lemmatized_books:
    for chapter in book:
        vocabulary.append(sorted(set(chapter))) # Create vocabulary for each book

# Extract the unique strings from a nested list of vocabularies for each book.
unique_set = set()
for book_vocab in vocabulary:
    unique_set.update(book_vocab)
vocabulary[:] = list(unique_set)

## Named Entity Recognition (NER) with Hugging Face

### Helper Functions

In [7]:
def split_chapters_into_sentences(book):
    sentences = []
    for chapter in book.values():
        chapter_sentences = sent_tokenize(chapter)
        sentences.extend(chapter_sentences)
    return sentences

def extract_relations_and_events(tokens, entities):
    relations = []
    events = []

    # Extract named entities and their indices
    entity_indices = {entity['start']: entity for entity in entities}

    for i, token in enumerate(tokens):
        if i in entity_indices:
            entity = entity_indices[i]
            relations.append((entity['entity'], entity['word']))
            # Example: Identify simple events based on verb context (basic example)
            if i > 0 and tokens[i-1] == 'is':
                events.append((tokens[i-1], token))

    return relations, events

### NER

In [8]:
from google.colab import userdata
userdata.get('HF_TOKEN')

# Ensure CUDA is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load a pre-trained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
model.to(device)  # Move the model to GPU if available
# Create a pipeline for named entity recognition, specifying the device
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)



tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# Split books into sentences
books_sentences = [split_chapters_into_sentences(book) for book in books]

# Flatten the list of books into a single list of sentences
all_sentences = [sentence for book in books_sentences for sentence in book]

# Create a dataset from the sentences
dataset = Dataset.from_dict({"text": all_sentences})

In [None]:
# Function to apply NER to each sentence
def apply_ner(examples):
    ner_results = ner_pipeline(examples["text"])
    return {"ner_results": ner_results}

# Apply NER to the dataset
ner_results = dataset.map(apply_ner, batched=True, batch_size=8)

# # Reconstruct the results into the original structure
# classified_trilogy = []
# index = 0
# for book in books_sentences:
#     classified_chapters = []
#     for _ in book:
#         classified_chapters.append(ner_results["ner_results"][index])
#         index += 1
#     classified_trilogy.append(classified_chapters)

Map:   0%|          | 0/18376 [00:00<?, ? examples/s]

### Relationship and Event Extraction

In [None]:
# Function to extract events and relationships
def extract_events_and_relationships(ner_results, sentences):
    events = []
    relationships = []

    for entities, sentence in zip(ner_results, sentences):
        tokens = treebank_tokenize(sentence)
        pos_tags = get_pos_tags(tokens)
        chunks = nltk.ne_chunk(pos_tags)

        # Extract entities and map them to their types
        entity_dict = defaultdict(list)
        for entity in entities:
            entity_type = entity['entity']
            entity_word = entity['word']
            entity_dict[entity_type].append(entity_word)

        # Extract relationships based on patterns
        for subtree in chunks:
            if isinstance(subtree, nltk.Tree):
                label = subtree.label()
                leaves = [word for word, pos in subtree.leaves()]
                if label in entity_dict:
                    for entity_type, entity_words in entity_dict.items():
                        if any(word in entity_words for word in leaves):
                            relationships.append((label, leaves, entity_type, entity_words))

        # Extract simple events (basic example using verbs)
        for i, (word, pos) in enumerate(pos_tags):
            if pos.startswith('VB'):
                subject = None
                object_ = None
                if i > 0:
                    subject = pos_tags[i-1][0]
                if i < len(pos_tags) - 1:
                    object_ = pos_tags[i+1][0]
                events.append((word, subject, object_))

    return events, relationships

In [None]:
# Extract events and relationships
all_events, all_relationships = extract_events_and_relationships(ner_results["ner_results"], all_sentences)

## Sentiment Analysis

In [92]:
# Initialize sentiment analysis pipeline
sentiment_pipeline = pipeline("sentiment-analysis")

def extract_sentiments(text):
    sentiments = sentiment_pipeline(text)
    return sentiments

# Extract sentiments for each chapter
chapter_sentiments = []
for book in books:
    book_sentiments = []
    for chapter in book.values():
        sentiments = extract_sentiments(chapter)
        book_sentiments.append(sentiments)
    chapter_sentiments.append(book_sentiments)

# Output chapter sentiments for verification
print("Chapter Sentiments:", chapter_sentiments)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (4247 > 512). Running this sequence through the model will result in indexing errors


RuntimeError: The size of tensor a (4247) must match the size of tensor b (512) at non-singleton dimension 1

## Named Entity Recognition (NER) with Stanford NER

In [None]:
# Get Stanford NER
# !wget 'https://nlp.stanford.edu/software/stanford-ner-4.2.0.zip'
# !unzip stanford-ner-4.2.0.zip

# Set the JAVAHOME environment variable (Google Colab has Java installed by default)
os.environ['JAVAHOME'] = "/usr/bin/java"

# Paths to the Stanford NER java executable and the classifier model
stanford_classifier = '/content/stanford-ner-2020-11-17/classifiers/english.all.3class.distsim.crf.ser.gz'
stanford_ner_path = '/content/stanford-ner-2020-11-17/stanford-ner.jar'

# Initialize the Stanford NER tagger
st_ner = StanfordNERTagger(stanford_classifier, stanford_ner_path, encoding='utf-8')

In [None]:
classified_trilogy = [[] for i in range(len(tokenized_book_chapters))]
for i, book in enumerate(tokenized_book_chapters):
    print("book",i+1)
    for chapter in book:
        print('chapter')
        classified_trilogy[i].append(st_ner.tag(chapter))

book 1
chapter
chapter


KeyboardInterrupt: 

In [None]:
classified_trilogy[0]

## TF-IDF

In [None]:
def calculate_document_frequency(books):
    doc_freq = Counter()
    for book in books:
        for chapter in book:
            unique_words = set(chapter)  # Remove duplicates within the chapter
            doc_freq.update(unique_words)
    return doc_freq


def calculate_tf_idf(books):
    # Calculate document frequency using Counter for optimization
    df = calculate_document_frequency(books)
    total_chapters = sum(len(book) for book in books)  # Total number of chapters across all books

    tf_idf_books = []

    for book in books:
        tf_idf_chapters = []
        for chapter in book:
            # Calculate term frequency using Counter
            term_count = Counter(chapter)
            chapter_length = len(chapter)
            tf_idf_scores = {}

            for word, count in term_count.items():
                tf = count / chapter_length
                idf = math.log10(total_chapters / df[word])
                tf_idf_scores[word] = math.log(1 + tf) * idf

            tf_idf_chapters.append(tf_idf_scores)
        tf_idf_books.append(tf_idf_chapters)

    return tf_idf_books

def get_top_tf_idf_scores(books, top_n=50):
    # Calculate TF-IDF for all books
    tf_idf_results = calculate_tf_idf(books)

    # Flatten the results to a single list with additional context
    all_scores = []
    for book_index, book in enumerate(tf_idf_results):
        for chapter_index, chapter in enumerate(book):
            for word, score in chapter.items():
                all_scores.append((book_index, chapter_index, word, score))

    # Sort by score in descending order
    sorted_scores = sorted(all_scores, key=lambda x: x[3], reverse=True)

    # Select top N scores ensuring no word duplicates
    top_scores = []
    seen_words = set()
    for score in sorted_scores:
        if score[2] not in seen_words:
            top_scores.append(score)
            seen_words.add(score[2])
            if len(top_scores) == top_n:
                break

    return top_scores

In [None]:
tf_idf_scores = get_top_idf_scores(books)

In [None]:
# Plot frequency distribution of words in each chapter of each book
for book in books:
    freq_dist = nltk.FreqDist(book)
    print(freq_dist.most_common(50))
    freq_dist.plot(500)

['grape', 'orange', 'banana', 'apple']


## Visualisations