In [13]:
import pandas as pd
import spacy
import networkx as nx
from pyvis.network import Network
import plotly.express as px
from community import community_louvain


## spaCy model 

In [14]:
# Load English language spaCy model 
NER = spacy.load("en_core_web_sm")
# Get all book 'txt' files 
books_LOTR = [file for file in os.scandir('data') if '.txt' in file.name]
book_third = books_LOTR[2]
print(books_LOTR)

# Read book
with open(book_third, 'r', encoding='latin-1') as f:
    book_text = f.read()

# The model processes the text and identifies any named entities such as persons, organizations, locations, etc.
book_NER = NER(book_text)

[<DirEntry 'The Fellowship of the Ring.txt'>, <DirEntry 'The Return of the King.txt'>, <DirEntry 'The Two Towers.txt'>]


## Get Characters Data

In [15]:
# Read data with characters
characters = pd.read_csv("CharactersLOTR.csv")
# Removes rows where the 'character' column starts with 'Category:'
characters = characters[~characters['character'].str.startswith('Category:')]
# Remove brackets and text within brackets
characters['character'] = characters['character'].apply(lambda x: re.sub("[\(].*?[\)]", "", x)) 
# Remove any text within parentheses in each row of the 'character' column
characters['character_firstname'] = characters['character'].str.split(n=1).str[0]
characters.at[9, 'character_firstname'] = characters.at[9, 'character_firstname'].replace("Meriadoc", "Merry")
characters.at[20, 'character_firstname'] = characters.at[20, 'character_firstname'].replace("Samwise", "Sam")
characters.at[45, 'character_firstname'] = characters.at[45, 'character_firstname'].replace("Peregrin", "Pippin")
characters

Unnamed: 0,character,character_firstname
0,Aragorn II,Aragorn
1,Arwen,Arwen
2,Bilbo Baggins,Bilbo
3,Frodo Baggins,Frodo
4,Beregond,Beregond
5,Bergil,Bergil
6,Fredegar Bolger,Fredegar
7,Tom Bombadil,Tom
8,Boromir,Boromir
9,Meriadoc Brandybuck,Merry


In [16]:
sent_ent = []
# Store the sentence text along with the list of named entities
for sentence in book_NER.sents:
    entity_list = []
    for entity in sentence.ents:
        entity_list.append(entity.text)
    sent_ent.append({"sentence": sentence, "entities": entity_list})
sent_ent_df = pd.DataFrame(sent_ent)
sent_ent_df

Unnamed: 0,sentence,entities
0,"(\n\n\n\n\n \n\n, THETWO, \n, TOWERS, \n, bei...","[second, RINGS, TOLKIEN, Elven, Seven, Dwarf, ..."
1,"(One, Ring, to, rule, them, all, ,, One, Ring,...","[One, One, One, the Land of Mordor, Shadows]"
2,"(SYNOPSIS, \n, This, is, the, second, part, of...","[SYNOPSIS, second, Rings]"
3,"(The, first, part, ,, The, Fellowship, of, the...","[first, The Fellowship of the Ring, Gandalf th..."
4,"(It, recounted, the, flight, of, Frodo, and, h...","[Frodo, Shire, the Black Riders of Mordor, Ara..."
...,...,...
12121,"(The, Trustees, of, the, J.R.R.Tolkien, 1967, ...","[Tolkien'(r, The J.R.R. Tolkien Estate Limited..."
12122,"(By, payment, of, the, required, fees, ,, you,...",[]
12123,"(No, part, of, this, text, may, be, reproduced...",[HarperCollins]
12124,"(ABOUT, THE, PUBLISHER, \n, Australia, \n, Har...","[PUBLISHER, Australia, HarperCollins Publisher..."


In [17]:
# Function to filter out entities with non-characters
def filter(ent_list, character_df):
    character_set = set(character_df['character'])
    first_name_set = set(character_df['character_firstname'])
    filtered_entities = []
    for ent in ent_list:
        if ent in character_set or ent in first_name_set:
            filtered_entities.append(ent)
    return filtered_entities


# Filter entities based on characters
sent_ent_df['character_entities'] = sent_ent_df['entities'].apply(lambda x: filter(x, characters))
# Filter rows with non-empty character_entities
sent_ent_filtered = sent_ent_df[sent_ent_df['character_entities'].map(len) > 0].copy()

# Extract first word from each entity
sent_ent_filtered['character_entities'] = sent_ent_filtered['character_entities'].apply(lambda x: [item.split()[0] for item in x])

sent_ent_filtered


Unnamed: 0,sentence,entities,character_entities
4,"(It, recounted, the, flight, of, Frodo, and, h...","[Frodo, Shire, the Black Riders of Mordor, Ara...","[Frodo, Elrond]"
5,"(There, was, held, the, great, Council, of, El...","[Council of Elrond, Frodo]",[Frodo]
7,"(In, this, fellowship, were, Aragorn, and, Bor...","[Aragorn, Boromir, Men, Legolas, Elven, Mirkwo...","[Aragorn, Boromir, Legolas, Gimli]"
8,"(The, Companions, journeyed, in, secret, far, ...","[Rivendell, North, Caradhras, winter, Gandalf,...",[Gandalf]
11,"(Already, they, had, become, aware, that, thei...","[Gollum, two]",[Gollum]
...,...,...,...
12102,"(Sam, heard, a, burst, of, hoarse, singing, ,,...",[Sam],[Sam]
12103,"(Gorbag, and, Shagrat, were, already, on, the,...","[Gorbag, Shagrat]","[Gorbag, Shagrat]"
12104,"(Sam, yelled, and, brandished, Sting, ,, but, ...","[Sam, Sting]",[Sam]
12111,"(Sam, hurled, himself, against, the, bolted, b...",[Sam],[Sam]


In [18]:
# Identify relationships by checking if characters occur by 4 sentences
size = 4
relationships_list = []


for i in range(sent_ent_filtered.index[-1]):
    end = min(i+size, sent_ent_filtered.index[-1])
    characters_list = sum((sent_ent_filtered.loc[i: end].character_entities), [])
    
    # List of unique characters
    characters_unique = []
    for i in range(len(characters_list)):
        if i == 0 or characters_list[i] != characters_list[i - 1]:
            characters_unique.append(characters_list[i])
    # If number of characters is more than 1
    if len(characters_unique) > 1:
        for x, source in enumerate(characters_unique[:-1]):
            target = characters_unique[x + 1]
            relationships_list.append({"source": source, "target": target})

relationships_df = pd.DataFrame(relationships_list)
# Value of relationships
relationships_df["value"] = 1
relationships_df = relationships_df.groupby(["source","target"], sort=False, as_index=False).sum()
relationships_df
    

Unnamed: 0,source,target,value
0,Frodo,Elrond,10
1,Elrond,Frodo,4
2,Frodo,Aragorn,9
3,Aragorn,Boromir,13
4,Boromir,Legolas,22
...,...,...,...
176,Shagrat,Sam,14
177,Shagrat,Gorbag,15
178,Shelob,Shagrat,1
179,Gorbag,Sam,4


## Create a graph

In [19]:
G = nx.from_pandas_edgelist(relationships_df, 
                            source = "source", 
                            target = "target", 
                            edge_attr = "value", 
                            create_using = nx.Graph())

In [20]:
# Network Graph
net = Network(notebook = True, width="1350px", height="700px", bgcolor='#0f0f0f', font_color='white')
node_degree = dict(G.degree)
nx.set_node_attributes(G, node_degree, 'size')
net.from_nx(G)
net.show("LordOfTheRings.html")

LordOfTheRings.html


## Centrality

In [21]:
degree= nx.degree_centrality(G)
degree_df = pd.DataFrame.from_dict(degree, orient='index', columns=['centrality'])
# Plot top 10 nodes
top_nodes = degree_df.sort_values('centrality', ascending=False).head(10)
fig = px.bar(top_nodes, x=top_nodes.index, y='centrality')
fig.show()

# Betweenness centrality


In [22]:
# Betweenness centrality
betweenness = nx.betweenness_centrality(G)
betweenness_df = pd.DataFrame.from_dict(betweenness, orient='index', columns=['centrality'])
# Plot top 10 nodes
top_nodes = betweenness_df.sort_values('centrality', ascending=False).head(10)
fig_betweenness = px.bar(top_nodes, x=top_nodes.index, y='centrality')
fig_betweenness.show()

## Closeness centrality


In [23]:
# Closeness centrality
closeness = nx.closeness_centrality(G)
closeness_df = pd.DataFrame.from_dict(closeness, orient='index', columns=['centrality'])
# Plot top 10 nodes
top_nodes = closeness_df.sort_values('centrality', ascending=False).head(10)
fig_closeness= px.bar(top_nodes, x=top_nodes.index, y='centrality')
fig_closeness.show()

In [24]:
best_partition = community_louvain.best_partition(G)
nx.set_node_attributes(G, best_partition, 'group')
com_net = Network(notebook = True, width="1350px", height="700px", bgcolor='#222222', font_color='white')
com_net.from_nx(G)
com_net.show("LOTR_communities.html")


LOTR_communities.html
