In [93]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import matplotlib.pyplot as plt
from pyvis.network import Network
import plotly.express as px
from community import community_louvain
import os
import re 


## spaCy model 

In [95]:
# Load English language spaCy model 
NER = spacy.load("en_core_web_sm")
# Get all book 'txt' files 
books_LOTR = [file for file in os.scandir('data') if '.txt' in file.name]
book_third = books_LOTR[1]
print(books_LOTR)

# Read book
with open(book_third, 'r', encoding='latin-1') as f:
    book_text = f.read()

# The model processes the text and identifies any named entities such as persons, organizations, locations, etc.
book_NER = NER(book_text)

[<DirEntry 'The Fellowship of the Ring.txt'>, <DirEntry 'The Return of the King.txt'>, <DirEntry 'The Two Towers.txt'>]


## Get Characters Data

In [None]:
# Read data with characters
characters = pd.read_csv("CharactersLOTR.csv")
# Removes rows where the 'character' column starts with 'Category:'
characters = characters[~characters['character'].str.startswith('Category:')]
# Remove brackets and text within brackets
characters['character'] = characters['character'].apply(lambda x: re.sub("[\(].*?[\)]", "", x)) 
# Remove any text within parentheses in each row of the 'character' column
characters['character_firstname'] = characters['character'].str.split(n=1).str[0]
characters.at[20, 'character_firstname'] = characters.at[20, 'character_firstname'].replace("Samwise", "Sam")
characters

Unnamed: 0,character,character_firstname
0,Aragorn II,Aragorn
1,Arwen,Arwen
2,Bilbo Baggins,Bilbo
3,Frodo Baggins,Frodo
4,Beregond,Beregond
5,Bergil,Bergil
6,Fredegar Bolger,Fredegar
7,Tom Bombadil,Tom
8,Boromir,Boromir
9,Meriadoc Brandybuck,Meriadoc


In [None]:
sent_ent = []
# Store the sentence text along with the list of named entities
for sentence in book_NER.sents:
    entity_list = []
    for entity in sentence.ents:
        entity_list.append(entity.text)
    sent_ent.append({"sentence": sentence, "entities": entity_list})
sent_ent_df = pd.DataFrame(sent_ent)

In [None]:
# Function to filter out entities with non-characters
def filter(ent_list, character_df):
    character_set = set(character_df['character'])
    first_name_set = set(character_df['character_firstname'])
    filtered_entities = []
    for ent in ent_list:
        if ent in character_set or ent in first_name_set:
            filtered_entities.append(ent)
    return filtered_entities


# Filter entities based on characters
sent_ent_df['character_entities'] = sent_ent_df['entities'].apply(lambda x: filter(x, characters))
# Filter rows with non-empty character_entities
sent_ent_filtered = sent_ent_df[sent_ent_df['character_entities'].apply(lambda x: len(x) > 0)]
# Extract first word from each entity
sent_ent_filtered['character_entities'] = sent_ent_filtered['character_entities'].apply(lambda x: [item.split()[0] for item in x ])





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
# Identify relationships by checking if characters occur by 4 sentences
size = 4
relationships_list = []


for i in range(sent_ent_filtered.index[-1]):
    end = min(i+size, sent_ent_filtered.index[-1])
    characters_list = sum((sent_ent_filtered.loc[i: end].character_entities), [])
    
    # List of unique characters
    characters_unique = []
    for i in range(len(characters_list)):
        if i == 0 or characters_list[i] != characters_list[i - 1]:
            characters_unique.append(characters_list[i])
    # If number of characters is more than 1
    if len(characters_unique) > 1:
        for x, source in enumerate(characters_unique[:-1]):
            target = characters_unique[x + 1]
            relationships_list.append({"source": source, "target": target})

relationships_df = pd.DataFrame(relationships_list)
# Value of relationships
relationships_df["value"] = 1
relationships_df = relationships_df.groupby(["source","target"], sort=False, as_index=False).sum()
    

## Create a graph

In [None]:
G = nx.from_pandas_edgelist(relationships_df, 
                            source = "source", 
                            target = "target", 
                            edge_attr = "value", 
                            create_using = nx.Graph())

In [None]:
# Network Graph
net = Network(notebook = True, width="1350px", height="700px", bgcolor='#0f0f0f', font_color='white')
node_degree = dict(G.degree)
nx.set_node_attributes(G, node_degree, 'size')
net.from_nx(G)
net.show("LordOfTheRingsTheReturnoftheKing.html")

LordOfTheRings2.html


## Centrality

In [None]:
degree= nx.degree_centrality(G)
degree_df = pd.DataFrame.from_dict(degree, orient='index', columns=['centrality'])
# Plot top 10 nodes
top_nodes = degree_df.sort_values('centrality', ascending=False).head(10)
fig = px.bar(top_nodes, x=top_nodes.index, y='centrality')
fig.show()

# Betweenness centrality


In [None]:
# Betweenness centrality
betweenness = nx.betweenness_centrality(G)
betweenness_df = pd.DataFrame.from_dict(betweenness, orient='index', columns=['centrality'])
# Plot top 10 nodes
top_nodes = betweenness_df.sort_values('centrality', ascending=False).head(10)
fig_betweenness = px.bar(top_nodes, x=top_nodes.index, y='centrality')
fig_betweenness.show()

## Closeness centrality


In [None]:
# Closeness centrality
closeness = nx.closeness_centrality(G)
closeness_df = pd.DataFrame.from_dict(closeness, orient='index', columns=['centrality'])
# Plot top 10 nodes
top_nodes = closeness_df.sort_values('centrality', ascending=False).head(10)
fig_closeness= px.bar(top_nodes, x=top_nodes.index, y='centrality')
fig_closeness.show()

In [None]:
best_partition = community_louvain.best_partition(G)
nx.set_node_attributes(G, best_partition, 'group')
com_net = Network(notebook = True, width="1350px", height="700px", bgcolor='#222222', font_color='white')
com_net.from_nx(G)
com_net.show("LOTR3_communities.html")

LOTR_communities2.html
