In [None]:
from collections import Counter, deque

import spacy
from nltk import sent_tokenize

In [None]:
!python -m spacy download en_core_web_trf

In [None]:
def load_model(model_name="en_core_web_trf"):
    """Load a spaCy pipeline by name.

    Args:
        model_name: Name of an installed spaCy pipeline package. Defaults to
            "en_core_web_trf".

    Returns:
        The object returned by ``spacy.load(model_name)``.
    """
    return spacy.load(model_name)

In [None]:
# Load the pipeline once; get_ners_inference reads this module-level nlp_model.
nlp_model = load_model()

In [None]:
import os
import sys
import pathlib

# Path() is the current directory and its .parent is still ".", so resolving
# Path() alone yields the same absolute working-directory path.
folder_path = pathlib.Path().resolve()

# Put the directory above the working directory on the import path so that
# `utils` (expected to live there) can be imported. The membership check keeps
# sys.path from growing every time this cell is re-run.
parent_dir = str(folder_path.parent)
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from utils import load_subtitles_dataset

In [None]:
# Location of the subtitles dataset, relative to the working directory.
dataset_path = "../data/Subtitles/"
# load_subtitles_dataset comes from the project's utils module; later cells
# use df as a DataFrame with a 'script' column.
df = load_subtitles_dataset(dataset_path)

In [None]:
df.head()

In [None]:
# Take the 'script' value of the first row as a sample to experiment on.
sample_script = df.iloc[0]['script']
sample_script

In [None]:
# Split the sample script into sentences with NLTK's sent_tokenize.
sentences = sent_tokenize(sample_script)

In [None]:
# Slice from a fresh tokenisation rather than from `sentences` itself, so
# re-running this cell always gives sentences 60-89 of the sample instead of
# slicing an already-sliced list down to nothing.
sentences = sent_tokenize(sample_script)[60:90]

In [None]:
# sent_tokenize keeps each sentence's own closing punctuation, so join with a
# space; joining with "." would glue sentences together as "end..Next" with no
# whitespace between them.
sentence = " ".join(sentences)

In [None]:
sentence

In [None]:
# Run the spaCy pipeline on the joined sample text.
doc = nlp_model(sentence)

In [None]:
doc.ents

In [None]:
# Show every entity span found in the sample together with its label.
for entity in doc.ents:
    print(entity, entity.label_)

In [None]:
def get_ners_inference(script):
    """Extract the first names of PERSON entities from each sentence of a script.

    Args:
        script: Text of one script; it is split into sentences with
            ``sent_tokenize``.

    Returns:
        A list with one set per sentence, in sentence order. Each set holds the
        first whitespace-separated token of every entity labelled "PERSON" in
        that sentence; sentences without such entities give an empty set.
    """
    script_sentences = sent_tokenize(script)

    ner_output = []

    # nlp_model.pipe processes the sentences in batches and yields one Doc per
    # sentence in input order, instead of one model call per sentence.
    for doc in nlp_model.pipe(script_sentences):
        ners = set()
        for entity in doc.ents:
            if entity.label_ != "PERSON":
                continue
            # split() without an argument breaks on any whitespace (a name may
            # be wrapped across a line break) and never yields empty tokens,
            # so an empty string can no longer end up in the result.
            name_parts = entity.text.split()
            if name_parts:
                ners.add(name_parts[0])
        ner_output.append(ners)

    return ner_output

In [None]:
# Work on an independent copy of the first 10 rows: adding a column to the
# bare head() slice later would raise pandas' SettingWithCopyWarning.
df = df.head(10).copy()

In [None]:
df

In [None]:
# Run NER over every script; each cell of 'ners' becomes a list holding one
# set of names per sentence.
df['ners'] = df['script'].apply(get_ners_inference)

In [None]:
df

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from pyvis.network import Network

In [None]:
def generate_character_network(df, window_size=10):
    """Count how often pairs of entities are mentioned close to each other.

    Each cell of ``df['ners']`` is read as a sequence of per-sentence entity
    collections. For every entity in a sentence, one co-occurrence is counted
    with every *different* entity found in the last ``window_size`` sentences
    (the current sentence included). Repeated mentions inside the window count
    repeatedly, and a pair sharing a sentence is counted once from each side.

    Args:
        df: DataFrame with a 'ners' column; each cell is an iterable of
            per-sentence iterables of entity names.
        window_size: Number of most recent sentences (including the current
            one) to pair against. Must be at least 1. Defaults to 10.

    Returns:
        DataFrame with columns 'source', 'target' and 'value', where each pair
        is stored in sorted order (source before target) and 'value' is the
        pair's count. Rows are sorted by 'value' in descending order, ties
        ordered by source, then target. When no pair is found the frame is
        empty but keeps the same columns.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")

    pair_counts = Counter()

    for row in df['ners']:
        # deque(maxlen=...) drops the oldest sentence automatically; the window
        # restarts for every row, so pairs never span two rows.
        recent_sentences = deque(maxlen=window_size)

        for sentence in row:
            recent_sentences.append(list(sentence))

            for entity in sentence:
                for window_sentence in recent_sentences:
                    for entity_in_window in window_sentence:
                        if entity != entity_in_window:
                            pair = tuple(sorted((entity, entity_in_window)))
                            pair_counts[pair] += 1

    # Emit pairs in (source, target) order so the row index matches what a
    # groupby on those two columns would produce.
    records = [
        (source, target, count)
        for (source, target), count in sorted(pair_counts.items())
    ]
    relationship_df = pd.DataFrame(records, columns=['source', 'target', 'value'])
    # A stable sort keeps equal counts in (source, target) order, which makes
    # the row order deterministic.
    relationship_df = relationship_df.sort_values('value', ascending=False, kind='stable')

    return relationship_df



In [None]:
# Build the pair-count table from the 'ners' column computed above.
relationship_df = generate_character_network(df)

In [None]:
relationship_df

In [None]:
# Keep only the 200 pairs with the highest counts.
relationship_df = relationship_df.sort_values('value', ascending=False).head(200)

In [None]:
# Undirected graph with one edge per (source, target) row; each edge carries
# the row's count in its 'value' attribute.
G = nx.from_pandas_edgelist(
    relationship_df,
    source='source',
    target='target',
    edge_attr='value',
    create_using=nx.Graph(),
)

# Store every node's degree as its 'size' attribute before the graph is
# handed to pyvis.
node_degree = dict(G.degree)
nx.set_node_attributes(G, node_degree, 'size')

# Render the graph as an interactive HTML page.
net = Network(
    notebook=True,
    width="1000px",
    height="700px",
    bgcolor="#222222",
    font_color="white",
    cdn_resources="remote",
)
net.from_nx(G)
net.show("naruto.html")
