# Named Entity Recognition

In [50]:
import spacy
from nltk import sent_tokenize
from glob import glob
import pandas as pd
import os
import sys
import pathlib
import matplotlib.pyplot as plt
import networkx as nx
from pyvis.network import Network

In [3]:
!python -m spacy download en_core_web_trf

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-trf==3.7.3
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.3/en_core_web_trf-3.7.3-py3-none-any.whl (457.4 MB)
     ---------------------------------------- 0.0/457.4 MB ? eta -:--:--
     -------------------------------------- 0.0/457.4 MB 991.0 kB/s eta 0:07:42
     -------------------------------------- 0.0/457.4 MB 991.0 kB/s eta 0:07:42
     -------------------------------------- 0.1/457.4 MB 469.7 kB/s eta 0:16:14
     -------------------------------------- 0.1/457.4 MB 656.4 kB/s eta 0:11:37
     -------------------------------------- 0.1/457.4 MB 554.9 kB/s eta 0:13:45
     -------------------------------------- 0.2/457.4 MB 654.6 kB/s eta 0:11:39
     -------------------------------------- 0.2/457.4 MB 615.9 kB/s eta 0:12:23
     -------------------------------------- 0.3/457.4 MB 791.9 kB/s eta 0:09:38
     ------------------------


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip



     ---------------- ------------------- 212.0/457.4 MB 642.5 kB/s eta 0:06:22
     ---------------- ------------------- 212.0/457.4 MB 643.1 kB/s eta 0:06:22
     ---------------- ------------------- 212.0/457.4 MB 643.1 kB/s eta 0:06:22
     ---------------- ------------------- 212.0/457.4 MB 643.1 kB/s eta 0:06:22
     ---------------- ------------------- 212.1/457.4 MB 640.6 kB/s eta 0:06:23
     ---------------- ------------------- 212.1/457.4 MB 640.6 kB/s eta 0:06:23
     ---------------- ------------------- 212.2/457.4 MB 642.5 kB/s eta 0:06:22
     ---------------- ------------------- 212.2/457.4 MB 643.8 kB/s eta 0:06:21
     ---------------- ------------------- 212.2/457.4 MB 643.7 kB/s eta 0:06:21
     ---------------- ------------------- 212.2/457.4 MB 643.1 kB/s eta 0:06:22
     ---------------- ------------------- 212.3/457.4 MB 643.8 kB/s eta 0:06:21
     ---------------- ------------------- 212.3/457.4 MB 643.1 kB/s eta 0:06:22
     ---------------- -----------------

## Load the model

In [21]:
def load_model():
    nlp = spacy.load('en_core_web_trf')
    return nlp

In [22]:
nlp_model = load_model()

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [5]:
def load_subtitles_dataset(dataset_path):
    checker = dataset_path.split('.')[-1]
    if checker == 'ass':
        subtitles_paths = [dataset_path]
    else:
        subtitles_paths = glob(dataset_path + '/*.ass')
    scripts = []
    episode_numbers = []
    for path in subtitles_paths:
        # Read lines
        with open(path, 'r', encoding='utf-8') as file:
            lines = file.readlines()
            lines = lines[27:]
            lines = [','.join(line.split(',')[9:]) for line in lines]
            lines = [line.replace('\\N', ' ') for line in lines]
        
        script = " ".join(lines)    
        scripts.append(script)
        
        episode_number = int(path.split('-')[-1].split('.')[0].strip())
        episode_numbers.append(episode_number) 
    
    df = pd.DataFrame.from_dict({'episode':episode_numbers, 'script':scripts})
    
    return df

## Load the dataset

In [11]:
dataset_path = '../data/subtitles'
df = load_subtitles_dataset(dataset_path)
df.head()

Unnamed: 0,episode,script
0,1,"A long time ago, a powerful demon fox appeared..."
1,2,"C'mon!\n Running like a fugitive,\n Being chas..."
2,3,"C'mon!\n Running like a fugitive,\n Being chas..."
3,4,"C'mon!\n Running like a fugitive,\n Being chas..."
4,5,"C'mon!\n Running like a fugitive,\n Being chas..."


In [12]:
sample_script = df.iloc[0]['script']
sample_script

'A long time ago, a powerful demon fox appeared with nine tails.\n With its powerful tails,\n it could smash mountains and create tidal waves.\n A band of Ninjas rose to defend their village from attack.\n We have to wait until the Fourth Hokage gets here!\n We can\'t let it get any closer to our village!\n One great Ninja was able to imprison the monster,\n but died in the process.\n This Ninja was known as… the Fourth Hokage.\n Naruto!\n Why did you do such a thing?!\n You\'re really gonna get it this time!\n I don\'t care!\n You know your problem?\n You can\'t do the things I do!\n Only I can do this!\n I\'m better than all of you! Believe it!\n There\'s a problem, sir!\n Lord Hokage!\n What is it?\n Did that Naruto do something again?\n Yes. He climbed onto the Mountainside Images…\n And he vandalized and graffitied all over them!\n Wait!\n Ha ha…\n Why should I?\n Hey, Naruto!\n How did you suddenly get here, lruka Sensei?\n The question is what are you doing here when you should 

In [13]:
sentences = sent_tokenize(sample_script)

In [27]:
sentences= sentences[60:90]

In [28]:
sentence = ".".join(sentences)
sentence

"Don't you know who the Hokage leaders are?.Of course, I do!.I know they earned the title Lord Hokage\n because they were the best Ninja of their time, right?.Especially the Fourth Hokage was a hero\n who saved the village from the nine-tail demon fox..Then why did you do that?.Because I'll become a Hokage myself..And I'll be the greatest Hokage of all time!.So that everyone will finally learn to accept me!.By the way, Sensei, I have a favor to ask..You want another bowl?.Mmmm…No…\n Can I borrow that Leaf headband for a while?.This?.No no!.This is worn only by those who have graduated from Ninja Academy..Tomorrow, you will…\n You're so mean!.So that's why you took off your goggles…\n Humph... One more bowl please!.We are now about to begin the graduation test..When your name is called, proceed to the next classroom..The test is on the Clone Jutsu..Oh no…\n Of all the…!.That is my weakest Jutsu!.But still… I will do it no matter what!.Clone Jutsu!.Disqualified!.Iruka Sensei..His physica

# Run Model

In [29]:
doc = nlp_model(sentence)

In [30]:
doc.ents

(Ninja, Fourth, nine, Leaf, Ninja Academy, Tomorrow, One, three, Naruto, one)

In [32]:
for entity in doc.ents:
    print(entity.text, entity.label_)

Ninja NORP
Fourth ORDINAL
nine CARDINAL
Leaf PERSON
Ninja Academy ORG
Tomorrow DATE
One CARDINAL
three CARDINAL
Naruto PERSON
one CARDINAL


In [37]:
def get_ners_inference(script):
    script_sentences = sent_tokenize(script)
    ner_output = []
    for sentence in script_sentences:
        doc = nlp_model(sentence)
        ners = set()
        for entity in doc.ents:
            if entity.label_ == 'PERSON':
                full_name = entity.text
                first_name = entity.text.split(' ')[0]
                first_name = first_name.strip()
                ners.add(first_name)
        ner_output.append(ners)
    return ner_output

In [38]:
df = df.head(10)

In [39]:
df

Unnamed: 0,episode,script
0,1,"A long time ago, a powerful demon fox appeared..."
1,2,"C'mon!\n Running like a fugitive,\n Being chas..."
2,3,"C'mon!\n Running like a fugitive,\n Being chas..."
3,4,"C'mon!\n Running like a fugitive,\n Being chas..."
4,5,"C'mon!\n Running like a fugitive,\n Being chas..."
5,6,"C'mon!\n Running like a fugitive,\n Being chas..."
6,7,"C'mon!\n Running like a fugitive,\n Being chas..."
7,8,"C'mon!\n Running like a fugitive,\n Being chas..."
8,9,"C'mon!\n Running like a fugitive,\n Being chas..."
9,12,"C'mon!\n Running like a fugitive,\n Being chas..."


In [40]:
df['ners'] = df['script'].apply(get_ners_inference)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ners'] = df['script'].apply(get_ners_inference)


In [41]:
df

Unnamed: 0,episode,script,ners
0,1,"A long time ago, a powerful demon fox appeared...","[{}, {}, {}, {}, {}, {}, {}, {Naruto}, {}, {},..."
1,2,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {Konohama..."
2,3,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {Sakura, Sasuke}, {}, {Konohamaru..."
3,4,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {Naruto}, {}, {}, {Iruka}, {}, {N..."
4,5,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {..."
5,6,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {Sasuke}, {Sakura}, {Naruto}, {}, {Na..."
6,7,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {..."
7,8,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {}, {}, {}, {}, {Sasuke}, {}, {},..."
8,9,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {..."
9,12,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {}, {Zabuza}, {}, {}, {}, {Naruto..."


# Character network

In [44]:
def generate_character_network(df):
    
    window_size = 10
    entity_relationship = []
    
    for row in df['ners']:
        
        previous_entities_in_window = []
        for sentence in row:
            previous_entities_in_window.append(list(sentence))
            previous_entities_in_window = previous_entities_in_window[-window_size:]
            
            # flatten the 2D list into 1D list
            previous_entities_flattened = sum(previous_entities_in_window, [])
            
            for entity in sentence:
                for entity_in_window in previous_entities_flattened:
                    if entity != entity_in_window:
                        entity_relationship.append(sorted([entity, entity_in_window]))
    
    relationship_df = pd.DataFrame({'value':entity_relationship})
    relationship_df['source'] = relationship_df['value'].apply(lambda x: x[0])
    relationship_df['target'] = relationship_df['value'].apply(lambda x: x[1])
    relationship_df = relationship_df.groupby(['source', 'target']).count().reset_index()
    relationship_df = relationship_df.sort_values('value', ascending=False)
    
    return relationship_df

In [45]:
relationship_df = generate_character_network(df)

In [46]:
relationship_df

Unnamed: 0,source,target,value
125,Naruto,Sasuke,117
152,Sakura,Sasuke,65
67,Iruka,Naruto,43
124,Naruto,Sakura,41
118,Mizuki,Naruto,28
...,...,...,...
98,Kakashi,Sharingan,1
91,Jonin,Zabuza,1
87,Jonin,Manji,1
75,Jerk,Sakura,1


In [47]:
relationship_df = relationship_df.sort_values('value', ascending=False)

In [48]:
relationship_df = relationship_df.head(200)


In [49]:
G = nx.from_pandas_edgelist(
    df=relationship_df,
    source='source',
    target='target',
    edge_attr='value',
    create_using=nx.Graph()
)

In [51]:
net = Network(notebook=True, height='700px', width='1000px', bgcolor='#222222', font_color='white', cdn_resources='remote')
node_degree = dict(G.degree)
nx.set_node_attributes(G, node_degree, 'size')
net.from_nx(G)
net.show('Naruto.html')

Naruto.html
