Named Entity Recognition

In [45]:
import spacy
from nltk import sent_tokenize
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from pyvis.network import Network

In [2]:
!python -m spacy download en_core_web_trf

Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
     ---------------------------------------- 0.0/457.4 MB ? eta -:--:--
     --------------------------------------- 3.1/457.4 MB 16.8 MB/s eta 0:00:28
     --------------------------------------- 5.5/457.4 MB 12.9 MB/s eta 0:00:35
      -------------------------------------- 9.2/457.4 MB 14.6 MB/s eta 0:00:31
     - ------------------------------------ 12.3/457.4 MB 14.8 MB/s eta 0:00:31
     - ------------------------------------ 16.3/457.4 MB 15.3 MB/s eta 0:00:29
     - ------------------------------------ 19.7/457.4 MB 15.5 MB/s eta 0:00:29
     -- ----------------------------------- 24.1/457.4 MB 16.2 MB/s eta 0:00:27
     -- ----------------------------------- 28.3/457.4 MB 16.6 MB/s eta 0:00:26
     -- ----------------------------------- 32.2/457.4 MB 16.9 MB/s eta 0:00:26
     --- ------------------

In [3]:
#Load model

nlp_model = spacy.load('en_core_web_trf')

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
#Load dataset
import sys
sys.path.append('../')
from utils import load_subtitles_dataset

In [23]:
dataset_path = '../data/naruto_subtitles'

df = load_subtitles_dataset(dataset_path)

In [24]:
df_example = df.iloc[0]['script']
df_example

'A long time ago, a powerful demon fox appeared with nine tails.\n With its powerful tails,\n it could smash mountains and create tidal waves.\n A band of Ninjas rose to defend their village from attack.\n We have to wait until the Fourth Hokage gets here!\n We can\'t let it get any closer to our village!\n One great Ninja was able to imprison the monster,\n but died in the process.\n This Ninja was known as… the Fourth Hokage.\n Naruto!\n Why did you do such a thing?!\n You\'re really gonna get it this time!\n I don\'t care!\n You know your problem?\n You can\'t do the things I do!\n Only I can do this!\n I\'m better than all of you! Believe it!\n There\'s a problem, sir!\n Lord Hokage!\n What is it?\n Did that Naruto do something again?\n Yes. He climbed onto the Mountainside Images…\n And he vandalized and graffitied all over them!\n Wait!\n Ha ha…\n Why should I?\n Hey, Naruto!\n How did you suddenly get here, lruka Sensei?\n The question is what are you doing here when you should 

In [26]:
sentences = sent_tokenize(df_example)
sentences[60:90]

["Don't you know who the Hokage leaders are?",
 'Of course, I do!',
 'I know they earned the title Lord Hokage\n because they were the best Ninja of their time, right?',
 'Especially the Fourth Hokage was a hero\n who saved the village from the nine-tail demon fox.',
 'Then why did you do that?',
 "Because I'll become a Hokage myself.",
 "And I'll be the greatest Hokage of all time!",
 'So that everyone will finally learn to accept me!',
 'By the way, Sensei, I have a favor to ask.',
 'You want another bowl?',
 'Mmmm…No…\n Can I borrow that Leaf headband for a while?',
 'This?',
 'No no!',
 'This is worn only by those who have graduated from Ninja Academy.',
 "Tomorrow, you will…\n You're so mean!",
 "So that's why you took off your goggles…\n Humph... One more bowl please!",
 'We are now about to begin the graduation test.',
 'When your name is called, proceed to the next classroom.',
 'The test is on the Clone Jutsu.',
 'Oh no…\n Of all the…!',
 'That is my weakest Jutsu!',
 'But sti

In [27]:
sentence = ".".join(sentences)
sentence

'A long time ago, a powerful demon fox appeared with nine tails..With its powerful tails,\n it could smash mountains and create tidal waves..A band of Ninjas rose to defend their village from attack..We have to wait until the Fourth Hokage gets here!.We can\'t let it get any closer to our village!.One great Ninja was able to imprison the monster,\n but died in the process..This Ninja was known as… the Fourth Hokage..Naruto!.Why did you do such a thing?!.You\'re really gonna get it this time!.I don\'t care!.You know your problem?.You can\'t do the things I do!.Only I can do this!.I\'m better than all of you!.Believe it!.There\'s a problem, sir!.Lord Hokage!.What is it?.Did that Naruto do something again?.Yes..He climbed onto the Mountainside Images…\n And he vandalized and graffitied all over them!.Wait!.Ha ha…\n Why should I?.Hey, Naruto!.How did you suddenly get here, lruka Sensei?.The question is what are you doing here when you should be in class now?.Now listen, Naruto..You failed 

In [28]:
#Run model
doc = nlp_model(sentence)

In [29]:
doc.ents

(nine,
 Ninjas,
 Ninja,
 Ninja,
 Fourth,
 Naruto,
 lruka Sensei?.The,
 Naruto,
 the Transformation Jutsu!.Even,
 Haruno,
 Sasuke Uchiha,
 Naruto Uzumaki,
 Naruto,
 Naruto,
 tonight,
 Naruto Uzumaki!.Naruto,
 Hokage,
 Ninja,
 Fourth,
 nine,
 Leaf,
 Ninja Academy,
 Tomorrow,
 Humph,
 One,
 three,
 Naruto,
 Ninja,
 Iruka,
 Iruka Sensei,
 Naruto,
 Iruka Sensei!.What,
 Hokage,
 Naruto,
 the Scroll of Sealing,
 The Scroll of Sealing?!.Let,
 first,
 First,
 Naruto,
 Naruto,
 the Scroll of Sealing,
 one,
 Jutsu,
 Sensei,
 Mizuki,
 Naruto,
 Mizuki,
 Iruka,
 12 years ago,
 Naruto,
 Iruka,
 Iruka,
 Naruto,
 Naruto,
 the Scroll of Sealing,
 Naruto,
 Naruto,
 Mizuki,
 The Scroll of Sealing,
 Nine-Tailed,
 Mizuki,
 quick!.Mizuki,
 Iruka,
 Iruka,
 Naruto,
 that Fox Spirit,
 Iruka Sensei,
 Naruto,
 Naruto Uzumaki,
 the Village Hidden in the Leaves!.You,
 Iruka Sensei,
 thousand,
 Demon Fox!.Shadow,
 one,
 lruka Sensei,
 Hokage,
 Naruto,
 Naruto,
 Sensei,
 Ninja,
 Fifth,
 3rd,
 My Name Is Konohamaru!")

In [30]:
for entity in doc.ents:
    print(entity.text,entity.label_)

nine CARDINAL
Ninjas NORP
Ninja NORP
Ninja PERSON
Fourth ORDINAL
Naruto PERSON
lruka Sensei?.The PERSON
Naruto PERSON
the Transformation Jutsu!.Even LAW
Haruno PERSON
Sasuke Uchiha PERSON
Naruto Uzumaki PERSON
Naruto PERSON
Naruto PERSON
tonight TIME
Naruto Uzumaki!.Naruto PERSON
Hokage PERSON
Ninja NORP
Fourth ORDINAL
nine CARDINAL
Leaf PERSON
Ninja Academy ORG
Tomorrow DATE
Humph PERSON
One CARDINAL
three CARDINAL
Naruto PERSON
Ninja NORP
Iruka PERSON
Iruka Sensei PERSON
Naruto PERSON
Iruka Sensei!.What PERSON
Hokage PERSON
Naruto PERSON
the Scroll of Sealing WORK_OF_ART
The Scroll of Sealing?!.Let WORK_OF_ART
first ORDINAL
First ORDINAL
Naruto PERSON
Naruto PERSON
the Scroll of Sealing WORK_OF_ART
one CARDINAL
Jutsu WORK_OF_ART
Sensei PERSON
Mizuki PERSON
Naruto PERSON
Mizuki PERSON
Iruka PERSON
12 years ago DATE
Naruto PERSON
Iruka PERSON
Iruka PERSON
Naruto PERSON
Naruto PERSON
the Scroll of Sealing WORK_OF_ART
Naruto PERSON
Naruto PERSON
Mizuki PERSON
The Scroll of Sealing WORK_O

In [37]:
def get_ners_inference(script):
    sentences = sent_tokenize(script)
    ner_op = []
    for line in sentences:
        doc = nlp_model(line)
        ners = set()
        for entity in doc.ents:
            if entity.label_ == 'PERSON':
                fullname = entity.text
                firstname = fullname.split(" ")[0].strip()
                ners.add(firstname)
        ner_op.append(ners)
    return ner_op


In [38]:
df_example2 = df.head(10)
df_example2

Unnamed: 0,episode,script
0,1,"A long time ago, a powerful demon fox appeared..."
1,2,"C'mon!\n Running like a fugitive,\n Being chas..."
2,3,"C'mon!\n Running like a fugitive,\n Being chas..."
3,4,"C'mon!\n Running like a fugitive,\n Being chas..."
4,5,"C'mon!\n Running like a fugitive,\n Being chas..."
5,6,"C'mon!\n Running like a fugitive,\n Being chas..."
6,7,"C'mon!\n Running like a fugitive,\n Being chas..."
7,8,"C'mon!\n Running like a fugitive,\n Being chas..."
8,9,"C'mon!\n Running like a fugitive,\n Being chas..."
9,12,"C'mon!\n Running like a fugitive,\n Being chas..."


In [39]:
df_example2['ners'] = df_example2['script'].apply(get_ners_inference)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_example2['ners'] = df_example2['script'].apply(get_ners_inference)


In [48]:
df_example2

Unnamed: 0,episode,script,ners
0,1,"A long time ago, a powerful demon fox appeared...","[{}, {}, {}, {}, {}, {}, {}, {Naruto}, {}, {},..."
1,2,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {Konohama..."
2,3,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {Sakura, Sasuke}, {}, {Konohamaru..."
3,4,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {Naruto}, {}, {}, {Iruka}, {}, {N..."
4,5,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {..."
5,6,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {Sasuke}, {}, {Naruto}, {}, {Naruto},..."
6,7,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {..."
7,8,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {}, {}, {}, {}, {Sasuke}, {}, {},..."
8,9,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {..."
9,12,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {}, {Zabuza}, {}, {}, {}, {Naruto..."


In [None]:
#character network
def generate_character_network(df):
    window_size = 10
    ER_list = []
    for row in df['ners']:
        entities_in_window = []
        for sentence in row:
            entities_in_window.append(list(sentence))
            entities_in_window = entities_in_window[-window_size:]    #take those in window
            #flatten list 2D -> 1D
            entities_in_window_flat = sum(entities_in_window,[])

            for entity in sentence:
                for window_entity in entities_in_window_flat:
                    if entity != window_entity:
                        ER_list.append(sorted([entity,window_entity]))    #sort as relation is not directed (same both ways)

    relation_df = pd.DataFrame({'value':ER_list})
    relation_df['source'] = relation_df['value'].apply(lambda x: x[0])
    relation_df['target'] = relation_df['value'].apply(lambda x: x[1])
    relation_df = relation_df.groupby(['source','target']).count().reset_index()
    relation_df = relation_df.sort_values('value',ascending=False)

    return relation_df


In [60]:
relation_df = generate_character_network(df_example2)
relation_df.head(50) #top 100 

Unnamed: 0,source,target,value
163,Naruto,Sasuke,122
202,Sakura,Sasuke,69
89,Iruka,Naruto,45
162,Naruto,Sakura,40
154,Mizuki,Naruto,29
123,Kakashi,Sasuke,26
120,Kakashi,Naruto,24
61,Hokage,Naruto,24
104,Jin,Tori,14
108,Jin,saru,13


In [61]:
#transform to network using networkx library

G = nx.from_pandas_edgelist(relation_df,source='source',target='target',edge_attr='value',create_using=nx.Graph())

# change it to pyvis network as it is more visually appealing. Also, networkx graphs may be messy with a lot of entities (200 here) and are laso not interactive.

net = Network(notebook=True,width='1000px',height='700px',bgcolor='#222222',font_color='white',cdn_resources='remote')
node_degree = dict(G.degree)    #how big the node will be

nx.set_node_attributes(G,node_degree,'size')
net.from_nx(G)
net.show('naruto.html')


naruto.html
