In [1]:
import re

import spacy
import nltk
import pandas as pd

import nltk.tag.stanford as st

from nltk.tree import Tree
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk.tokenize import word_tokenize
from spacy import displacy

nlp = spacy.load('en')

# NER function Based on nltk

In [2]:
def nltk_NER(sentence):
    Tree_NER=ne_chunk(pos_tag(word_tokenize(sentence)))
    ls_all_NER=[]
    for i in Tree_NER:
        if type(i) == Tree:
            label_t=i.label()
            nodes=i.leaves()
            text_label=""
            for index,item in enumerate(nodes):
                if index<len(nodes)-1:
                    text_label+=item[0]+" "
                else:
                    text_label+=item[0]
            if label_t=="GPE":
                ls_all_NER.append((text_label, "LOCATION"))
            elif label_t=="PERSON":
                ls_all_NER.append((text_label, "PERSON"))
    return ls_all_NER

# NER Function Based on Spacy

In [3]:
def Spacy_NER(sentence):
    ls_all_NER=[]
    doc = nlp(sentence)
    for X in doc.ents:
        if X.label_=="GPE":
            ls_all_NER.append((X.text, "LOCATION")) 
        elif X.label_=="PERSON":
            ls_all_NER.append((X.text, X.label_))
    return ls_all_NER  

# Load Text of Book

In [4]:
fname="./Data/War_and_Peace_Book_One.txt"
with open(fname) as f:
    content = f.readlines()

# Preprocess of Text

In [5]:
# Remove head and tail
if "About this digital edition\n" in content:
    content=content[:content.index("About this digital edition\n")]
if "CHAPTER I\n" in content:
    content=content[content.index("CHAPTER I\n"):]   
content=list(filter(lambda a: a != '\n', content))
text = " ".join(content)
text, _ = re.subn('\s+', ' ', text)

# Apply two NER Function on Data

In [6]:
%%time
location_person=nltk_NER(text)

CPU times: user 23.8 s, sys: 186 ms, total: 24 s
Wall time: 22.8 s


In [7]:
%%time
location_person_spacy=Spacy_NER(text)

CPU times: user 1min 40s, sys: 9.49 s, total: 1min 50s
Wall time: 13.7 s


# Intersection of Two Algorithms to Increase the Precision

In [8]:
nltk_lp_set=set(location_person)

In [9]:
Spacy_lp_set=set(location_person_spacy)

In [10]:
highpercision_lowerrecall_lp_set=nltk_lp_set.intersection(Spacy_lp_set)

# Create Data Frame of Result

In [11]:
df_lp=pd.DataFrame(list(highpercision_lowerrecall_lp_set))

In [12]:
df_lp.columns=["token","type"]

In [13]:
location_items=df_lp[df_lp["type"]=="LOCATION"]

In [14]:
Person_items=df_lp[df_lp["type"]=="PERSON"]

In [15]:
location_items.reset_index(drop=True,inplace=True)

In [16]:
Person_items.reset_index(drop=True,inplace=True)

# Save Result into Files

In [17]:
Person_items.to_csv("./Data/Persons.csv", index=None)

In [18]:
location_items.to_csv("./Data/Locations.csv", index=None)

# Print 10 Results for Person

In [19]:
Person_items.head(10)

Unnamed: 0,token,type
0,Boris,PERSON
1,Vasili,PERSON
2,Princess Helene,PERSON
3,Mortemart,PERSON
4,Prince Vasili,PERSON
5,Masha,PERSON
6,Olga,PERSON
7,Anna Mikhaylovna,PERSON
8,Ochakov,PERSON
9,Catherine,PERSON


# Print 10 Results for Location

In [20]:
location_items.head(10)

Unnamed: 0,token,type
0,Prince Hippolyte,LOCATION
1,Mademoiselle Bourienne,LOCATION
2,Natasha,LOCATION
3,Buonaparte,LOCATION
4,Venetian,LOCATION
5,England,LOCATION
6,London,LOCATION
7,America,LOCATION
8,Sweden,LOCATION
9,New York,LOCATION
