# Eyre

Analyze characters and locations in stories using Named-entity recognition (NER) algorithms. 

Compare precision, recall, F1 (and possibly other metrics) for different NER algorithms, based on the list of characters defined in Wikipedia.

## List of characters

(Note: the Hatter is never referred to as the Mad Hatter in the book)

In [None]:
# reference list of character names, one per line
s = """Alice
Bill the Lizard
Caterpillar
Cheshire Cat
Dodo
Dormouse
Duchess
Duck
Eaglet
Gryphon
Hatter
King of Hearts
Knave of Hearts
Lory
March Hare
Mock Turtle
Mouse
Pat
Puppy
Queen of Hearts
White Rabbit"""
# lower-case each name (the entity text extracted further down is lower-cased too)
characters = [name.lower() for name in s.split('\n')]
print(characters)

## spacy

In [None]:
import spacy
import pandas as pd

In [None]:
# load spaCy's small English pipeline by its full package name; the bare 'en'
# shortcut is not accepted by spaCy 3 and later
# NOTE(review): assumes the 'en' shortcut used previously pointed at
# en_core_web_sm — confirm which English model is installed
nlp = spacy.load('en_core_web_sm')

In [None]:
# path of the text to analyse, relative to the notebook's working directory
# (alternate input, currently disabled:)
#filename = '../texts/alice1.txt'
filename = '../texts/alice.txt'

In [None]:
# read the whole text into `s`, closing the file as soon as it has been read,
# then run the spacy pipeline over it
# NOTE(review): no encoding is given, so the platform default is used — confirm
# it matches the encoding of the text file
with open(filename) as f:
    s = f.read()
doc = nlp(s)

In [None]:
# empty table with one column per field recorded for an entity
# (the following cell builds `df` again from the collected rows)
df = pd.DataFrame(columns=['start', 'label', 'text'])

In [None]:
# entity types to keep; add 'LOC' and 'GPE' to this list to include places as well
labels = ['PERSON']

# one record per kept entity: its start position, its type and its
# lower-cased, whitespace-stripped text
rows = [
    {
        'start': ent.start,
        'label': ent.label_,
        'text': ent.text.lower().strip(),
    }
    for ent in doc.ents
    if ent.label_ in labels
]
df = pd.DataFrame(rows, columns=['start', 'label', 'text'])

In [None]:
# preview the first rows of the extracted entities
df.head()

In [None]:
# group identical (label, text) pairs and count them; after count() the 'start'
# column holds the number of rows in each group, so sorting on it puts the most
# frequently mentioned entities first
g = (
    df.groupby(['label', 'text'])
    .count()
    .sort_values(by='start', ascending=False)
)

In [None]:
# show the ten most frequent entities
# (g[g['start'] > 1] would instead list every entity mentioned more than once)
g.head(10)

## NLTK Named Entity Chunker

In [None]:
import nltk
import pandas as pd

In [None]:
# split the text into sentences
# NOTE: `s` is expected to be the text read from `filename` in the spacy section;
# the same name holds the character list at the top of the notebook, so cell
# run order matters
sentences = nltk.sent_tokenize(s)

In [None]:
# break every sentence into its word tokens (one token list per sentence)
tokenized_sentences = list(map(nltk.word_tokenize, sentences))

In [None]:
# part-of-speech tag every sentence; pos_tag_sents() is NLTK's batch entry point
# and sets the tagger up once instead of once per sentence as repeated
# pos_tag() calls do. The result is still one list of (token, tag) pairs per sentence.
tagged_sentences = nltk.pos_tag_sents(tokenized_sentences)

In [None]:
# run the named-entity chunker over all tagged sentences in one batch call;
# list() materialises the result because ne_chunk_sents() may return a lazy
# iterable and the chunked sentences are iterated again below
ne_chunked_sents = list(nltk.ne_chunk_sents(tagged_sentences))

In [None]:
# collect a (lower-cased name, entity type) pair for every labelled chunk
named_entities = []
for chunked_sentence in ne_chunked_sents:
    for node in chunked_sentence:
        # children without a `label` attribute are skipped
        # (presumably the plain tagged tokens outside any entity)
        if not hasattr(node, 'label'):
            continue
        words = [leaf[0] for leaf in node.leaves()]
        named_entities.append((' '.join(words).lower(), node.label()))

In [None]:
# tabulate the entities found by the NLTK chunker; the list is not de-duplicated,
# so repeated mentions stay in the table and are counted below
df = pd.DataFrame(named_entities, columns=['Entity Name', 'Entity Type'])

# number of mentions per entity name, most frequent first
df2 = (
    df.groupby('Entity Name')
    .size()
    .rename('count')
    .to_frame()
    .sort_values(by='count', ascending=False)
)

## NLTK / Stanford NER

In [None]:
from nltk.tag import StanfordNERTagger
import os

In [None]:
# location of the Java executable, exported through the JAVAHOME environment variable
# (presumably so the Stanford tagger wrapper can find Java — confirm)
# NOTE(review): the path is specific to one Windows machine
java_path = r'C:\Program Files\Java\jdk1.8.0_111\bin\java.exe'
os.environ.update(JAVAHOME=java_path)

In [None]:
# create the Stanford NER tagger from a local Stanford NER download: the first
# argument is the classifier model file, `path_to_jar` is the Stanford NER jar
# NOTE(review): both paths are hard-coded to one user's desktop — adjust per machine
sn = StanfordNERTagger('c:/users/bburns/desktop/stanford-ner-2016-10-31/classifiers/english.all.3class.distsim.crf.ser.gz',
                      path_to_jar='c:/users/bburns/desktop/stanford-ner-2016-10-31/stanford-ner.jar')


In [None]:
# Tag the text sentence by sentence. The tagger works on lists of tokens, so
# passing the raw string `s` would make it treat every character as a token.
# tag_sents() takes the already tokenized sentences and returns one list of
# (token, tag) pairs per sentence, which is the shape the following cell
# iterates over under the name `ne_annotated_sentences`.
ne_annotated_sentences = sn.tag_sents(tokenized_sentences)
o = ne_annotated_sentences  # `o` is kept as this cell's result name
# preview the first few tagged sentences
o[:5]

In [None]:
# Merge consecutive tokens that carry the same entity tag into a single
# (entity name, entity type) pair.
# - Tokens outside any entity are tagged with the letter 'O' (not the digit '0').
# - groupby() ends a run whenever the tag changes or the sentence ends, so an
#   entity closing a sentence is kept and neighbouring entities of different
#   types are not glued together.
from itertools import groupby

named_entities = []
for sentence in ne_annotated_sentences:
    for tag, tagged_run in groupby(sentence, key=lambda pair: pair[1]):
        if tag == 'O':
            continue
        entity_name = ' '.join(term for term, _ in tagged_run)
        named_entities.append((entity_name, tag))

In [None]:
# tabulate the Stanford NER results, one row per (name, type) pair in named_entities
df3 = pd.DataFrame(named_entities, columns=['Entity Name', 'Entity Type'])

In [None]:
# display the table of Stanford NER entities
df3