# Hands-on NER use-case
This notebook is a hands-on example of NER setup and training, constructed for the PyData DC meetup.

In [None]:
# setup
import sys
import subprocess
import pkg_resources
from collections import Counter
import re
import pickle

# third-party packages this notebook cannot run without
required = {'spacy', 'transformers'}
# names of every distribution visible to the current interpreter
installed = set(map(lambda dist: dist.key, pkg_resources.working_set))
missing = required.difference(installed)

if len(missing) > 0:
    # install with the interpreter running this notebook so the packages
    # land in the same environment
    python = sys.executable
    install_cmd = [python, '-m', 'pip', 'install']
    install_cmd.extend(missing)
    # pip's stdout is discarded; a failing install raises CalledProcessError
    subprocess.check_call(install_cmd, stdout=subprocess.DEVNULL)

import json
import numpy as np
import pandas as pd
import spacy
from spacy.matcher import Matcher, PhraseMatcher

from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.svm import LinearSVC
from sklearn.cluster import KMeans
import transformers

## Named-Entity Recognition
This application uses the [IMDB movie spoilers dataset](https://www.kaggle.com/rmisra/imdb-spoiler-dataset) as well as data sourced from Wikipedia.  We'll generate a training set using spaCy's PhraseMatcher and then train a NER model from scratch on it.  It should give you a general idea of one possible implementation and a codebase on which you can build.

In [None]:
# how to use spaCy's PhraseMatcher
# adapted from https://spacy.io/usage/rule-based-matching#phrasematcher
from spacy.lang.en import English

# load a language model
en = English()
matcher = PhraseMatcher(en.vocab)
inventories = {
    'people': ["Barack Obama", "Angela Merkel"],
    'locations': ["Washington, D.C.", "Oval Office"],
}
for k, phrases in inventories.items():
    # each phrase is run through the pipeline so it can be registered as a pattern
    patterns = [en(text) for text in phrases]
    # arguments: label for the match, callback (none needed), then the patterns
    matcher.add(k, None, *patterns)

doc = en("German Chancellor Angela Merkel and US President Barack Obama "
         "converse in the Oval Office inside the White House in Washington, D.C.")
matches = matcher(doc)
for match_id, start, end in matches:
    # start/end are token indices into doc
    span = doc[start:end]
    # note: match_id is stored as an index within the vocab
    print(match_id, en.vocab[match_id].text, span.text)

Using PhraseMatcher to identify entities from the inventory in the spoilers dataset

In [None]:
# read in reviews descriptions from spoilers dataset
reviews = pd.read_pickle('./data/spoilers_reviews.pkl.gz')
# an inventory of 288 academy award nominated actors
# (context managers make sure the pickle files are closed again after reading)
with open('./data/actors_inventory.pkl', 'rb') as f:
    actor_inventory = pickle.load(f)
# a set of US cities
with open('./data/city_inventory.pkl', 'rb') as f:
    city_inventory = pickle.load(f)
# splitting off State: keep only the part before the first comma
city_inventory = [x.split(',')[0] for x in city_inventory]

In [None]:
# adapted from https://spacy.io/usage/rule-based-matching#phrasematcher
from spacy.lang.en import English

en = English()
matcher = PhraseMatcher(en.vocab)
# one matcher label per inventory
inventories = dict(actors=actor_inventory, cities=city_inventory)
for k, names in inventories.items():
    name_docs = [en(text) for text in names]
    # label for the match, no callback, then every inventory entry as a pattern
    matcher.add(k, None, *name_docs)

# sanity check on a sentence built from the first entry of each inventory
doc = en(f"{actor_inventory[0]} is in {city_inventory[0]}")
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    # note: match_id is stored as an index within the vocab
    label = en.vocab[match_id].text
    print(match_id, label, span.text)

In [None]:
# parse all reviews
# en.pipe processes the review texts as a stream; list() collects the resulting docs
parsed_summaries = list(en.pipe(reviews.review_text))

In [None]:
# get all matches
# one list of (match_id, start_token, end_token) tuples per parsed review
matches = list(map(matcher, parsed_summaries))

In [None]:
# some descriptives about matching
# number of reviews with at least one matched entity
matched_sum = np.sum([bool(m) for m in matches])
# the label says "%", so report a percentage rather than a fraction
print('% reviews with matched entities:', 100 * matched_sum/len(matches))
# total number of matches per entity type
inventory_matched = {k: 0 for k in inventories}
# a single pass over all matches: looping over the inventories as well would
# count every match once per inventory and inflate the totals
for m in matches:
    for mm in m:
        # mm[0] is the match id; the vocab maps it back to the label string
        inventory_matched[en.vocab[mm[0]].text] += 1
inventory_matched

In [None]:
# character offset at which the final match of the last review starts
# (None when the last review has no matches); the match is taken from the
# last review's own match list so token indices and document always agree
_last_doc, _last_matches = parsed_summaries[-1], matches[-1]
_last_doc[_last_matches[-1][1]:_last_matches[-1][2]].start_char if _last_matches else None

SpaCy's NER training expects observations to be in a certain format: 

`(text, annotations)`

Where annotations is a dictionary.  This dictionary can contain things like part-of-speech, but we'll just be including entities.  Each entity will be in this format:

`(entity start character, entity end character, entity type)`

So each observation will look something like this:

```
(text, 
    {'entities': [
        (start, end, type),
        (start, end, type),
        ...]
        }
    )
```

Additionally, we'll need to deal with entities that have overlap.  We can't feed the model text that has two different entities with overlapping tokens.  So in this case, we just ignore any entities that start within the boundaries of the previous entity.

In [None]:
def format_data(text, matches, excluded=None):
    """Turn parsed docs and their matcher hits into spaCy NER training tuples.

    Args:
        text: sequence of parsed docs; each must support token slicing
            (``doc[start:end]``) and expose ``.text``.
        matches: for every doc (same order as ``text``), the list of
            ``(match_id, start_token, end_token)`` tuples found in it.
        excluded: optional collection of entity surface strings that must not
            be annotated. Defaults to excluding nothing.

    Returns:
        A list with one ``(doc_text, {'entities': [(start_char, end_char,
        label), ...]})`` tuple per doc. Entities never overlap: a match that
        starts at or before the end of the previously kept entity is dropped.
    """
    excluded = frozenset(excluded) if excluded else frozenset()
    training = []
    for i, t in enumerate(text):
        entities = []
        # end character of the last entity kept so far (-1: none yet)
        prev_end = -1
        # The overlap rule below only works when matches are visited left to
        # right, so order them by start token; on ties the longest span wins.
        ordered = sorted(matches[i], key=lambda m: (m[1], -m[2]))
        for match_id, start_tok, end_tok in ordered:
            # matches have token idx, need character idx
            span = t[start_tok:end_tok]
            if span.text in excluded:
                continue
            # if start idx is <= end idx of the previous entity, ignore that match
            if span.start_char <= prev_end:
                continue
            prev_end = span.end_char
            # the vocab maps the match id back to the label string
            ent_type = en.vocab[match_id].text
            entities.append((span.start_char, prev_end, ent_type))
        training.append((t.text, {'entities': entities}))
    return training

In [None]:
# build one (text, {'entities': [...]}) training tuple per parsed review
formatted = format_data(parsed_summaries, matches)

In [None]:
# peek at the last formatted observation
formatted[-1]

In the slides, we'll discuss a couple of considerations for creating a training and a test dataset.  In this case, for simplicity, we'll just split 70/30.

In [None]:
# share of observations that should (on average) land in the training set
train_pct = 0.7
# one uniform draw per observation; draws at or below train_pct select training rows
train_idxs = np.random.random(len(formatted)) <= train_pct
# convert once, then index with the boolean mask and its complement
formatted_arr = np.array(formatted)
train_data, test_data = formatted_arr[train_idxs], formatted_arr[~train_idxs]

In [None]:
def train_model(train_data, model_name, labels, early_stop=5, epoch=30, nlp_model=None, last_loss=np.inf):
    """Train (or keep training) a spaCy NER model, checkpointing on improvement.

    Args:
        train_data: sequence of ``(text, annotations)`` pairs. It is not
            modified; shuffling happens on a private copy.
        model_name: path prefix; whenever the epoch loss improves the model is
            pickled to ``model_name + '.pkl'``.
        labels: entity labels to register. Only used when a new model is
            created (``nlp_model`` is None).
        early_stop: number of non-improving epochs that are tolerated; the
            next non-improving epoch after that stops training. The count is
            cumulative and is not reset by an improving epoch.
        epoch: maximum number of passes over the training data.
        nlp_model: existing pipeline to continue training. When None, a blank
            English pipeline with a fresh 'ner' component is created.
        last_loss: loss an epoch has to beat before the first checkpoint is
            written.

    Returns:
        The pipeline as it is after the last epoch that ran. This may differ
        from the checkpoint on disk, which holds the best epoch seen.
    """
    if nlp_model is None:
        # initialize model (an already trained model can be passed in to be updated)
        nlp_model = English()
        # ner pipeline component
        ner = nlp_model.create_pipe('ner')
        nlp_model.add_pipe(ner)
        # entity types
        for label in labels:
            ner.add_label(label)
        optimizer = nlp_model.begin_training()
    else:
        optimizer = nlp_model.resume_training()
    # shuffle a private copy so the caller's data keeps its order
    examples = list(train_data)
    # from tutorial, creates increasing batch size
    sizes = spacy.util.compounding(1.0, 4.0, 1.001)
    # honour the early_stop argument instead of a hard-coded value
    patience_left = early_stop
    for _ in range(epoch):
        # random shuffle
        np.random.shuffle(examples)
        # batch up the examples using spaCy's minibatch
        batches = spacy.util.minibatch(examples, size=sizes)
        losses = {}
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp_model.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
        print("Losses", losses)
        if last_loss > losses['ner']:
            # best epoch so far: remember its loss and checkpoint the model
            last_loss = losses['ner']
            print('Saving model')
            with open(model_name+'.pkl', 'wb') as f:
                pickle.dump(nlp_model, f)
        elif patience_left == 0:
            print('Stopping iteration')
            break
        else:
            patience_left -= 1
    return nlp_model

In [None]:
# quick run on the first 100 training observations for 2 epochs;
# train_model pickles the model to review_model.pkl whenever the loss improves
train_model(train_data[:100], 'review_model', inventories.keys(), epoch=2)

In [None]:
# load the pickled model; the context manager closes the file afterwards
# NOTE: only unpickle files you created yourself - pickle can execute arbitrary code
with open('review_model.pkl', 'rb') as f:
    full_model = pickle.load(f)
# blank English pipeline, kept separate from the trained model
nlp = English()

In [None]:
from spacy.gold import GoldParse
from spacy.scorer import Scorer

In [None]:
# evaluate the trained model on (at most) the first 100 held-out observations
scorer = Scorer()
for raw_text, annot in test_data[:100]:
    # model prediction for the raw text
    predicted = full_model(raw_text)
    # gold standard: the same text run through the blank pipeline plus the annotated entities
    reference = GoldParse(nlp(raw_text), entities=annot.get("entities"))
    scorer.score(predicted, reference)

In [None]:
# evaluation results accumulated by the scorer over the scored observations
scorer.scores

In [None]:
# print the entities the trained model finds in each review
for p in parsed_summaries:
    # run the model once per review and reuse the result
    ents = full_model(p.text).ents
    if ents:
        print(ents)

Note a problem here.  It seems like the only actors being identified are male.  That's because I played a bit of a trick: the inventory I used contains only MALE award nominees.  You'd think that's an oversight no serious designer of DS products would make.  [You'd think](https://www.wired.com/story/photo-algorithms-id-white-men-fineblack-women-not-so-much/).

This is just a very simple example to show that models learn what you give them.  If I don't provide female nominees, then it won't learn to identify them.