In [1]:
import os
from tqdm.notebook import tqdm
import spacy
import random
import pandas as pd
from sklearn.model_selection import train_test_split
from spacy.util import compounding, minibatch
from random import Random

For each excerpts we want data from three different files.  The .tags file has the named entity recognition tags that we are building the model to find.  The en.tok.off file has the positions of the tags given as the index of their first and last character.  The .raw file has the complete excerpts.

The files have the same name besides their extensions so we can crawl the directory for any of the three to make a complete list.  Let's do .raw.

In [2]:
dirName = "../gmb-2.2.0/data/"

listOfFiles = list()
for (dirpath, dirnames, filenames) in os.walk(dirName):
    for file in filenames:
        if ".raw" in file:
            listOfFiles += [os.path.join(dirpath, file)]


We need to reformat the NER tags.  The Groningen Meaning Bank labels each token individually in a multi-token named entity.  They need to be combined for SpaCy.  Additionally the "O" labels, which indicate a word isn't in any of the named entity categories, can be discarded.

In [3]:
STOP_WORD = "O"

def get_consecutives(labels, offsets):
    """This generator takes the token labels and token offsets as two lists.
    Consecutive identical labels are combined to match the SpaCy formatting.
    The indexes of the found labels, as well as the labels themselves are yielded."""
    
    start=end=0
    previous=STOP_WORD
    while labels:
        head = labels.pop(0)
        offset = offsets.pop(0).split()
        
        if head!=previous and previous!=STOP_WORD:
            yield((int(start), int(end), previous))
            
        if head!=STOP_WORD:
            end=offset[1]
                
        if previous!=head!=STOP_WORD:
            start=offset[0]
        previous=head
        
    if head!=STOP_WORD:
        yield((int(start), int(end), head))
    

"""Now we iterate through the previously compiled list of files, 
opening the tags and offsets and feeding them into the generator."""
DATA = list()

#tqdm just gives us a progress bar on the for loop
for raw_file in tqdm(listOfFiles):
    with open(raw_file) as raw_text_file:
        text = raw_text_file.read().replace('\n', ' ')
        
    tag_file = "../"+raw_file.strip(".raw")+".tags"
    with open(tag_file) as tag_data:
        tags_by_line = tag_data.readlines()
        # The following checks the length so any empty lines in the file get skipped
        ner_labels = [ line.split("\t")[3] for line in tags_by_line if len(line.split("\t"))>2 ]
        
        
    offsets_file = "../"+raw_file.strip(".raw") + ".tok.off"
    with open(offsets_file) as offsets_file:
        offsets = offsets_file.readlines()

    DATA.append((text, list(get_consecutives(ner_labels, offsets))))
    
NER_LABELS = set(ner[2] for line in DATA  for ner in line[1])

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




The data is loaded, let's take a look at a few lines as a sanity check and see how many of each named entity label there are.

In [4]:
def DF_generator(data):
    for i, line in enumerate(data):
        for ner in line[1]:
            yield ner[2], line[0][ner[0]:ner[1]], i, line[0]

df = pd.DataFrame(DF_generator(DATA), columns=['label', 'ner', 'paragraph_index', 'paragraph'])

display(df.head())
display(df.sample(10).sort_index())
display(df.label.value_counts())

Unnamed: 0,label,ner,paragraph_index,paragraph
0,geo-nam,London,0,Medical experts from London have published a p...
1,geo-nam,United States,1,The United States is calling on Bangladesh to ...
2,geo-nam,Bangladesh,1,The United States is calling on Bangladesh to ...
3,org-nam,Muslim Rohingya,1,The United States is calling on Bangladesh to ...
4,geo-nam,Burma,1,The United States is calling on Bangladesh to ...


Unnamed: 0,label,ner,paragraph_index,paragraph
33052,gpe-nam,Palestinian,2049,Two top Palestinian militant leaders have been...
34070,geo-nam,Pakistan,2114,The United States' only mobile field hospital ...
54368,per-fam,Abdullah,3368,Saudi Arabia has taken a firm stance with Syri...
73823,tim-dow,Tuesday,4561,Belarus's Foreign Ministry has dismissed the l...
77455,per-nam,Kibaki,4780,Thousands of prisoners in Kenya reportedly vol...
79747,tim-yoc,2005,4929,Sudan's President Omar al-Bashir has announced...
103806,per-nam,Charm Tong,6406,President Bush met at the White House Monday w...
127855,tim-moy,August,7855,"A Vietnamese man has died from bird flu, raisi..."
143223,org-nam,Kempthorne,8786,U.S. authorities started a man-made flood in t...
154951,geo-nam,Europe,9512,People around the world are taking part in mem...


geo-nam    48875
org-nam    26195
gpe-nam    20436
per-nam    14962
tim-dow    11402
tim-dat     9663
per-tit     9163
per-fam     8044
tim-yoc     5290
tim-moy     4261
per-giv     2341
tim-clo      726
art-nam      501
eve-nam      290
nat-nam      238
tim-nam      134
eve-ord      107
org-leg       59
per-ini       55
per-ord       38
tim-dom       10
art-add        1
per-mid        1
Name: label, dtype: int64

Now that we have the data loaded up, let's split it into training and testing datasets.  Also because there is a large amount of data let's stick some of it in a holdout dataset just to speed up the process.  We aren't actually using the holdout dataset in this case but it's large enough that we could test against slices of it as a final step if we were so inclined.

SpaCy recommends at least 500 or so examples for a label.  Let's cut out anything with less then 5000 just to reduce the amount of training we need.  This "trimmer" removes the other labels from the data.

In [5]:
tallies = df.label.value_counts()
included_labels = tallies[tallies>5000]

def trimmer(data, included_labels_):    
    for line in data:
        paragraph = line[0]
        ners = [match for match in line[1] if match[2] in included_labels_]
        yield(paragraph, {"entities": ners})


train_data = list(trimmer(DATA[:], list(included_labels.index)))
train_data, holdout_data = train_test_split(train_data, train_size=0.6)
train_data, test_data = train_test_split(train_data, train_size=0.6)

Data is ready and we instantiate a new SpaCy model to hold the named entity recognizer and assign it the labels to train on.

In [6]:
nlp = spacy.blank('en')
ner = nlp.create_pipe("ner")

for label in included_labels.index:
    ner.add_label(label)

nlp.add_pipe(ner)

In [7]:
def train_ner(nlp, train_data, n=10, batch_size=8):
    optimizer = nlp.begin_training()
    losses = []

    # tqdm gives progress bar, otherwise this is just a for loop 
    iterations = tqdm(range(n))
    for i in iterations:
        data_shuffle = train_data[:]
        Random(i).shuffle(data_shuffle)
        # using i as a random seed so the results will be reproducable
        loss = {}
        batches = minibatch(data_shuffle, size=batch_size)
        n_iterations = -(-len(train_data)//batch_size) # progress bar length
        inner_loop = tqdm(batches, total=n_iterations, leave=False)
        for batch in inner_loop:
            text, annotations = zip(*batch)
            nlp.update(text, annotations, sgd=optimizer, drop=0.25, losses=loss)

        # display the most recent loss value
        iterations.set_description("Loss: " + str([round(value,2) for value in loss.values()]))
        losses.append(loss)

    # show the losses over time so we know if we stopped learning before the end
    display(losses)
    
train_ner(nlp, train_data)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=450), HTML(value='')))

HBox(children=(IntProgress(value=0, max=450), HTML(value='')))

HBox(children=(IntProgress(value=0, max=450), HTML(value='')))

HBox(children=(IntProgress(value=0, max=450), HTML(value='')))

HBox(children=(IntProgress(value=0, max=450), HTML(value='')))

HBox(children=(IntProgress(value=0, max=450), HTML(value='')))

HBox(children=(IntProgress(value=0, max=450), HTML(value='')))

HBox(children=(IntProgress(value=0, max=450), HTML(value='')))

HBox(children=(IntProgress(value=0, max=450), HTML(value='')))

HBox(children=(IntProgress(value=0, max=450), HTML(value='')))




[{'ner': 57266.19267361566},
 {'ner': 30280.414518548645},
 {'ner': 25568.995411677228},
 {'ner': 23502.255086849887},
 {'ner': 21990.28781383906},
 {'ner': 20800.14545613487},
 {'ner': 20216.874722369786},
 {'ner': 19200.00733258798},
 {'ner': 18528.596530127033},
 {'ner': 17936.92538771001}]

In [8]:
nlp.to_disk("./exported_model")

# nlp = nlp.from_disk("./exported_model")
# optimizer = nlp.resume_training()

In [9]:
def ner_evaluation(model, test_data):
    """This evaluates the precision and recall scores in two ways.  The phrase score is
    for an exact match between the labeled string and the correct string.  The label
    score is based on whether the identified label was actually present in the sentence."""
    
    
    label_tp = label_fp = label_fn = phrase_tp = phrase_fp = phrase_fn = 0
    
    for test in tqdm(test_data):
        text = test[0]
        ners = test[1]['entities']
        true_phrases = [text[ner[0]:ner[1]] for ner in ners]
        true_labels = [ner[2] for ner in ners]
        
        found_phrases = [ ent.text for ent in model(text).ents ]
        found_labels = [ ent.label_ for ent in model(text).ents ]
        correct = [ner[2] for ner in ners]
        
        for phrase in found_phrases:
            if phrase in true_phrases:
                phrase_tp += 1
            else:
                phrase_fp += 1
        
        for phrase in found_phrases:
            if phrase not in found_phrases:
                phrase_fn += 0
            
        for ent in found_labels:
            if ent in true_labels:
                label_tp += 1
            else:
                label_fp += 1
        
        for ner in true_labels:
            if ner not in found_labels:
                label_fn += 0
            
    results = {}
    
    results['phrase_precision'] = phrase_tp / (phrase_tp + phrase_fp)
    results['phrase_recall'] = phrase_tp / (phrase_tp + phrase_fn)
    results['label_precision'] = label_tp / (label_tp + label_fp)
    results['label_recall'] = label_tp / (label_tp + label_fn)
    
    return results


In [10]:
ner_evaluation(nlp, test_data)

HBox(children=(IntProgress(value=0, max=2400), HTML(value='')))




{'phrase_precision': 0.9221655948982043,
 'phrase_recall': 1.0,
 'label_precision': 0.9731035034821357,
 'label_recall': 1.0}