In [13]:
import os
from tqdm.notebook import tqdm
import spacy
import random
import pandas as pd
from sklearn.model_selection import train_test_split
from spacy.util import compounding, minibatch

In [2]:
# .tags has the ner labels
# en.tok.off has the word positions
# they need to be combined to be formatted the way that spacy takes them


# Directory that is walked recursively for the corpus' raw-text files.
dirName = "../gmb-2.2.0/data/"

listOfFiles = list()
for (dirpath, dirnames, filenames) in os.walk(dirName):
    for file in filenames:
        # Match on the extension only: a substring test would also pick up
        # names such as "en.raw.bak" that merely contain ".raw".
        if file.endswith(".raw"):
            listOfFiles.append(os.path.join(dirpath, file))

# os.walk yields entries in filesystem order, which differs between machines;
# sorting keeps the document order (and anything indexed by it) stable.
listOfFiles.sort()


In [5]:
# I want to combine the ner data and position data but discard the rest for now

# Label carried by tokens that are not part of any named entity.
STOP_WORD = "O"

def get_consecutives(labels, offsets):
    """Merge runs of identically-labelled tokens into (start, end, label) spans.

    Groningen labels each NER token individually while spacy expects one span
    per entity, so consecutive tokens sharing a label are collapsed into one.

    Parameters
    ----------
    labels : sequence of str
        One label per token; ``STOP_WORD`` marks a non-entity token.
    offsets : sequence of str
        One whitespace-separated line per token whose first two fields are
        the token's start and end positions.

    Yields
    ------
    tuple of (int, int, str)
        Start position of the first token in the run, end position of the
        last token in the run, and the shared label.

    Raises
    ------
    IndexError
        If ``offsets`` has fewer entries than ``labels``.

    Notes
    -----
    Neither input is modified. Two distinct entities that are adjacent and
    carry the same label cannot be told apart and come out as a single span.
    """
    start = end = 0
    previous = STOP_WORD
    for index, head in enumerate(labels):
        offset = offsets[index].split()

        # The label changed while a span was open: that span is complete.
        if head != previous and previous != STOP_WORD:
            yield (int(start), int(end), previous)

        if head != STOP_WORD:
            # Every entity token pushes the end of the current span forward...
            end = offset[1]
            # ...but only the first token of a run sets where it starts.
            if head != previous:
                start = offset[0]
        previous = head

    # Flush a span that is still open when the tokens run out; without this
    # an entity on the final token(s) would be silently dropped.
    if previous != STOP_WORD:
        yield (int(start), int(end), previous)
    

# One (text, [(start, end, label), ...]) entry per raw document.
TRAIN_DATA = list()

for raw_file in tqdm(listOfFiles):
    with open(raw_file) as raw_text_file:
        # Newlines are swapped one-for-one with spaces, so positions in the
        # text are not shifted by the replacement.
        text = raw_text_file.read().replace('\n', ' ')

    # Drop the ".raw" extension to get the stem shared by the sibling files.
    # str.strip(".raw") removes a *set of characters* from both ends (it also
    # eats the leading dots of "../"), so it only works by accident for some
    # paths; splitext removes exactly the extension.
    base_path = os.path.splitext(raw_file)[0]

    tag_file = base_path + ".tags"
    with open(tag_file) as tag_data:
        tags_by_line = tag_data.readlines()
        # The label is taken from the fourth tab-separated column; lines with
        # too few columns (e.g. blank separator lines) are skipped.
        # NOTE(review): the guard admits 3-column lines, for which [3] would
        # raise IndexError - confirm such lines never occur in the corpus.
        ner_labels = [ line.split("\t")[3] for line in tags_by_line if len(line.split("\t"))>2 ]

    # Keep the path and the open handle under different names.
    offsets_path = base_path + ".tok.off"
    with open(offsets_path) as offsets_file:
        offsets = offsets_file.readlines()

    TRAIN_DATA.append((text, list(get_consecutives(ner_labels, offsets))))

# Every distinct label that appears in at least one extracted span.
NER_LABELS = set(ner[2] for line in TRAIN_DATA  for ner in line[1])

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




In [4]:
###################################

In [10]:
def DF_generator(DATA):
    """Flatten annotated paragraphs into one row per entity span.

    ``DATA`` is an iterable of records whose first item is the paragraph text
    and whose second item is an iterable of (start, end, label) spans.
    Each yielded row is (label, covered text, paragraph index, paragraph).
    Paragraphs without spans produce no rows.
    """
    for paragraph_index, record in enumerate(DATA):
        for span in record[1]:
            paragraph = record[0]
            begin, finish, tag = span[0], span[1], span[2]
            yield tag, paragraph[begin:finish], paragraph_index, paragraph

# One row per extracted entity span, alongside the paragraph it came from.
df = pd.DataFrame(
    data=DF_generator(TRAIN_DATA),
    columns=['label', 'ner', 'paragraph_index', 'paragraph'],
)

# Preview the table, then show how many spans carry each label.
display(df.head())
display(df['label'].value_counts())

Unnamed: 0,label,ner,paragraph_index,paragraph
0,per-giv,Benedict,0,Pope Benedict XVI has visited the Italian city...
1,per-nam,XVI,0,Pope Benedict XVI has visited the Italian city...
2,gpe-nam,Italian,0,Pope Benedict XVI has visited the Italian city...
3,geo-nam,L'Aquilla,0,Pope Benedict XVI has visited the Italian city...
4,gpe-nam,Italian,0,Pope Benedict XVI has visited the Italian city...


geo-nam    48866
org-nam    26190
gpe-nam    20434
per-nam    14962
tim-dow    11401
tim-dat     9662
per-tit     9163
per-fam     8044
tim-yoc     5287
tim-moy     4261
per-giv     2341
tim-clo      726
art-nam      501
eve-nam      290
nat-nam      238
tim-nam      134
eve-ord      107
org-leg       59
per-ini       55
per-ord       38
tim-dom       10
art-add        1
per-mid        1
Name: label, dtype: int64

In [8]:
# Start from an empty English pipeline and build an NER component for it.
nlp = spacy.blank('en')
ner = nlp.create_pipe("ner")

# Register only labels that occur more than 5000 times in the extracted spans.
tallies = df['label'].value_counts()
included_labels = tallies.loc[tallies > 5000]
for entity_label in included_labels.index:
    ner.add_label(entity_label)

# Attach the component once all of its labels are registered.
nlp.add_pipe(ner)

def trimmer(train_data, keep):
    """Yield (text, {"entities": spans}) pairs restricted to the labels in ``keep``.

    Each input record holds the paragraph text at index 0 and its
    (start, end, label) spans at index 1. Spans whose label is not in ``keep``
    are dropped; a paragraph with no surviving spans is still yielded, with an
    empty entity list.
    """
    for record in train_data:
        text = record[0]
        kept_spans = list(filter(lambda span: span[2] in keep, record[1]))
        yield (text, {"entities": kept_spans})


# Fixed seed so both splits are identical on every run for the same input
# order. Without it each run reshuffles, and paragraphs a previously saved
# model was trained on can end up in the evaluation set.
SPLIT_SEED = 42

# Keep only the spans whose label was registered with the NER component.
train_data = list(trimmer(TRAIN_DATA[:], list(included_labels.index)))

# Two successive 60/40 cuts: holdout gets 40% of the paragraphs, test gets
# 40% of the remainder, and train keeps what is left.
train_data, holdout_data = train_test_split(train_data, train_size=0.6, random_state=SPLIT_SEED)
train_data, test_data = train_test_split(train_data, train_size=0.6, random_state=SPLIT_SEED)

optimizer = nlp.begin_training()

In [28]:
# Batch-size schedule shared across all passes (created once, never reset).
sizes = compounding(1.0, 4.0, 1.001)
iterations = tqdm(range(15), position=0)
for epoch in iterations:
    # Present the examples in a new order on every pass.
    random.shuffle(train_data)
    losses = {}
    batches = minibatch(train_data, size=sizes)
    # Materialise the batches so the inner progress bar knows its length.
    inner_loop = tqdm(list(batches), leave=False, position=1)
    for batch in inner_loop:
        texts, annotations = zip(*batch)
        nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
    # Show the losses accumulated over this pass on the outer progress bar.
    rounded_losses = [round(value, 2) for value in losses.values()]
    iterations.set_description("Loss: " + str(rounded_losses))

In [9]:
# Location of the persisted pipeline.
MODEL_DIR = "./exported_model"

if os.path.isdir(MODEL_DIR):
    # A saved pipeline exists: load it. This replaces whatever weights are
    # currently in memory, including any produced by a training cell that
    # ran before this one.
    nlp = nlp.from_disk(MODEL_DIR)
else:
    # Nothing saved yet (e.g. fresh checkout): persist the in-memory pipeline
    # instead of failing on a missing directory.
    nlp.to_disk(MODEL_DIR)
optimizer = nlp.resume_training()

In [29]:
def ner_evaluation(model, test_data):
    """Compute phrase-level and label-level precision/recall for a model.

    Parameters
    ----------
    model : callable
        Called as ``model(text)``; the result must expose ``.ents``, an
        iterable of objects with ``.text`` and ``.label_`` attributes.
    test_data : iterable
        Records of ``(text, {"entities": [(start, end, label), ...]})``.

    Returns
    -------
    dict
        Keys ``phrase_precision``, ``phrase_recall``, ``label_precision`` and
        ``label_recall``. A score whose denominator is zero (nothing predicted
        or nothing annotated) is reported as 0.0.

    Notes
    -----
    Matching is by membership within each paragraph: a predicted phrase counts
    as correct if the same string is annotated anywhere in that paragraph, and
    a predicted label counts as correct if that label is annotated anywhere in
    that paragraph. Positions are not compared.
    """
    def _ratio(hits, misses):
        # Guard against an empty denominator instead of raising.
        total = hits + misses
        return hits / total if total else 0.0

    label_tp = label_fp = label_fn = phrase_tp = phrase_fp = phrase_fn = 0

    for test in tqdm(test_data):
        text = test[0]
        ners = test[1]['entities']
        true_phrases = [text[ner[0]:ner[1]] for ner in ners]
        true_labels = [ner[2] for ner in ners]

        # Run the pipeline once and reuse its entities for both views.
        found_ents = model(text).ents
        found_phrases = [ent.text for ent in found_ents]
        found_labels = [ent.label_ for ent in found_ents]

        for phrase in found_phrases:
            if phrase in true_phrases:
                phrase_tp += 1
            else:
                phrase_fp += 1

        # False negatives are annotated phrases the model failed to produce,
        # so the walk is over the gold phrases, not the predictions.
        for phrase in true_phrases:
            if phrase not in found_phrases:
                phrase_fn += 1

        for ent in found_labels:
            if ent in true_labels:
                label_tp += 1
            else:
                label_fp += 1

        for ner in true_labels:
            if ner not in found_labels:
                label_fn += 1

    results = {}

    results['phrase_precision'] = _ratio(phrase_tp, phrase_fp)
    results['phrase_recall'] = _ratio(phrase_tp, phrase_fn)
    results['label_precision'] = _ratio(label_tp, label_fp)
    results['label_recall'] = _ratio(label_tp, label_fn)

    return results

In [None]:
# Score the current pipeline on the test split; the returned dict of
# precision/recall values is the cell's displayed output.
ner_evaluation(nlp, test_data)

HBox(children=(IntProgress(value=0, max=2400), HTML(value='')))