In [1]:
import pandas as pd
import os
import re
from tqdm import notebook, trange, tqdm
import spacy
import ast
import numpy as np
from fuzzywuzzy import fuzz
import itertools
import logging
from seqeval.metrics import f1_score, precision_score, recall_score
import argparse
import glob
import random
from sklearn.model_selection import KFold

spacy.require_gpu()


True

## Loading in Dataset

In [2]:
df = pd.read_csv('../clean_data.csv')
notebook.tqdm.pandas()

  from pandas import Panel


## Spacy

In [3]:
#!python -m spacy download nl_core_news_sm

#nlp = spacy.load('../spacy1')
#nlp = spacy.load('nl_core_news_sm')
#nlp = spacy.load('../spacytest')
#nlp = spacy.load('../d=0.8_distance=80')
nlp = spacy.load('../Spacy models/kfold_3', disable=['parser', 'tagger', 'textcat'])

In [4]:
for ent in nlp(df.text[15]).ents:
    if ent.label_ == 'PERSON':
        print(ent.text, ent.label_)

Jacobus Ligthart PERSON
Lobbetje Hendriks klaeren PERSON
Coenraed Ligthart PERSON
huijbert Jacobs PERSON
Claas Jacobs PERSON
Jakobus Ligthart PERSON
Cobbeje Hendriks PERSON
Hujbert Jacobsen PERSON
Claas Jacobs PERSON


In [5]:
def compare(ner, true, distance):
    tp = 0
    fp = 0
    fn = 0
    check = []
    corrector = []
    for x in true:
        # Check Normal Names
        if x in ner:
            tp += 1
            corrector.append(x)
            continue 

        # Check Removed Last Names 
        for y in ner:
            if type(y) == tuple:
                if y[0] == x:
                    tp += 1
                    corrector.append(y[0])
                    break
                elif y[1] == x:
                    tp += 1
                    corrector.append(y[1])
                    break
                else:
                    if fuzz.ratio(x,y[0]) >= distance:
                        tp += 1
                        check.append((x,y[1]))
                        corrector.append(x)
                        break
                    elif fuzz.ratio(x[1],y) >= distance:
                        tp += 1
                        check.append((x[0],y))
                        corrector.append(x)
                        break

            else:
                if fuzz.ratio(x.lower(),y.lower()) >= distance:
                    tp += 1
                    corrector.append(x)
                    check.append((x,y))
                    break
            
        if x not in corrector:
            fn += 1
    fp = len(ner) - tp
    return tp, fp, fn, check
    

In [15]:
def spacy_name_getter(text, nlp):
    doc = nlp(text)
    holder = []
    for ent in doc.ents:
        if ent.label_ == 'PERSON' and ' ' in ent.text:
            if fuzz.ratio(ent.text, 'Jan Verleij') <= 90:
                holder.append(ent.text)
    return holder
    

In [7]:
def bertje_test(ner_clean):
    holder = []
    prev = None
    for x in ast.literal_eval(ner_clean):
        if x[-1].strip() == 'B-PER':
            holder.append(x[0])
            prev = x[-1].strip()
        elif x[-1].strip() == 'I-PER':
            if prev == 'B-PER' or prev == 'I-PER':
                holder[-1] += ' ' + x[0]
                prev = x[-1].strip()
            else:
                holder.append(x[0])
                prev = x[-1].strip()
        else:
            prev = x[-1].strip()
        
    return holder

In [8]:
def true_name_getter(true):
    holder = []
    for x in ast.literal_eval(true):
        if x['tussenvoegsel'] != None:
            holder.append(x['voornaam'] + " " + x['tussenvoegsel'] + " " + x['achternaam'])
        elif x['voornaam'] and x['achternaam'] != None:
            holder.append(x['voornaam'] + " " + x['achternaam'])
    return holder

In [9]:
def NER_test(row, model, levenshtein, nlp):
    if model == 'SpaCy':
        ner = spacy_name_getter(row.text, nlp)
    if model == 'BERTje':
        ner = bertje_test(row.ner_clean)
    true = true_name_getter(row.namen)
    tp, fp, fn, check = compare(ner, true, levenshtein)
    #return {'tp' : tp, 'fp': fp, 'fn': fn}
    return {'tp' : tp, 'fp': fp, 'fn': fn, 'check': check}

In [16]:
ner_result = df.progress_apply(NER_test, args=('SpaCy', 90, nlp), axis=1)

HBox(children=(IntProgress(value=0, max=13063), HTML(value='')))




In [17]:
total_fp = sum([x['fp'] for x in ner_result])
total_tp = sum([x['tp'] for x in ner_result])
total_fn = sum([x['fn'] for x in ner_result])

In [18]:
total_tp, total_fp, total_fn, total_tp / (total_tp + total_fn), total_tp / (total_tp + total_fp)

(49588, 40886, 15845, 0.7578439013953204, 0.5480911643123991)

## Spacy Tests

### Oud SpaCy model
#### Base SpaCy
TP: 23862  
FP: 190907  
FN: 45972  
Recall: 0.34169602199501675  
Precision: 0.11110542024221373  

#### SpaCy met achternaam removal
TP: 26937  
FP: 187832  
FN: 45074  
Recall: 0.3740678507450251  
Precision: 0.12542312903631345  

#### SpaCy met Levenshtein distance 2 (fuzzy ratio 90) en achternaam removal
TP: 41931  
FP: 173001  
FN: 34235  
Recall: 0.5505212299451199  
Precision: 0.1950896097370331  

#### SpaCy met Levenshtein distance 3 (fuzzy ratio 85) en achternaam removal
TP: 47121  
FP: 167997  
FN: 30955  
Recall: 0.6035273323428454  
Precision: 0.2190472205952082  

#### SpaCy trained model met fuzzy ratio 90
TP: 20186  
FP: 14602  
FN: 45247  
Recall: 0.3084987697339263  
Precision: 0.5802575600781879  

#### SpaCy trained model met fuzzy ratio 80
TP: 22992  
FP: 11796  
FN: 42441  
Recall: 0.3513823300169639  
Precision: 0.6609175577785443  

### Nieuw SpaCy model
#### Base SpaCy

#### SpaCy met Levenshtein distance 2 (fuzzy ratio 90) en achternaam removal
TP: 27190  
FP: 241577  
FN: 38243  
Recall: 0.4155395595494628  
Precision: 0.10116569370495634

#### SpaCy met Levenshtein distance 2 (fuzzy ratio 80) en achternaam removal
TP: 27190  
FP: 241577  
FN: 38243  
Recall: 0.5240322161600416  
Precision: 0.12757890663660346

#### SpaCy ner trained model met fuzzy ratio 80
TP: 28833  
FP: 17218  
FN: 36600  
Recall: 0.44064921369950943  
Precision: 0.6261101821893118

#### SpaCy notaris annotatie fuzzy ratio 90 train 0.1 model met fuzzy ratio 80
TP: 35233  
FP: 20138  
FN: 30200  
Recall: 0.538459187260251  
Precision: 0.6363078145599682

#### SpaCy notaris annotatie fuzzy ratio 80 train 0.1 model met fuzzy ratio 80, dropout rate 0.5
TP: 43444  
FP: 27132  
FN: 21989  
Recall: 0.6639463267770085  
Precision: 0.6155633643164815

#### SpaCy notaris annotatie fuzzy ratio 80 train 0.1 model met fuzzy ratio 80, dropout rate 0.2
TP: 43004  
FP: 25968  
FN: 22429  
Recall: 0.6572218910947075  
Precision: 0.6234993910572406

#### SpaCy notaris annotatie fuzzy ratio 80 train 0.1 model met fuzzy ratio 80, dropout rate 0.8
TP: 48857  
FP: 38664  
FN: 16576  
Recall: 0.7466721684776795  
Precision: 0.5582317386684339

#### SpaCy notaris annotatie fuzzy ratio 80 train 0.5 model met fuzzy ratio 80, dropout rate 0.5
TP: 48857  
FP: 38664  
FN: 16576  
Recall: 0.7894850647144225  
Precision: 0.5719592704486964

#### SpaCy notaris annotatie fuzzy ratio 80 train 0.5 model met fuzzy ratio 90, dropout rate 0.5
TP: 23513  
FP: 21172  
FN: 8860  
Recall: 0.7263151391591759  
Precision: 0.5261944724180374

#### SpaCy notaris annotatie fuzzy ratio 80 train 0.5 model met fuzzy ratio 80, dropout rate 0.8
TP: 51102  
FP: 35873  
FN: 14331  
Recall: 0.7809820732657833  
Precision: 0.587548146018971

## Training Spacy

#### Creating Training Data

In [13]:
raw_train = df.sample(frac=0.01, random_state=0)
nlp = spacy.load('nl_core_news_sm')

In [15]:
def train_data_creator_raw(row, distance):
    doc = nlp(row.text)
    true = true_name_getter(row.namen)
    locs = []
    for ent in doc.ents:
        if ent.label_ == 'PERSON' and ' ' in ent.text:
            #holder.append(ent.text)
            check = False
            corrector = []
            if ent.text in true:
                check = True
            else:
                for x in true:
                    if check == False:
                        if fuzz.ratio(x, ent.text) >= distance:
                            check = True
            if check == True:
                holder = []
                for x in ent:
                    holder.append(x.i)
                locs.append((doc[holder[0]].idx, doc[holder[-1]].idx + len(doc[holder[-1]].text), 'PERSON'))
                    
           
    
    
    return (row.text, {'entities': locs})

def match_neighbour(start, end, true, prev, distance):
    if true == []:
        return (start, end, "PERSON"), prev.i
    if prev.i == len(prev.doc) - 1:
        return (start, end, "PERSON"), prev.i
    if fuzz.ratio(true[0], prev.nbor().text) >= distance:
        return match_neighbour(start, prev.nbor().idx + len(prev.nbor()), true[1:], prev.nbor(), distance)
    else:
        return (start, end, "PERSON"), prev.i

def train_data_creator(row, distance):
    true = true_name_getter(row.namen)
    doc = nlp(row.text)
    locs = []
    prev = 0
    for token in doc:
            for x in true:
                if token.i > prev:
                    split = x.split(' ')
                    if fuzz.ratio(split[0], token.text) >= distance:
                        result, prev = match_neighbour(token.idx, token.idx + len(token), split[1:], token, distance)
                        if result not in locs:
                            locs.append(result)
    return (row.text, {'entities': locs})





In [16]:
#!/usr/bin/env python
# coding: utf8
"""Example of training spaCy's named entity recognizer, starting off with an
existing model or a blank model.

For more details, see the documentation:
* Training: https://spacy.io/usage/training
* NER: https://spacy.io/usage/linguistic-features#named-entities

Compatible with: spaCy v2.0.0+
Last tested with: v2.1.0
"""
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding





# @plac.annotations(
#     model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
#     output_dir=("Optional output directory", "option", "o", Path),
#     n_iter=("Number of training iterations", "option", "n", int),
# )
def main(train=None, model=None, output_dir=None, n_iter=100, dropout=0.5, start=4.0, end=32.0, compound=1.001, test=None):
    """Load the model, set up the pipeline and train the entity recognizer."""
    TRAIN_DATA = train
    if model is not None:
        nlp = spacy.load(model, disable=['parser', 'tagger', 'textcat'])  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in notebook.tqdm(range(n_iter)):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(start, end, compound))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=dropout,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            #print("Losses", losses)

    # test the trained model
    #for text, _ in TRAIN_DATA:
        #doc = nlp(text)
        #print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        #print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        if type(test) != type(None):
            print("Loading from", output_dir)
            nlp2 = spacy.load(output_dir, disable=['parser', 'tagger', 'textcat'])
            ner_result = test.progress_apply(NER_test, args=('SpaCy', 90, nlp2), axis=1)
            total_fp = sum([x['fp'] for x in ner_result])
            total_tp = sum([x['tp'] for x in ner_result])
            total_fn = sum([x['fn'] for x in ner_result])
            print('Recall: ' + str(total_tp / (total_tp + total_fn)))
            print('Precision: ' + str(total_tp / (total_tp + total_fp)))

In [17]:
nlp = spacy.load('nl_core_news_sm', disable=['parser', 'tagger', 'textcat'])
train = [train_data_creator(x, 80) for x in notebook.tqdm(df.itertuples(),  total=df.shape[0])]


HBox(children=(IntProgress(value=0, max=13063), HTML(value='')))

KeyboardInterrupt: 

In [98]:
type(None) != type(None)

False

In [84]:
df['traindata'] = train
c= 0
for train_index, test_index in kf.split(df):
    c += 1
    df.loc[train_index].to_json('train_kfold_' + str(c) + '.json')
    df.loc[test_index].to_json('test_kfold' + str(c)+ '.json')

In [108]:
train = pd.read_json('train_kfold_1.json').traindata.tolist()
test = pd.read_json('test_kfold1.json')
#main(train=train, model='nl_core_news_sm', output_dir='../kfold_' + '1', dropout=0.5, test=test)
if type(test) != type(None):
            print("Loading from", '../kfold_' + '1')
            nlp2 = spacy.load('../kfold_' + '1', disable=['parser', 'tagger', 'textcat'])
            ner_result = test.progress_apply(NER_test, args=('SpaCy', 90, nlp2), axis=1)
            total_fp = sum([x['fp'] for x in ner_result])
            total_tp = sum([x['tp'] for x in ner_result])
            total_fn = sum([x['fn'] for x in ner_result])
            print('Recall: ' + str(total_tp / (total_tp + total_fn)))
            print('Precision: ' + str(total_tp / (total_tp + total_fp)))


Loading from ../kfold_1


HBox(children=(IntProgress(value=0, max=1307), HTML(value='')))


Recall: 0.694800301431801
Precision: 0.5690655474632761


In [112]:
train = pd.read_json('train_kfold_2.json').traindata.tolist()
test = pd.read_json('test_kfold2.json')
#main(train=train, model='nl_core_news_sm', output_dir='../kfold_' + '2', dropout=0.5, test=test)
if type(test) != type(None):
            print("Loading from", '../kfold_' + '2')
            nlp2 = spacy.load('../kfold_' + '2', disable=['parser', 'tagger', 'textcat'])
            ner_result = test.progress_apply(NER_test, args=('SpaCy', 90, nlp2), axis=1)
            total_fp = sum([x['fp'] for x in ner_result])
            total_tp = sum([x['tp'] for x in ner_result])
            total_fn = sum([x['fn'] for x in ner_result])
            print('Recall: ' + str(total_tp / (total_tp + total_fn)))
            print('Precision: ' + str(total_tp / (total_tp + total_fp)))


Loading from ../kfold_2


HBox(children=(IntProgress(value=0, max=1307), HTML(value='')))


Recall: 0.7323612417685795
Precision: 0.5351741521539871


In [113]:
train = pd.read_json('train_kfold_3.json').traindata.tolist()
test = pd.read_json('test_kfold3.json')
main(train=train, model='nl_core_news_sm', output_dir='../kfold_' + '3', dropout=0.5, test=test)

Loaded model 'nl_core_news_sm'


HBox(children=(IntProgress(value=0), HTML(value='')))


Saved model to ..\kfold_3
Loading from ..\kfold_3


HBox(children=(IntProgress(value=0, max=1307), HTML(value='')))


Recall: 0.7527012631258561
Precision: 0.5349340255245512


In [10]:
train = pd.read_json('train_kfold_4.json').traindata.tolist()
test = pd.read_json('test_kfold4.json')
main(train=train, model='nl_core_news_sm', output_dir='../kfold_' + '4', dropout=0.5, test=test)

Loaded model 'nl_core_news_sm'


HBox(children=(IntProgress(value=0), HTML(value='')))


Saved model to ..\kfold_4
Loading from ..\kfold_4


HBox(children=(IntProgress(value=0, max=1306), HTML(value='')))


Recall: 0.7279757391963608
Precision: 0.5448252383113935


In [11]:
train = pd.read_json('train_kfold_5.json').traindata.tolist()
test = pd.read_json('test_kfold5.json')
main(train=train, model='nl_core_news_sm', output_dir='../kfold_' + '5', dropout=0.5, test=test)

Loaded model 'nl_core_news_sm'


HBox(children=(IntProgress(value=0), HTML(value='')))


Saved model to ..\kfold_5
Loading from ..\kfold_5


HBox(children=(IntProgress(value=0, max=1306), HTML(value='')))


Recall: 0.7403654230001535
Precision: 0.5438141423254765


## Folds

#### 1
Recall: 0.694800301431801  
Precision: 0.5690655474632761

#### 2
Recall: 0.7323612417685795  
Precision: 0.5351741521539871

#### 3
Recall: 0.7527012631258561  
Precision: 0.5349340255245512

#### 4
Recall: 0.7279757391963608  
Precision: 0.5448252383113935

#### 5
Recall: 0.7403654230001535  
Precision: 0.5438141423254765

In [None]:
main(train=, model='nl_core_news_sm', output_dir='../d=0.8_distance=80_t=0.5', dropout=0.8, test=)

## Structured data test

In [32]:
def testje(row, distance):
    true = true_name_getter(row.namen)
    doc = nlp(row.text)
    locs = []
    for x in true:
        for token in doc:
            split = x.split(' ')
            if fuzz.ratio(split[0], token.text) >= distance:
                result, prev = match_neighbour(token.idx, token.idx + len(token), split[1:], token, distance)
                entity = row.text[result[0]:result[1]]
                if fuzz.partial_ratio(entity, x) >= 90 and x not in locs:
                    locs.append(entity)
    return (true, locs)


In [33]:
testjes = [testje(x, 80) for x in notebook.tqdm(raw_train.itertuples())]

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [31]:
sum(testjes) / len(testjes)

0.881745410425298

In [34]:
testjes

[(['Daniel de Harde', 'Anne Jacobsz de Vries', 'Volkert de Vries'],
  ['Daniel de harde', 'Anne Jacobsz', 'Anne Jacobsz', 'Volkert de Vries']),
 (['Sipke Geerts', 'Nanning ...', 'Ernst de Waal', '... Schenk'],
  ['Sipke geerts',
   'Sipke',
   'Nanning',
   'Nanning',
   'Nanning',
   'Erust de Waal',
   'Erust de Waal',
   'Ernst de Waal']),
 (['Pibe Pibesz Lankelma', 'Jan Dominicus', 'Jan Backer'],
  ['Pibe Pibesz Lankelma',
   'Jan',
   'Jan',
   'Jan Dominicus',
   'Jan',
   'Jan Backer']),
 (['Lucas Meijer',
   'Gerrit Meijer',
   'Casper Eijgenberg',
   'Margareta Horstman',
   'Gerrit Groothuijsen',
   'Simonis Hogenberg'],
  ['Lucas Meijer',
   'Gerrit',
   'gerrit Meijer',
   'Gerrit',
   'Casper Eijgenberg',
   'Margareta horstman',
   'Gerrit groothuijsen',
   'Gerrit groothuijsen',
   'Simonis hogenberg']),
 (['Pieter Gerritsz Sluijs',
   'Jacob Gerritsz',
   'Marritje Pieters Sluijs',
   'Antje Waerberg',
   'Jan Gerritsz',
   'Elisabeth Waerberg',
   'Gerritje Waerberg',
