In [4]:
import pandas as pd
import os
import re
from tqdm import notebook
import spacy
import ast
import numpy as np


## Loading in Dataset

In [5]:
df = pd.read_csv('../clean_data.csv')
notebook.tqdm.pandas()

  from pandas import Panel


## Spacy

In [15]:
#!python -m spacy download nl_core_news_sm
nlp = spacy.load('nl_core_news_sm')

In [30]:
def levenshtein_ratio_and_distance(s, t, ratio_calc = False):
    """ levenshtein_ratio_and_distance:
        Calculates levenshtein distance between two strings.
        If ratio_calc = True, the function computes the
        levenshtein distance ratio of similarity between two strings
        For all i and j, distance[i,j] will contain the Levenshtein
        distance between the first i characters of s and the
        first j characters of t
    """
    # Initialize matrix of zeros
    rows = len(s)+1
    cols = len(t)+1
    distance = np.zeros((rows,cols),dtype = int)

    # Populate matrix of zeros with the indeces of each character of both strings
    for i in range(1, rows):
        for k in range(1,cols):
            distance[i][0] = i
            distance[0][k] = k

    # Iterate over the matrix to compute the cost of deletions,insertions and/or substitutions    
    for col in range(1, cols):
        for row in range(1, rows):
            if s[row-1] == t[col-1]:
                cost = 0 # If the characters are the same in the two strings in a given position [i,j] then the cost is 0
            else:
                # In order to align the results with those of the Python Levenshtein package, if we choose to calculate the ratio
                # the cost of a substitution is 2. If we calculate just distance, then the cost of a substitution is 1.
                if ratio_calc == True:
                    cost = 2
                else:
                    cost = 1
            distance[row][col] = min(distance[row-1][col] + 1,      # Cost of deletions
                                 distance[row][col-1] + 1,          # Cost of insertions
                                 distance[row-1][col-1] + cost)     # Cost of substitutions
    if ratio_calc == True:
        # Computation of the Levenshtein Distance Ratio
        Ratio = ((len(s)+len(t)) - distance[row][col]) / (len(s)+len(t))
        return Ratio
    else:
        # print(distance) # Uncomment if you want to see the matrix showing how the algorithm computes the cost of deletions,
        # insertions and/or substitutions
        # This is the minimum number of edits needed to convert string a to string b
        return distance[row][col]

In [50]:
def compare(ner, true, distance):
    tp = 0
    fp = 0
    fn = 0
    check = []
    corrector = []
    for x in ner:
        # Check Removed Last Names
        if type(x) == tuple:
            if x[0] in true:
                tp += 1
                corrector.append(x[0])
            elif x[1] in true:
                tp += 1
                corrector.append(x[1])
            else:
                switch = True
                for y in true:
                    if levenshtein_ratio_and_distance(x[0].lower(),y.lower()) <= distance:
                        tp += 1
                        switch = False
                        check.append((x[0],y))
                        corrector.append(y)
                    elif levenshtein_ratio_and_distance(x[1].lower(),y.lower()) <= distance:
                        tp += 1
                        switch = False
                        check.append((x[1],y))
                        corrector.append(y)
                if switch == True:
                    fp += 1
        
        # Check Normal Names
        elif x in true:
            tp += 1
            corrector.append(x)
        else:
            switch = True
            for y in true:
                if levenshtein_ratio_and_distance(x.lower(),y.lower()) <= distance:
                    tp += 1
                    switch = False
                    corrector.append(y)
                    check.append((x,y))
            if switch == True:
                fp += 1
            
    for x in true:
        if x not in corrector:
            fn += 1
    return tp, fp, fn, check
    

In [51]:
def spacy_test(text):
    doc = nlp(text)
    holder = []
    for ent in doc.ents:
        if ent.label_ == 'PER' and ' ' in ent.text:
            if re.search("[A-Z][a-z ]+[A-Z]\w+ [A-Z]\w+", ent.text) != None:
                holder.append((ent.text, re.sub(' [A-Z]\w+$', '', ent.text)))
            else:
                holder.append(ent.text)
    return holder
    

In [52]:
def true_name_getter(true):
    holder = []
    for x in ast.literal_eval(true):
        if x['tussenvoegsel'] != None:
            holder.append(x['voornaam'] + " " + x['tussenvoegsel'] + " " + x['achternaam'])
        elif x['voornaam'] and x['achternaam'] != None:
            holder.append(x['voornaam'] + " " + x['achternaam'])
    return holder

In [63]:
def NER_test(row):
    ner = spacy_test(row.text)
    true = true_name_getter(row.namen)
    tp, fp, fn, check = compare(ner, true, 3)
    #return {'tp' : tp, 'fp': fp, 'fn': fn}
    return {'tp' : tp, 'fp': fp, 'fn': fn, 'check': check}

In [64]:
ner_result = df.progress_apply(NER_test, axis=1)

HBox(children=(IntProgress(value=0, max=13063), HTML(value='')))




In [60]:
total_fp = sum([x['fp'] for x in ner_result])
total_tp = sum([x['tp'] for x in ner_result])
total_fn = sum([x['fn'] for x in ner_result])

In [61]:
total_fp,total_tp,total_fn, total_tp / (total_tp + total_fp), total_tp / (total_tp + total_fn)

(167997, 47121, 30955, 0.2190472205952082, 0.6035273323428454)

### Spacy Tests

#### Base SpaCy
TP: 23862  
FP: 190907  
FN: 45972  
Recall: 0.34169602199501675  
Precision: 0.11110542024221373  

#### SpaCy met achternaam removal
TP: 26937  
FP: 187832  
FN: 45074  
Recall: 0.3740678507450251  
Precision: 0.12542312903631345  

#### SpaCy met Levenshtein distance 2 en achternaam removal
TP: 41931  
FP: 173001  
FN: 34235  
Recall: 0.5505212299451199  
Precision: 0.1950896097370331  

#### SpaCy met Levenshtein distance 3 en achternaam removal
TP: 47121  
FP: 167997  
FN: 30955  
Recall: 0.6035273323428454  
Precision: 0.2190472205952082  

In [29]:
NER_test(df.iloc[0])

{'tp': 3, 'fp': 13, 'fn': 1}

In [67]:
ner_result[1]['check']

[('Cornelis van Omneren', 'Cornelis van Ommeren'),
 ('Anna Johanna Roman', 'Anna Johanna Romano'),
 ('Maurina Roman', 'Maurina Romano')]