In [1]:
import pandas as pd
import pandas as pd
import numpy as np
from tqdm import notebook
import ast
import re
from fuzzywuzzy import fuzz
from datetime import datetime, timedelta
import spacy
from sklearn.model_selection import train_test_split

In [None]:
def fleiss_kappa(ratings, n):
    '''
    Computes the Fleiss' kappa measure for assessing the reliability of
    agreement between a fixed number n of raters when assigning categorical
    ratings to a number of items.

    Args:
        ratings: an iterable of (item, category) pairs, one pair per
            individual rating. Items and categories are discovered from
            the pairs themselves.
        n: number of raters. The formula assumes every item received
            exactly n ratings; this is not checked.
    Returns:
        the Fleiss' kappa score as a float, or nan when the score is
        undefined because every rating falls in one single category
        (expected agreement is 1, so the denominator is 0).
    Raises:
        ValueError: if n is smaller than 2 or ratings is empty.

    See also:
        http://en.wikipedia.org/wiki/Fleiss'_kappa
    '''
    if n < 2:
        raise ValueError("Fleiss' kappa needs at least 2 raters, got n=%r" % (n,))

    items = set()
    categories = set()
    n_ij = {}  # (item, category) -> number of raters who chose that category

    for i, c in ratings:
        items.add(i)
        categories.add(c)
        n_ij[(i, c)] = n_ij.get((i, c), 0) + 1

    N = len(items)
    if N == 0:
        raise ValueError("Fleiss' kappa needs at least one rating")

    # Share of all ratings that went to each category.
    p_j = {c: sum(n_ij.get((i, c), 0) for i in items) / (1.0 * n * N) for c in categories}
    # Per-item agreement among the raters.
    P_i = {i: (sum(n_ij.get((i, c), 0) ** 2 for c in categories) - n) / (n * (n - 1.0)) for i in items}

    P_bar = sum(P_i.values()) / (1.0 * N)
    P_e_bar = sum(value ** 2 for value in p_j.values())

    if P_e_bar == 1:
        # Only one category was ever used: kappa is 0/0.
        return float('nan')

    return (P_bar - P_e_bar) / (1 - P_e_bar)

In [None]:
# Worked example, kept disabled: 5 items with 10 ratings each ('yes' + 'no' counts
# add up to 10 per item), so it would need n=10 rather than the n=4 used below.
# ratings = [(1, 'yes')] * 10 + [(1, 'no')] * 0  + \
# [(2, 'yes')] * 8  + [(2, 'no')] * 2  + \
# [(3, 'yes')] * 9  + [(3, 'no')] * 1  + \
# [(4, 'yes')] * 0  + [(4, 'no')] * 10 + \
# [(5, 'yes')] * 7  + [(5, 'no')] * 3

# NOTE(review): `ratings` is not defined in this cell; in this notebook it is only
# built by a later cell (4 labels per item), so this fails on a fresh top-to-bottom run.
fleiss_kappa(ratings, 4)


In [None]:
# Presumably Patrick's annotations of the shared test batch (inferred from the path);
# the path is relative to the notebook's working directory.
test_pat = pd.read_json('../../Annotation/Patrick/test_batch_patrick.json')

In [None]:
# Presumably Chiel's annotations of the shared test batch (inferred from the path);
# the path is relative to the notebook's working directory.
test_chiel = pd.read_json('../../Annotation/Chiel/test_batch_chiel.json')

In [None]:
# Build the (item, label) pairs consumed by fleiss_kappa: for each of the first 554
# row positions, every rater contributes 'no' when vocop_match == 0 and 'yes' otherwise.
# Items are numbered from 1.
# NOTE(review): subset2_bar appears twice in the rater list, so the same annotations
# are counted as two raters — confirm whether a different annotator was intended.
# NOTE(review): subset2_bar is loaded in a later cell of this notebook.
ratings = []
raters = (subset2_bar, subset2_bar, test_pat, test_chiel)
for x in range(len(subset2_bar[0:554])):
    for rater in raters:
        label = 'no' if rater.iloc[x].vocop_match == 0 else 'yes'
        ratings.append((x + 1, label))

In [None]:
# Number of (item, label) pairs collected; the loop above adds four per item.
len(ratings)

In [None]:
# Hand calculation with the same shape as P_i in fleiss_kappa:
# (sum of squared category counts - n) / (n * (n - 1)), here with counts 6 and 548 and n = 554.
# NOTE(review): the notebook does not record what the 6 / 548 split refers to.
((6**2) + (548**2) - 554) / (554 * (554 - 1))

In [None]:
# Average of two identical agreement values (a third term is commented out).
# NOTE(review): 0.97853... appears to be the result of the hand calculation in the previous cell.
pie = 0.9785351969239005 + 0.9785351969239005# + 0.971
pie = pie/2
pie

In [None]:
# Sum of squared pooled proportions (counts 6 and 548, doubled, over 2 * 554),
# then the same (observed - expected) / (1 - expected) ratio that fleiss_kappa returns.
# Only the final expression is shown as the cell result; pce is printed explicitly.
pce = ((6 + 6) / (2* 554)) ** 2 + ((548 + 548) / (2* 554)) ** 2 
print(pce)
(pie - pce) / (1-pce)

In [2]:
# Load the completed annotation batches (JSON) into one DataFrame per file.
# `path` is relative to the notebook's working directory.
path = 'Completed Batches/'
subset1_pat = pd.read_json(path + 'batch1_patrick_result.json')
subset2_pat = pd.read_json(path + 'batch2_patrick_result.json')
subset3_pat = pd.read_json(path + 'batch3_patrick_result.json')
subset4_pat = pd.read_json(path + 'batch4_patrick_result.json')
subset5_pat = pd.read_json(path + 'batch5_patrick_result.json')
subset1_thom = pd.read_json(path + 'batch1_thom_result.json')
subset1_chiel = pd.read_json(path + 'batch1_chiel_result.json')
subset2_chiel = pd.read_json(path + 'batch2_chiel_result.json')
subset3_chiel = pd.read_json(path + 'batch3_chiel_result.json')
# NOTE(review): the *_bar variable numbers do not follow the file names
# (subset2_bar <- result.json, subset3_bar <- batch2_result.json, ...,
# subset6_bar <- batch2_thom_result.json) — confirm the mapping is intentional.
subset1_bar = pd.read_json(path + 'batch1_result.json')
subset2_bar = pd.read_json(path + 'result.json')
subset3_bar = pd.read_json(path + 'batch2_result.json')
subset4_bar = pd.read_json(path + 'batch3_result.json')
subset5_bar = pd.read_json(path + 'batch4_result.json')
subset6_bar = pd.read_json(path + 'batch2_thom_result.json')

In [3]:
# Number of rows in subset2_bar whose vocop_match is 0 (the value treated as "no match" elsewhere in this notebook).
len(subset2_bar[subset2_bar.vocop_match == 0])

991

#### Correcties:
Chiel:
- Christiaan Andriesz 1416
- Jan de Bruijn 1463
- Abraham van de Heuvel (1823 super interessant twijfelgeval)
- Pieter van Essen (1825 super interessant twijfelgeval)
- Hans Pietersz 3792

Patrick:
- Jan Smit 4194 (zelfde als de interessante gevallen van Chiel)
- Hendrik van Alen 593 (zelfde schip maar naam is net even anders)

In [4]:
# Manual corrections to individual annotations, addressed by index label.
# Assigning 0 clears a match; assigning a (name, number) pair sets one — the second
# element is what later cells compare against a candidate's 'index'.
# NOTE(review): the "Correcties" notes list "Hans Pietersz 3792", but the row corrected
# below in subset2_chiel is 3729 — confirm which label is right.
# NOTE(review): the notes also mention rows 1823, 1825 and 4194, which are not changed here.
subset1_pat.at[593, 'vocop_match'] = 0
subset1_chiel.at[1416,'vocop_match'] = 0
subset1_chiel.at[1463,'vocop_match'] = 0
subset2_chiel.at[3729, 'vocop_match'] = 0
subset4_bar.at[9025, 'vocop_match'] = ('Johannis Oosterhoff', 140442)
subset5_bar.at[9927, 'vocop_match'] = ('Wiggert Andresz', 374198)
subset6_bar.at[2670, 'vocop_match'] = ('Johan Christiaan Richter', 379155)
subset6_bar.at[2688, 'vocop_match'] = ('Jacob Meijer', 178549)
subset6_bar.at[2689, 'vocop_match'] = ('Juriaan Bartels', 178548)
subset6_bar.at[2690, 'vocop_match'] = ('Willem Borsenius', 283179)
subset6_bar.at[2691, 'vocop_match'] = ('Jan de Vaij', 182941)

In [5]:
# Stack the batches per annotator, then stack all annotators into final_df.
# pd.concat replaces the chained DataFrame.append calls (append was deprecated in
# pandas 1.4 and removed in 2.0). Index labels are kept as loaded, exactly as
# append did, so labels are not guaranteed to be unique in the result.
final_pat = pd.concat([subset1_pat, subset2_pat, subset3_pat, subset4_pat, subset5_pat])

final_chiel = pd.concat([subset1_chiel, subset2_chiel, subset3_chiel])

final_barry = pd.concat([subset1_bar, subset2_bar, subset3_bar, subset4_bar, subset5_bar, subset6_bar])

final_df = pd.concat([final_pat, final_chiel, subset1_thom, final_barry])

In [6]:
# Disabled: drop rows with duplicated index labels and export final_df to CSV.
# final_df = final_df.loc[~final_df.index.duplicated(keep='first')]
# final_df.index.name = 'index'
# final_df.to_csv('../final_df.csv', index=True)
# Only the last expression of a cell is displayed, so the vocop_match filter on the
# next line produces no visible output; the table shown is the uuid lookup.
final_df[final_df.vocop_match != 0]
final_df[final_df.uuid == '1bf4e148-14a4-dd4d-ef24-503a157766dc']

Unnamed: 0,uuid,rubriek,notaris,inventarisNr,akteNr,akteType,datering,taal,beschrijving,namen,urls,text,name,data_matches,data_entry,vocop_match
5963,1bf4e148-14a4-dd4d-ef24-503a157766dc,358,JAN VERLEIJ,11960,21944,Machtiging,1766-08-25,nederlands,"\nVOC schip Damzigt, schip Walcheren, innen va...","[{'voornaam': 'Pieter', 'tussenvoegsel': 'van'...","['KLAB06468000354.JPG', 'KLAB06468000355.JPG',...",an No: 215 Procuratie gecasseerd den 25e: aug:...,Pieter van Kerkwijk,0,0,0
5964,1bf4e148-14a4-dd4d-ef24-503a157766dc,358,JAN VERLEIJ,11960,21944,Machtiging,1766-08-25,nederlands,"\nVOC schip Damzigt, schip Walcheren, innen va...","[{'voornaam': 'Pieter', 'tussenvoegsel': 'van'...","['KLAB06468000354.JPG', 'KLAB06468000355.JPG',...",an No: 215 Procuratie gecasseerd den 25e: aug:...,Hendrik Dames,"[{'index': 55709, 'name_original': 'Hendrik Da...","{'55709': {'ships': [], 'rank': [], 'location'...","[Hendrik Dames, 80776]"
5965,1bf4e148-14a4-dd4d-ef24-503a157766dc,358,JAN VERLEIJ,11960,21944,Machtiging,1766-08-25,nederlands,"\nVOC schip Damzigt, schip Walcheren, innen va...","[{'voornaam': 'Pieter', 'tussenvoegsel': 'van'...","['KLAB06468000354.JPG', 'KLAB06468000355.JPG',...",an No: 215 Procuratie gecasseerd den 25e: aug:...,Willem de Kemp,0,0,0


In [None]:
# Inspect the VOC record at position 80776, the number stored in the vocop_match shown above for Hendrik Dames.
# NOTE(review): `voc` is only loaded in a later cell of this notebook.
voc.iloc[80776]

In [None]:
# Look up VOC rows by their VOCOP_id value.
# NOTE(review): `voc` is only loaded in a later cell of this notebook.
voc[voc.VOCOP_id == 422482]

In [7]:
# Dutch spaCy pipeline with the parser, tagger and textcat components disabled;
# the matching helpers in this notebook only read token texts, indices and character offsets.
nlp = spacy.load('nl_core_news_sm', disable=['parser', 'tagger', 'textcat'])

In [None]:
def match_neighbour(start, end, true, prev, distance):
    '''
    Extend a character span over the tokens that follow `prev` for as long as
    they fuzzy-match the remaining words of a phrase.

    Args:
        start: character offset where the span begins (returned unchanged)
        end: character offset where the span currently ends
        true: list of the phrase words still to be matched, in order
        prev: the last token matched so far; must expose .i, .doc, .nbor()
        distance: minimum fuzz.ratio score (0-100) for a word to count as a match
    Returns:
        ((start, end), i): the possibly extended span and the index of the
        last token that was matched.
    '''
    remaining = true
    token = prev
    # Stop when the phrase is used up or the document has no further token.
    while remaining != [] and token.i != len(token.doc) - 1:
        following = token.nbor()
        if fuzz.ratio(remaining[0].lower(), following.text.lower()) < distance:
            break
        end = following.idx + len(following)
        remaining = remaining[1:]
        token = following
    return (start, end), token.i

def match_finder(row, match, distance):
    '''
    Find fuzzy occurrences of the given phrases in row.text.

    Each token whose text matches the first word of a phrase starts a
    candidate; match_neighbour extends it over the following words. A
    candidate is kept when the raw text under its span also scores at least
    `distance` against the whole phrase.

    Args:
        row: object with a .text string (the deed transcription)
        match: list of phrases; entries that are not str are ignored
        distance: minimum fuzz.ratio score (0-100)
    Returns:
        list of the matched substrings of row.text, in order of discovery.

    Note: a token can only start a candidate when its index is greater than
    the index of the last token consumed by a previous candidate. That marker
    starts at 0, so the very first token never starts a match.
    '''
    doc = nlp(row.text)
    spans = []
    last_consumed = 0
    for token in doc:
        for phrase in match:
            if not (token.i > last_consumed and type(phrase) == str):
                continue
            words = phrase.split(' ')
            if fuzz.ratio(words[0].lower(), token.text.lower()) < distance:
                continue
            span, last_consumed = match_neighbour(
                token.idx, token.idx + len(token), words[1:], token, distance)
            if span in spans:
                continue
            if fuzz.ratio(row.text[span[0]:span[1]], phrase) >= distance:
                spans.append(span)
    return [row.text[begin:stop] for begin, stop in spans]

def fix_entries_ships(row, x):
    '''
    Look up a VOC candidate's ships, rank and place of origin in the deed text.

    Args:
        row: object with a .text string
        x: mapping with the keys 'shipOutward', 'shipReturn', 'dutch_rank'
            and 'placeOfOrigin'
    Returns:
        dict with the keys 'ships', 'location' and 'rank', each holding the
        list of substrings found by match_finder. Ships and rank use a
        threshold of 80, place of origin uses 90.

    Keyword matching (chamber names, "oostindische compagnie", ...) is
    disabled, so the result has no 'keywords' entry.
    '''
    ship_hits = match_finder(row, [x['shipOutward'], x['shipReturn']], 80)
    rank_hits = match_finder(row, [x['dutch_rank']], 80)
    origin_hits = match_finder(row, [x['placeOfOrigin']], 90)
    return {'ships': ship_hits, 'location': origin_hits, 'rank': rank_hits}

In [8]:
# VOC records; other cells address rows by position (voc.iloc[...]) and by VOCOP_id.
# NOTE(review): earlier cells in this notebook already use `voc`, so this load belongs near the top.
voc = pd.read_csv('../vocop_clustered_dutchrank.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Build one feature row per (notary person, candidate VOC record) pair.
# A pair is kept only when the deed is dated at most 90 days before the candidate's
# departure or at most 90 days after the candidate's return.
# Requires the two-argument fix_entries_ships(row, x) that returns a dict
# with 'ships', 'rank' and 'location'.
ranking_columns = ['name_ratio', 'name_count', 'day_dif', 'location', 'rank', 'numships', 'keywords', 'match', 'notary_id', 'voc_id']
name_dict = voc.fullNameNormalized.value_counts()
candidates = final_df[final_df.data_matches != 0]
# Stand-in for a missing or unparsable date; it is far outside any 90-day window.
fallback_date = datetime(year=1, month=1, day=1)

# Rows are collected in lists and turned into a DataFrame once, instead of
# appending to a DataFrame inside the loop.
feature_rows = []
feature_labels = []
for x in notebook.tqdm(candidates.itertuples(), total=candidates.shape[0]):
    notary_date = datetime.strptime(x.datering, '%Y-%m-%d')
    notid = x.Index
    for match in x.data_matches:
        index = str(x.Index) + '_' + str(match['index'])
        vocid = match['index']
        name_ratio = max(fuzz.ratio(x.name, match['name_original']), fuzz.ratio(x.name, match['name_normalized']))
        name_count = name_dict[match['name_normalized']]

        # 1 when this candidate is the one the annotators selected for the person.
        if x.vocop_match != 0 and match['index'] == x.vocop_match[1]:
            matched = 1
        else:
            matched = 0

        try:
            out_date = datetime.strptime(match['date_out'], '%Y-%m-%d')
        except (KeyError, TypeError, ValueError):
            out_date = fallback_date
        try:
            return_date = datetime.strptime(match['date_return'], '%Y-%m-%d')
        except (KeyError, TypeError, ValueError):
            return_date = fallback_date

        days_since_out = (notary_date - out_date).days
        days_since_return = (notary_date - return_date).days
        if -90 <= days_since_out <= 0:
            day_dif = -days_since_out
        elif 0 <= days_since_return <= 90:
            day_dif = days_since_return
        else:
            continue

        # Defaults cover candidates without a data_entry record; `keywords` used to be
        # left unset on that path.
        numships = 0
        rank = 0
        location = 0
        keywords = 0
        if str(match['index']) in x.data_entry:
            ship_matches = fix_entries_ships(x, match)
            numships = len(ship_matches['ships'])
            rank = 1 if ship_matches['rank'] != [] else 0
            location = 1 if ship_matches['location'] != [] else 0
            # The helper does not return 'keywords' while keyword matching is disabled;
            # treat a missing entry as "no keyword found" instead of raising KeyError.
            keywords = 1 if ship_matches.get('keywords', []) != [] else 0

        feature_rows.append({
            'name_ratio': name_ratio,
            'name_count': name_count,
            'day_dif': day_dif,
            'location': location,
            'rank': rank,
            'numships': numships,
            'keywords': keywords,
            'match': matched,
            'notary_id': notid,
            'voc_id': vocid
        })
        feature_labels.append(index)

ranking_df = pd.DataFrame(feature_rows, columns=ranking_columns, index=feature_labels)

In [None]:
# Write the candidate-pair features to CSV; index=False leaves out the
# '<notary index>_<voc index>' row labels (notary_id and voc_id are columns of their own).
ranking_df.to_csv('../preranking.csv', index=False)
#final_df.to_csv('../final_df.csv')
#ranking_df[ranking_df.match == 1]

In [53]:
# 70/30 split of the rows of final_df with a fixed random_state.
# NOTE(review): the split is per row, and several rows can share one deed (uuid),
# so persons from the same deed may end up on both sides — confirm this is acceptable.
train, test = train_test_split(final_df, test_size=0.3, random_state=10)

In [None]:
def match_neighbour(start, end, true, prev, distance):
    '''
    Grow the span (start, end) over the tokens after `prev` while each next
    token fuzzy-matches the next word in `true`.

    This re-defines a function of the same name that appears earlier in the
    notebook, with the same behaviour.

    Args:
        start: character offset of the span start (never changed)
        end: current character offset of the span end
        true: list of words that still have to be matched, in order
        prev: last matched token; needs .i, .doc and .nbor()
        distance: minimum fuzz.ratio score (0-100) to accept a word
    Returns:
        ((start, end), i) with the final span and the index of the last
        matched token.
    '''
    words_left = true
    current = prev
    while True:
        if words_left == []:
            break
        if current.i == len(current.doc) - 1:
            break
        candidate = current.nbor()
        score = fuzz.ratio(words_left[0].lower(), candidate.text.lower())
        if not score >= distance:
            break
        end = candidate.idx + len(candidate)
        words_left = words_left[1:]
        current = candidate
    return (start, end), current.i

def match_finder(row, match, distance):
    '''
    Find fuzzy occurrences of the given phrases in row.text.

    For every token and every phrase, the texts of as many consecutive tokens
    as the phrase has words are joined with single spaces and compared with
    the phrase. A window that scores at least `distance` is turned into a
    character span starting at the token; the span is kept when the raw text
    under it also scores at least `distance` against the phrase.

    Args:
        row: object with a .text string
        match: list of phrases; entries that are not str are ignored
        distance: minimum fuzz.ratio score (0-100)
    Returns:
        list of the matched substrings of row.text, in order of discovery.

    Windows that would run past the end of the text are compared as an empty
    string. This also covers texts with fewer tokens than the phrase has
    words, which previously raised IndexError.
    '''
    doc = nlp(row.text)
    locs = []
    for token in doc:
        for x in match:
            if type(x) != str:
                continue
            width = len(x.split(' '))
            if token.i + width <= len(doc):
                new_token_text = ' '.join(doc[token.i + y].text for y in range(width))
            else:
                new_token_text = ''
            if fuzz.ratio(x.lower(), new_token_text.lower()) >= distance:
                # The span length assumes the tokens are separated by single spaces;
                # the check against the raw text below filters out spans where they are not.
                result = (token.idx, token.idx + len(new_token_text))
                if result not in locs:
                    if fuzz.ratio(row.text[result[0]:result[1]].lower(), x.lower()) >= distance:
                        locs.append(result)
    return [row.text[begin:stop] for begin, stop in locs]

def fix_entries_ships(row, x, column):
    '''
    Search the deed text for the value stored under `column` in a VOC record.

    Args:
        row: object with a .text string
        x: VOC record supporting x[column]
        column: name of the field to look up
    Returns:
        the list produced by match_finder (threshold 80), or '' when the
        stored value is a float (presumably a missing value read as NaN).
    '''
    value = x[column]
    if type(value) == float:
        return ''
    return match_finder(row, [value], 80)

In [None]:
# Spot check: search the text of row 2690 for the phrase 'Huis ter Duine'.
# NOTE(review): final_df is concatenated without resetting the index, so .loc[2690]
# may select more than one row — confirm the label is unique.
match_finder(final_df.loc[2690],['Huis ter Duine'], 80)

In [None]:
# Inspect the row(s) of final_df with index label 3171.
final_df.loc[3171]

## Dedupe Record Linking

In [54]:
# Build the two record tables for the training split and write them to CSV:
#   - notary side (data_n): one record per person row, with the ships / rank / place
#     that were found in the deed text for any date-compatible VOC candidate;
#   - VOC side (data_v): one record per date-compatible VOC candidate.
# cluster_id links the two: an annotated match gets the notary row's index label,
# every other VOC candidate gets the negated VOCOP_id.
# Requires the three-argument fix_entries_ships(row, x, column).
# NOTE(review): records are keyed by index label; labels repeated in `df` overwrite each other.
data_n = {}
data_v = {}
df = train
dedupe_columns = ['cluster_id', 'name', 'rank', 'location', 'ship_out', 'ship_return']
# Stand-in for a missing or unparsable date; it is far outside any 90-day window.
fallback_date = datetime(year=1, month=1, day=1)
for x in notebook.tqdm(range(len(df))):
    row = df.iloc[x]
    notary_id = row.name  # index label of the row (Series.name), not the 'name' column
    holder = []
    if row.data_matches != 0:
        notary_date = datetime.strptime(row.datering, '%Y-%m-%d')

        for match in row.data_matches:
            try:
                out_date = datetime.strptime(match['date_out'], '%Y-%m-%d')
            except (KeyError, TypeError, ValueError):
                out_date = fallback_date
            try:
                return_date = datetime.strptime(match['date_return'], '%Y-%m-%d')
            except (KeyError, TypeError, ValueError):
                return_date = fallback_date
            # Keep candidates whose departure is at most 90 days after the deed,
            # or whose return is at most 90 days before it.
            near_departure = -90 <= (notary_date - out_date).days <= 0
            near_return = 0 <= (notary_date - return_date).days <= 90
            if not (near_departure or near_return):
                continue

            voc_row = voc.iloc[match['index']]
            found_ship_out = fix_entries_ships(row, voc_row, 'shipOutward')
            found_ship_return = fix_entries_ships(row, voc_row, 'shipReturn')
            found_loc = fix_entries_ships(row, voc_row, 'placeOfOrigin')
            found_rank = fix_entries_ships(row, voc_row, 'dutch_rank')
            holder.append({'rank':found_rank, 'location':found_loc, 'found_ship_return': found_ship_return, 'found_ship_out': found_ship_out})

            voc_key = -int(voc_row.VOCOP_id)
            if row.vocop_match != 0 and match['index'] == row.vocop_match[1]:
                cluster = notary_id
            else:
                cluster = voc_key
            data_v[voc_key] = {'cluster_id': cluster, 'name': str(voc_row.fullNameOriginal), 'rank': str(voc_row.dutch_rank),
                               'location': str(voc_row.placeOfOrigin), 'ship_out': str(voc_row.shipOutward), 'ship_return': str(voc_row.shipReturn)}

    # Merge what was found across all candidates of this person. Nothing found -> None.
    # This also sets ship_out to None when empty; the old check assigned ship_return instead.
    # Values are sorted so the joined string does not depend on set iteration order.
    merged = {}
    for key in ('rank', 'location', 'found_ship_out', 'found_ship_return'):
        joined = ' | '.join(sorted({text.lower() for found in holder for text in found[key]}))
        merged[key] = joined if joined != '' else None
    data_n[notary_id] = {'cluster_id': notary_id, 'name': row['name'], 'rank': merged['rank'], 'location': merged['location'],
                         'ship_out': merged['found_ship_out'], 'ship_return': merged['found_ship_return']}

d_n = {column: [record[column] for record in data_n.values()] for column in dedupe_columns}
d_v = {column: [record[column] for record in data_v.values()] for column in dedupe_columns}

dedupe_notary = pd.DataFrame(d_n)
dedupe_notary.index.name = 'index'
dedupe_notary.to_csv('../train_notary.csv')
dedupe_voc = pd.DataFrame(d_v)
dedupe_voc.index.name = 'index'
dedupe_voc.to_csv('../train_voc.csv')

HBox(children=(IntProgress(value=0, max=6832), HTML(value='')))




In [55]:
# Build the two record tables for the test split and write them to CSV:
#   - notary side (data_n): one record per person row, with the ships / rank / place
#     that were found in the deed text for any date-compatible VOC candidate;
#   - VOC side (data_v): one record per date-compatible VOC candidate.
# cluster_id links the two: an annotated match gets the notary row's index label,
# every other VOC candidate gets the negated VOCOP_id.
# Requires the three-argument fix_entries_ships(row, x, column).
# NOTE(review): records are keyed by index label; labels repeated in `df` overwrite each other.
data_n = {}
data_v = {}
df = test
dedupe_columns = ['cluster_id', 'name', 'rank', 'location', 'ship_out', 'ship_return']
# Stand-in for a missing or unparsable date; it is far outside any 90-day window.
fallback_date = datetime(year=1, month=1, day=1)
for x in notebook.tqdm(range(len(df))):
    row = df.iloc[x]
    notary_id = row.name  # index label of the row (Series.name), not the 'name' column
    holder = []
    if row.data_matches != 0:
        notary_date = datetime.strptime(row.datering, '%Y-%m-%d')

        for match in row.data_matches:
            try:
                out_date = datetime.strptime(match['date_out'], '%Y-%m-%d')
            except (KeyError, TypeError, ValueError):
                out_date = fallback_date
            try:
                return_date = datetime.strptime(match['date_return'], '%Y-%m-%d')
            except (KeyError, TypeError, ValueError):
                return_date = fallback_date
            # Keep candidates whose departure is at most 90 days after the deed,
            # or whose return is at most 90 days before it.
            near_departure = -90 <= (notary_date - out_date).days <= 0
            near_return = 0 <= (notary_date - return_date).days <= 90
            if not (near_departure or near_return):
                continue

            voc_row = voc.iloc[match['index']]
            found_ship_out = fix_entries_ships(row, voc_row, 'shipOutward')
            found_ship_return = fix_entries_ships(row, voc_row, 'shipReturn')
            found_loc = fix_entries_ships(row, voc_row, 'placeOfOrigin')
            found_rank = fix_entries_ships(row, voc_row, 'dutch_rank')
            holder.append({'rank':found_rank, 'location':found_loc, 'found_ship_return': found_ship_return, 'found_ship_out': found_ship_out})

            voc_key = -int(voc_row.VOCOP_id)
            if row.vocop_match != 0 and match['index'] == row.vocop_match[1]:
                cluster = notary_id
            else:
                cluster = voc_key
            data_v[voc_key] = {'cluster_id': cluster, 'name': str(voc_row.fullNameOriginal), 'rank': str(voc_row.dutch_rank),
                               'location': str(voc_row.placeOfOrigin), 'ship_out': str(voc_row.shipOutward), 'ship_return': str(voc_row.shipReturn)}

    # Merge what was found across all candidates of this person. Nothing found -> None.
    # This also sets ship_out to None when empty; the old check assigned ship_return instead.
    # Values are sorted so the joined string does not depend on set iteration order.
    merged = {}
    for key in ('rank', 'location', 'found_ship_out', 'found_ship_return'):
        joined = ' | '.join(sorted({text.lower() for found in holder for text in found[key]}))
        merged[key] = joined if joined != '' else None
    data_n[notary_id] = {'cluster_id': notary_id, 'name': row['name'], 'rank': merged['rank'], 'location': merged['location'],
                         'ship_out': merged['found_ship_out'], 'ship_return': merged['found_ship_return']}

d_n = {column: [record[column] for record in data_n.values()] for column in dedupe_columns}
d_v = {column: [record[column] for record in data_v.values()] for column in dedupe_columns}

dedupe_notary = pd.DataFrame(d_n)
dedupe_notary.index.name = 'index'
dedupe_notary.to_csv('../test_notary.csv')
dedupe_voc = pd.DataFrame(d_v)
dedupe_voc.index.name = 'index'
dedupe_voc.to_csv('../test_voc.csv')

HBox(children=(IntProgress(value=0, max=2928), HTML(value='')))




In [None]:
# Number of rows in the test split whose vocop_match is not 0.
# NOTE(review): later cells in this notebook assign other values to the name `test`;
# this only works while `test` is still the DataFrame from train_test_split.
len(test[test.vocop_match != 0])

In [None]:
# Scratch arithmetic; the notebook does not record what 16 and 26 refer to.
16 / 26

In [None]:
# Scratch arithmetic; the notebook does not record what 26 and 0.68 refer to.
26 * 0.68

In [None]:
# Rows of the most recently built notary table that have a ship_return value (not missing).
dedupe_notary[dedupe_notary.isna().ship_return == False]

In [None]:
# Display the most recently built VOC-side table.
dedupe_voc

In [49]:
def _add_window_match(row, doc, locs, first, phrase, distance):
    '''
    Compare `phrase` with the token window of `doc` starting at token index
    `first` and record the window's character span in `locs` on a match.

    The window has as many tokens as `phrase` has space-separated words and is
    joined with single spaces. Nothing is recorded when the window would run
    past the end of `doc`, when the span is already in `locs`, or when the raw
    text under the span scores below `distance` against the phrase.
    '''
    width = len(phrase.split(' '))
    if first + width > len(doc):
        # No full window left; an empty window could not reach the threshold.
        return
    window = ' '.join(doc[first + offset].text for offset in range(width))
    if fuzz.ratio(phrase.lower(), window.lower()) < distance:
        return
    begin = doc[first].idx
    # The span length assumes single spaces between tokens; the check against the
    # raw text below rejects spans where that does not hold.
    span = (begin, begin + len(window))
    if span in locs:
        return
    if fuzz.ratio(row.text[span[0]:span[1]].lower(), phrase.lower()) >= distance:
        locs.append(span)


def match_finder(row, match, distance):
    '''
    Find fuzzy occurrences of phrases in row.text, with a fallback for ships.

    Plain mode: `match` is a list of phrases; every str entry is searched
    at every token position.

    Ship mode: `match` is [ship_name, 'schip', (ship_outward, ship_return)].
    Only ship_name is searched first. If it is not found, every other ship
    name of VOC records that began or ended service in the deed's year or
    the year before/after is tried, but only directly after a token that
    fuzzy-matches 'schip'. The candidate's own two ships are left out of that
    fallback list.

    Args:
        row: object with .text and, in ship mode, .jaar (deed year as a string)
        match: list as described above; entries that are not str are ignored
        distance: minimum fuzz.ratio score (0-100)
    Returns:
        list of the matched substrings of row.text, in order of discovery.

    Windows that would run past the end of the text are skipped. This covers
    texts shorter than the phrase and a 'schip' token at the very end of the
    text, both of which previously raised IndexError. Fallback ship names are
    tried in sorted order so the result does not depend on set iteration order.
    '''
    if len(match) > 1 and match[1] == 'schip':
        true = [match[0]]
        ship_check = True
    else:
        true = match
        ship_check = False
    doc = nlp(row.text)
    locs = []
    for token in doc:
        for x in true:
            if type(x) == str:
                _add_window_match(row, doc, locs, token.i, x, distance)
    if ship_check and locs == []:
        # NOTE(review): the years are compared as strings — confirm the dtype of
        # yearBeginService / year_end_service_improved in `voc`.
        years = [row.jaar, str(int(row.jaar) - 1), str(int(row.jaar) + 1)]
        knowledgebase = voc[(voc.yearBeginService.isin(years)) |
                            (voc.year_end_service_improved.isin(years))
                           ]
        ships = set(knowledgebase.shipOutward.str.lower().to_list() +
                    knowledgebase.shipReturn.str.lower().to_list())
        if len(match) > 2:
            ships.discard(str(match[2][0]).lower())
            ships.discard(str(match[2][1]).lower())
        # Missing ship names come through as non-str values and are dropped here.
        ship_names = sorted(name for name in ships if type(name) == str)
        for token in doc:
            if fuzz.ratio('schip', token.text.lower()) >= distance:
                for x in ship_names:
                    _add_window_match(row, doc, locs, token.i + 1, x, distance)
    entities = [row.text[span[0]:span[1]] for span in locs]
    return entities

def fix_entries_ships(row, x, column):
    '''
    Search the deed text for the value stored under `column` in a VOC record.

    Args:
        row: object with a .text string (and .jaar for the ship columns)
        x: VOC record supporting x[column], x['shipOutward'], x['shipReturn']
        column: name of the field to look up
    Returns:
        For 'shipOutward' / 'shipReturn': the match_finder result in ship
        mode, or, when the stored ship is a float, the occurrences of the
        word 'schip' itself.
        For any other column: the match_finder result, or '' when the stored
        value is a float (presumably a missing value read as NaN).
        All searches use a threshold of 80.

    NOTE(review): for a missing ship the search phrase is just 'schip', so
    every "schip" in the text is returned as if it were a ship name — confirm
    this is intended.
    '''
    value = x[column]
    has_value = type(value) != float
    if column in ('shipOutward', 'shipReturn'):
        if has_value:
            return match_finder(row, [value, 'schip', (x['shipOutward'], x['shipReturn'])], 80)
        return match_finder(row, ['schip'], 80)
    if has_value:
        return match_finder(row, [value], 80)
    return ''

In [44]:
# Deed year as a string: the first four characters of `datering`. The ship fallback
# in match_finder reads it as row.jaar.
# NOTE(review): in top-to-bottom order train/test are split off final_df before this
# column exists — confirm they are re-created after this cell.
final_df['jaar'] = [x[0:4] for x in final_df.datering]

In [45]:
# Ad-hoc check: lower-cased outward ships of VOC records that began or ended service
# in year `y` or the year before/after.
# NOTE(review): `y` is not assigned in this cell. The recorded TypeError shows it held a
# dict when the cell ran (other cells use `y` as a loop variable over data_matches);
# it needs to be a year string such as a value of final_df.jaar.
voc[(voc.yearBeginService.isin([y, str(int(y) - 1), str(int(y) + 1)])) | 
    (voc.year_end_service_improved.isin([y, str(int(y) - 1), str(int(y) + 1)]))
   ].shipOutward.str.lower().to_list()

TypeError: int() argument must be a string, a bytes-like object or a number, not 'dict'

In [50]:
# For every annotated match, search the deed text for the outward ship of the
# linked VOC record.
#   total: searches that returned a list (anything but '')
#   c:     searches that found at least one occurrence
c = 0
total = 0
annotated = final_df[final_df.vocop_match != 0]
for x in notebook.tqdm(annotated.itertuples(), total=annotated.shape[0]):
    for y in x.data_matches:
        if y['index'] == x.vocop_match[1]:
            # Named `found`, not `test`, so the `test` split made by train_test_split
            # is not overwritten.
            found = fix_entries_ships(x, voc.iloc[y['index']], 'shipOutward')
            if found != '':
                total += 1
            if found not in [[], '']:
                c += 1
                print(found)

print(c, total)

HBox(children=(IntProgress(value=0, max=102), HTML(value='')))

['Nieuland']
['Zaamslad']
['Nieuland']
['Diemen', 'dien']
['Anna']
['Anna']
['dAnna']
['blommendaal']
['Sloterdijk']
['Sulpenbirg']
['rotterdam']
['perzijneburg']
['Eendragt']
['Westriesland']
['Kievitsheuwel']
['Hillegom']
['aarmslag', 'Laamslagh']
['guntersteijn']
['Schellag']
['Anna']
['Leijden']
['ouwerkerk']
['Eendragt']
['haerlem']
['Voorzigtigheijd']
['Nennieuwenkerk']
['Vrouw Petronella']
['Admiraal de Ruijter']
['Borselen']
['gustaaff Willen']
['Langeuijk']
['Buijdorp']
['Buijdorp']
['sgravezonde']
['aschat']
['s Lands welvaren']
['Velsen']
['Cortuijn']
['brouwen']
['huijs te Manpad']
['tevreden']
['Kasteel van Tilburg']
['Stadwijk']
['Westerveld']
['Cattendijk']
['Borselen']
['getrouwigheijd']
['Walcheren']
['Schip Noordnieuwland', 'Noordnieuwland voor']
['Pallas']
['de vrouwellisabeth']
['Vredenhoffte']
['Akerendam']
['Akerendam']
['nieuw Walcheren']
['Diemen']
['Sloterdijk']
['Kasteel van Tilburg']
['Sloten']
['Duijnenburg']
['Leijden']
['Baarsande']
['Luxemburg']
['Langewi

In [24]:
# For every annotated match, search the deed text for the return ship of the
# linked VOC record.
#   total: searches that returned a list (anything but '')
#   c:     searches that found at least one occurrence
c = 0
total = 0
annotated = final_df[final_df.vocop_match != 0]
for x in notebook.tqdm(annotated.itertuples(), total=annotated.shape[0]):
    for y in x.data_matches:
        if y['index'] == x.vocop_match[1]:
            # Named `found`, not `test`, so the `test` split made by train_test_split
            # is not overwritten.
            found = fix_entries_ships(x, voc.iloc[y['index']], 'shipReturn')
            if found != '':
                total += 1
            if found not in [[], '']:
                c += 1
                print(found)

print(c, total)

HBox(children=(IntProgress(value=0, max=102), HTML(value='')))

['Zaamslad']
['Zaamslad']
['Zaamslad']
['Diemen', 'dien']
['Schip']
['dAnna']
['Schip']
['Sloterdijk']
['Standvastigheijd']
['Sulpenbirg']
['Jaers', 'Jager']
['Schip']
['perzijneburg']
['Schip']
['Marienbss']
['Westriesland']
['Kievitsheuwel']
['Hillegom']
['aarmslag', 'Laamslagh']
['Arnesteijn']
['Schellag']
['Schip']
['Leijden']
['bosbeek', 'Bosbeek']
['haerlem']
['osdorp']
['Noterdijk']
['Leckerlust']
['Admiraal de Ruijter']
['Wildrijk']
['Leekenland']
['Bosschenhoven']
['vrouw Elisabeth']
['vrouw Elisabeth']
['sgravezonde']
['Schip']
['s Lands welvaren']
['Velsen']
['Schip']
['Kasteel van Tilburg']
['Duijnenbur']
['de drie papezaijen']
['Westerveld']
['de vrouw ElisabethOorothea', 'vrouw ElisabethOorothea voor']
['Borselen']
['Jerusalen']
['Damzigt']
['Noordnieuwland voor']
['Pallas']
['Schip', 'Schip', 'Schip']
['Schip']
['Akerendam']
['Akerendam']
['krabbendijke']
['Crabbendijk']
['Crabbendijk']
['Anna']
['Sloterdijk']
['Schip']
['Sloten']
['Duijnenburg']
['Schip']
['Baarsande']


In [None]:
# For every annotated match, search the deed text for the rank (dutch_rank) of the
# linked VOC record.
#   total: records that have a rank to search for (search did not return '')
#   c:     searches that found at least one occurrence
c = 0
total = 0
annotated = final_df[final_df.vocop_match != 0]
for x in notebook.tqdm(annotated.itertuples(), total=annotated.shape[0]):
    for y in x.data_matches:
        if y['index'] == x.vocop_match[1]:
            # Named `found`, not `test`, so the `test` split made by train_test_split
            # is not overwritten.
            found = fix_entries_ships(x, voc.iloc[y['index']], 'dutch_rank')
            if found != '':
                total += 1
            if found not in [[], '']:
                c += 1

print(c, total)

In [None]:
# For every annotated match, search the deed text for the place of origin of the
# linked VOC record.
#   total: records that have a place to search for (search did not return '')
#   c:     searches that found at least one occurrence
c = 0
total = 0
annotated = final_df[final_df.vocop_match != 0]
for x in notebook.tqdm(annotated.itertuples(), total=annotated.shape[0]):
    for y in x.data_matches:
        if y['index'] == x.vocop_match[1]:
            # Named `found`, not `test`, so the `test` split made by train_test_split
            # is not overwritten.
            found = fix_entries_ships(x, voc.iloc[y['index']], 'placeOfOrigin')
            if found != '':
                total += 1
            if found not in [[], '']:
                c += 1

print(c, total)

In [None]:
# Inspect the row(s) of final_df with index label 7280.
final_df.loc[7280]