In [2]:
import pandas as pd
import numpy as np
from tqdm import notebook
import ast
import re
from fuzzywuzzy import fuzz
from datetime import datetime, timedelta
import spacy

In [3]:
def translate_rank(rank, translator):
    if rank in translator['(all occupations)'].tolist():
        return translator.index[translator['(all occupations)'] == rank][0]
    else:
        return rank

In [4]:
notebook.tqdm.pandas()
clean = pd.read_csv('../../clean_data.csv')
voc_df = pd.read_csv('../vocop-clustered-new.csv', sep='	')
uuid = []
name = []
for y, z in notebook.tqdm(clean.iterrows()):
    for x in ast.literal_eval(z.namen):
        if x['tussenvoegsel'] != None:
            name.append(x['voornaam'] + " " + x['tussenvoegsel'] + " " + x['achternaam'])
            uuid.append(z.uuid)
        elif x['voornaam'] and x['achternaam'] != None:
            name.append(x['voornaam'] + " " + x['achternaam'])
            uuid.append(z.uuid)
name_list = pd.DataFrame(data={'uuid':uuid, 'name':name}, columns=['uuid', 'name'])
name_df = clean.merge(name_list)
rangen = pd.read_excel('../../vocop_rangen.xlsx', index_col=0)
voc_df['dutch_rank'] = [translate_rank(x, rangen) for x in notebook.tqdm(voc_df['rank'].tolist())]

  from pandas import Panel
  interactivity=interactivity, compiler=compiler, result=result)


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=774200), HTML(value='')))




In [5]:
def fuzzy_search(name, voc, distance):
    names = np.where((voc.fullNameNormalized.apply(fuzz.ratio, args=[name]) >= 90) | 
                     (voc.fullNameOriginal.dropna().apply(fuzz.ratio, args=[name]) >= 90))
    return names[0]

def find_matches(names, voc, distance):
    name_list = {}
    final = []
    for x in notebook.tqdm(names):
        if x in name_list:
            final.append((name_list[x]))
        else:
            result = fuzzy_search(x, voc, distance)
            name_list[x] = result
            final.append(result)
    return final

def get_voc_data(matches, voc):
    if len(matches) != 0:
        data = []
        for index in matches:
            holder = {}
            holder['index'] = index
            holder['name_original'] = voc.iloc[index].fullNameOriginal
            holder['name_normalized'] = voc.iloc[index].fullNameNormalized
            holder['date_out'] = voc.iloc[index].date_begin_service_complete
            holder['date_return'] = voc.iloc[index].date_end_service_complete
            holder['ship_out'] = voc.iloc[index].shipOutward
            holder['ship_return'] = voc.iloc[index].shipReturn
            holder['rank'] = voc.iloc[index]['dutch_rank']
            holder['place_of_origin'] = voc.iloc[index].placeOfOrigin
            data.append(holder)
        return data
    else:
        return 0

def get_notary_matches(row, nlp):
    holder = {}
    if row.data_matches != 0:
        for x in row.data_matches:
            ship_out, ship_return = str(x['ship_out']), str(x['ship_return']) 
            rank, place_of_origin = str(x['rank']), str(x['place_of_origin'])
            detected_ships = []
            detected_ranks = []
            detected_location = []
            
            # Find Ships
            if fuzz.partial_ratio(ship_out.lower(), str(row.beschrijving).lower()) >= 80 or fuzz.partial_ratio(ship_out.lower(), row.text.lower()) >= 80:
                    detected_ships.append(ship_out)
            if fuzz.partial_ratio(ship_return.lower(), str(row.beschrijving).lower()) >= 80 or fuzz.partial_ratio(ship_return.lower(), row.text.lower()) >= 80:
                    detected_ships.append(ship_return)

            # Find Rank
            if fuzz.partial_ratio(rank.lower(), str(row.beschrijving).lower()) >= 80 or fuzz.partial_ratio(rank.lower(), row.text.lower()) >= 80:
                detected_ranks.append(rank)

            # Find Place of Origin
            if fuzz.partial_ratio(place_of_origin.lower(), str(row.beschrijving).lower()) >= 80 or fuzz.partial_ratio(place_of_origin.lower(), row.text.lower()) >= 80:
                detected_location.append(place_of_origin)
            else:
                for ent in nlp(row.text).ents:
                    if ent.label_ == 'LOC':
                        if fuzz.ratio(place_of_origin, ent.text) >= 85:
                            detected_location.append(place_of_origin)
            holder[x['index']] = {'ships':detected_ships, 'rank':detected_ranks, 'location':detected_location}
        return holder
    else:
        return 0

def create_annotation_subset(notary, voc):
    nlp = spacy.load('nl_core_news_sm')
    return_df = notary.copy()
    return_df['index_matches'] = find_matches(return_df.name, voc, 80)
    return_df['data_matches'] = return_df.index_matches.progress_apply(get_voc_data, args=[voc])
    return_df = return_df.drop('index_matches', axis=1)
    return_df['data_entry'] = return_df.progress_apply(get_notary_matches, args=[nlp], axis=1)
    return return_df

# def annotate(row):
#     notary_date = datetime.strptime(row.datering, '%Y-%m-%d')
#     if row.data_matches != '0':
#         #print(row.data_matches)
#         #for person in row.data_matches:
#         for person in row.data_matches:
#             try:
#                 out_date = datetime.strptime(person['date_out'], '%Y-%m-%d')
#             except:
#                 out_date = datetime(year=1, month=1, day =1 )
#             try:
#                 return_date = datetime.strptime(person['date_return'], '%Y-%m-%d')
#             except:
#                 return_date = datetime(year=1, month=1, day =1 )
#             if (notary_date - out_date).days not in range(0, -91, -1) and (notary_date - return_date).days not in range(0, 91):
#                 #print('Skipped match')
#                 continue
                
#             else:
#                 print('{:10} | {:30} | {}'.format(' ', "Notary Information " + str(row.name), 'VOC Information ' + str(person['index'])))
#                 print('-' * 108)
#                 print('{:10} | {:30} | {} / {}'.format('Name', row['name'], person['name_original'], person['name_normalized']))
#                 print('{:10} | {:30} | {} / {}'.format('Dates', row.datering, person['date_out'], person['date_return']))
#                 print('{:10} | {:30} | {} / {}'.format('Ships', ' / '.join(row['data_entry'][str(person['index'])]['ships']), person['ship_out'], person['ship_return']))
#                 print('{:10} | {:30} | {}'.format('Rank', ' / '.join(row['data_entry'][str(person['index'])]['rank']), person['rank']))
#                 print('{:10} | {:30} | {}'.format('Locations', ' / '.join(row['data_entry'][str(person['index'])]['location']), person['place_of_origin']))
#                 check = False
#                 print('Are these persons the same? y/n:')
#                 while check != True:
#                     answer = input()
#                     if answer == 'y':
#                         return (person['name_original'], person['index'])
#                         check = True
#                     elif answer == 'n':
#                         check = True
#                     elif answer == 'text':
#                         print(row.text)
#                     else:
#                         print("Invalid input please enter 'y', 'n', or 'text' without the quotes.")
#         return None
                
#     else:
#         return None

In [6]:
def annotate(df, prev=None):
    if prev is not None:
        final = prev.copy()
        start = len(final)
    else:
        final = pd.DataFrame(columns = df.columns)
        start = 0
    holder = []
    #stop = False
    #while stop != True:
    for row in df[start:].itertuples():
        notary_date = datetime.strptime(row.datering, '%Y-%m-%d')
        if row.data_matches != '0':
            #print(row.data_matches)
            #for person in row.data_matches:
            for person in row.data_matches:
                try:
                    out_date = datetime.strptime(person['date_out'], '%Y-%m-%d')
                except:
                    out_date = datetime(year=1, month=1, day =1 )
                try:
                    return_date = datetime.strptime(person['date_return'], '%Y-%m-%d')
                except:
                    return_date = datetime(year=1, month=1, day =1 )
                if (notary_date - out_date).days not in range(0, -91, -1) and (notary_date - return_date).days not in range(0, 91):
                    #print('Skipped match')
                    continue

                else:
                    print('{:10} | {:30} | {}'.format(' ', "Notary Information " + str(row.name), 'VOC Information ' + str(person['index'])))
                    print('-' * 108)
                    print('{:10} | {:30} | {} / {}'.format('Name', row.name, person['name_original'], person['name_normalized']))
                    print('{:10} | {:30} | {} / {}'.format('Dates', row.datering, person['date_out'], person['date_return']))
                    print('{:10} | {:30} | {} / {}'.format('Ships', ' / '.join(row.data_entry[str(person['index'])]['ships']), person['ship_out'], person['ship_return']))
                    print('{:10} | {:30} | {}'.format('Rank', ' / '.join(row.data_entry[str(person['index'])]['rank']), person['rank']))
                    print('{:10} | {:30} | {}'.format('Locations', ' / '.join(row.data_entry[str(person['index'])]['location']), person['place_of_origin']))
                    check = False
                    print('Are these persons the same? y/n:')
                    while check != True:
                        answer = input()
                        if answer == 'y':
                            holder.append((person['name_original'], person['index']))
                            final = final.append(df.loc[row.Index])
                            check = True
                        elif answer == 'n':
                            check = True
                        elif answer == 'text':
                            print(row.text)
                        elif answer == 'stop':
                            final['vocop_match'][start:] = holder
                            return final
                        else:
                            print("Invalid input please enter 'y', 'n', 'stop', or 'text' without the quotes.")
                    if answer == 'y':
                        break
                    else:
                        continue
            if answer == 'y':
                continue
            else:
                holder.append(0)
                final = final.append(df.loc[row.Index])
        else:
            holder.append(0)
            final = final.append(df.loc[row.Index])
    final['vocop_match'][start:] = holder
    return final

In [2]:
#henk1 = create_annotation_subset(name_df[0:10].copy(), voc_df)
#henk1 = create_annotation_subset(name_df[0:5000].copy(), voc_df)
#henk1 = create_annotation_subset(name_df[54465:54467].copy(), voc_df)

In [11]:
#henk1.to_json('subset1.json')
#subset = pd.read_json('result.json')

In [8]:
t30t35 = create_annotation_subset(name_df[33280:33880].copy(), voc_df)

HBox(children=(IntProgress(value=0, max=600), HTML(value='')))




HBox(children=(IntProgress(value=0, max=600), HTML(value='')))




HBox(children=(IntProgress(value=0, max=600), HTML(value='')))




In [9]:
t30t35.to_json('subset3.json')

In [12]:
sub = pd.read_json('subset1.json')
sub2 = pd.read_json('subset2.json')
sub3 = pd.read_json('subset3.json')

In [13]:
sub3[sub3.data_matches != 0]

Unnamed: 0,uuid,rubriek,notaris,inventarisNr,akteNr,akteType,datering,taal,beschrijving,namen,urls,text,name,data_matches,data_entry
33280,45598a08-aa02-671a-bfb7-2852cfe2c3c1,358,JAN VERLEIJ,11894,11432,Kwitantie,1745-08-28,nederlands,,"[{'voornaam': 'Jan', 'tussenvoegsel': None, 'a...","['KLAB05414000308.JPG', 'KLAB05414000309.JPG',...",No: 390 Quitantie gepasseert den 28 Augustus 1...,Jan Ridder,"[{'index': 56263, 'name_original': 'Jan Bidder...","{'56263': {'ships': ['nan'], 'rank': [], 'loca..."
33281,45598a08-aa02-671a-bfb7-2852cfe2c3c1,358,JAN VERLEIJ,11894,11432,Kwitantie,1745-08-28,nederlands,,"[{'voornaam': 'Jan', 'tussenvoegsel': None, 'a...","['KLAB05414000308.JPG', 'KLAB05414000309.JPG',...",No: 390 Quitantie gepasseert den 28 Augustus 1...,Daniel Voerman,"[{'index': 129945, 'name_original': 'Daniel Ve...","{'129945': {'ships': ['nan'], 'rank': [], 'loc..."
33282,45598a08-aa02-671a-bfb7-2852cfe2c3c1,358,JAN VERLEIJ,11894,11432,Kwitantie,1745-08-28,nederlands,,"[{'voornaam': 'Jan', 'tussenvoegsel': None, 'a...","['KLAB05414000308.JPG', 'KLAB05414000309.JPG',...",No: 390 Quitantie gepasseert den 28 Augustus 1...,Roeloff Roeloffsz,"[{'index': 1511, 'name_original': 'Roeloff Roe...","{'1511': {'ships': ['nan'], 'rank': [], 'locat..."
33283,45598a08-aa02-671a-bfb7-2852cfe2c3c1,358,JAN VERLEIJ,11894,11432,Kwitantie,1745-08-28,nederlands,,"[{'voornaam': 'Jan', 'tussenvoegsel': None, 'a...","['KLAB05414000308.JPG', 'KLAB05414000309.JPG',...",No: 390 Quitantie gepasseert den 28 Augustus 1...,Cornelis de Koster,"[{'index': 1636, 'name_original': 'Cornelis de...","{'1636': {'ships': ['nan'], 'rank': [], 'locat..."
33285,470f3dc5-439c-b1ed-770a-4f1bf4eb7e61,358,JAN VERLEIJ,11894,11437,Machtiging,1745-09-04,nederlands,"\nschip Knapenhoff, VOC kamer Hoorn; penningen...","[{'voornaam': 'Teeke', 'tussenvoegsel': 'de', ...","['KLAB05414000346.JPG', 'KLAB05414000347.JPG',...",No: 403 Procuratie gepasseert den 4 Sept: 1745...,Jan Jansz,"[{'index': 151, 'name_original': 'Jan Jansz.',...","{'151': {'ships': ['WAARDE'], 'rank': [], 'loc..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33863,5503051d-245f-fd97-ba4f-571f5efad6ab,358,JAN VERLEIJ,11901,65566,Insinuatie,1747-06-23,nederlands,\nonkosten ligdagen\n,"[{'voornaam': 'Fredrik', 'tussenvoegsel': None...","['A27120000239.JPG', 'A27120000240.JPG', 'A271...",No: 246 Insinuatie gedaa Jan j 1717ED Horerens...,Fredrik Ruurts,"[{'index': 222062, 'name_original': 'Fredrik R...","{'222062': {'ships': [], 'rank': [], 'location..."
33867,64e9d378-5f02-299c-630f-eb10e744e6f7,358,JAN VERLEIJ,11901,65597,Machtiging,1747-07-24,nederlands,"\nschip Godschalksoort, VOC kamer Amsterdam; s...","[{'voornaam': 'Jacobus', 'tussenvoegsel': 'de'...","['A27120000402.JPG', 'A27120000403.JPG']",No: 298 Procuratie Gepasseert den 24: Jacq 175...,Jacobus de Heer,"[{'index': 7410, 'name_original': 'Jacobus de ...","{'7410': {'ships': [], 'rank': [], 'location':..."
33870,73fa9c4c-2e2b-8e14-b3f1-c712f58ce3ea,358,JAN VERLEIJ,11901,65622,Machtiging,1747-06-26,nederlands,,"[{'voornaam': 'Alexander', 'tussenvoegsel': 'L...","['A27120000251.JPG', 'A27120000252.JPG']",N: 240 Procuratie Jaaij 1755AchoepHeeen den 2 ...,Mathijs Roos,"[{'index': 10959, 'name_original': 'Matthijs R...","{'10959': {'ships': ['nan'], 'rank': [], 'loca..."
33871,73fa9c4c-2e2b-8e14-b3f1-c712f58ce3ea,358,JAN VERLEIJ,11901,65622,Machtiging,1747-06-26,nederlands,,"[{'voornaam': 'Alexander', 'tussenvoegsel': 'L...","['A27120000251.JPG', 'A27120000252.JPG']",N: 240 Procuratie Jaaij 1755AchoepHeeen den 2 ...,Jan Droge,"[{'index': 15930, 'name_original': 'Jan Drooge...","{'15930': {'ships': ['nan'], 'rank': [], 'loca..."


In [20]:
sub[0:661].to_json('batch1_patrick.json')
sub[661:1350].to_json('batch1_thom.json')
sub[1350:1924].to_json('batch1_chiel.json')

In [25]:
sub[1924:2529].to_json('batch2_patrick.json')
sub[2529:3134].to_json('batch2_thom.json')
sub[3134:3797].to_json('batch2_chiel.json')

In [45]:
sub[3793:4416].to_json('batch3_patrick.json')
sub[4416:5000].to_json('batch3_thom.json')
sub2[0:611].to_json('batch3_chiel.json')

In [13]:
#sub2[1226:1936].to_json('batch1.json')
sub2[3258:3844].to_json('batch2.json')
sub2[3844:4458].to_json('batch3.json')
sub2[4458:10000].to_json('batch4.json')

In [8]:
sub2[1226:1936].to_json('batch4_patrick.json')
sub2[1936:2598].to_json('batch5_patrick.json')
sub2[2598:3258].to_json('batch6_patrick.json')

In [6]:
voc_df = pd.read_csv('../vocop-clustered-new.csv', sep='	')

  interactivity=interactivity, compiler=compiler, result=result)
