In [223]:
import spacy
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from tqdm import notebook
from datetime import datetime
import ast
import timeit
import tensorflow as tf
from tensorflow.compat.v1 import Session
from tensorflow.python.saved_model import loader
notebook.tqdm.pandas()

In [79]:
nlp2 = spacy.load('../Spacy models/kfold_3', disable=['parser', 'tagger', 'textcat'])
notary = pd.read_csv('../clean_data.csv')
voc = pd.read_csv('vocop_clustered_dutchrank.csv')
#voc = pd.read_csv('vocop-clustered-new.csv', sep='	')
#rangen = pd.read_excel('../vocop_rangen.xlsx', index_col=0)
#voc['dutch_rank'] = [translate_rank(x, rangen) for x in notebook.tqdm(voc['rank'].tolist())]

## NER functions

In [3]:
def NER(text, nlp):
    ''' Takes in a string and uses the specified NLP model to tag entities within the string.
        Returns all entities tagged as PERSON within the string.
    '''
    # Tag text
    doc = nlp(text)
    holder = []
    
    # Append all entities tagged as PERSON
    for ent in doc.ents:
        if ent.label_ == 'PERSON' and ' ' in ent.text:
            holder.append(ent.text)
    return holder

## NEL functions

In [295]:
def serialize_example(values):
    """
    Creates a tf.Example message ready to be written to a file.
    """
    def _float_feature(value):
        """Returns an float_list from a int/float."""
        return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
    
    # Create a dictionary mapping the feature name to the tf.Example-compatible
    # data type.
    feature = {}
    for x in enumerate(values):
        feature[str(x[0] + 1)] = _float_feature(values[x[0]])

    # Create a Features message using tf.train.Example.

    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()

def ranking(matches, directory):
    tags=["serve"]
    signature_def_key = "predict"
    saved_model_dir = directory
    holder = []
    with Session() as sess:
        loader.load(sess, tags, saved_model_dir)
        serialized_examples = []
        vocop_ids = [x for x in matches]
        for x in matches:
            names = matches[x][0]
            days = matches[x][2]
            locations = matches[x][3]
            ranks = matches[x][4]
            ships = matches[x][5]
            name_count = matches[x][1]
            serialized_example = serialize_example([names, name_count, days, locations, 
                                                   ranks, ships] )
            serialized_examples.append(serialized_example)
            inputs_feed_dict = {'input_example_tensor:0': serialized_examples}
            outputs = sess.run('groupwise_dnn_v2/accumulate_scores/div_no_nan:0', feed_dict=inputs_feed_dict)
            output = [(outputs[x], vocop_ids[x]) for x in range(len(outputs))]
            holder = output
    return holder

In [310]:
def match_neighbour(start, end, true, prev, distance):
    if true == []:
        return (start, end), prev.i
    if prev.i == len(prev.doc) - 1:
        return (start, end), prev.i
    if fuzz.ratio(true[0].lower(), prev.nbor().text.lower()) >= distance:
        return match_neighbour(start, prev.nbor().idx + len(prev.nbor()), true[1:], prev.nbor(), distance)
    else:
        return (start, end), prev.i

def match_finder(row, match, distance):

    true = match
    doc = nlp2(row)
    locs = []
    prev = 0
    for token in doc:
            for x in true:
                if token.i > prev and type(x) == str:
                    split = x.split(' ')
                    if fuzz.ratio(split[0].lower(), token.text.lower()) >= distance:
                        result, prev = match_neighbour(token.idx, token.idx + len(token), split[1:], token, distance)
                        if result not in locs:
                            locs.append(result)
    entities = [row[x[0]:x[1]] for x in locs]
    return entities

def find_matches(name, knowledgebase, target_column, distance=90):
    ''' Takes in a string containing the name of a person and returns all possible matches from the
        knowledgebase based on fuzzy string matching.
    '''
    matches = knowledgebase[(knowledgebase[target_column].str.lower().astype(str).apply(fuzz.ratio, args=[name.lower()]) >= distance)]
    return matches

def convert_matches(entity, matches, name_dict):
    
    holder = {}
    notary_date = datetime.strptime(entity.datering, '%Y-%m-%d')
    
    # Try to convert the dates for each entry into datetime format
    for x in range(matches.shape[0]):
        try:
            date1 = datetime.strptime(matches['date_begin_service_complete'].iloc[x], '%Y-%m-%d')
        except:
            date1 = datetime(year=1, month=1, day =1 )
        try:
            date2 = datetime.strptime(matches['date_end_service_complete'].iloc[x], '%Y-%m-%d')
        except:
            date2 = datetime(year=1, month=1, day =1 )
            
        # Keep only matches that are within distance days from notary_date
        if abs((notary_date - date1).days) < 90 or abs((notary_date - date2).days) < 90:
            name_ratio = fuzz.ratio(entity.naam, matches['fullNameOriginal'].iloc[x])
            name_count = name_dict[matches['fullNameNormalized'].iloc[x]]
            vocid=matches['VOCOP_id'].iloc[x]
            day_dif = min([abs((notary_date - date1).days), abs((notary_date - date2).days)])
            location = len(match_finder(entity.text, [matches['placeOfOrigin'].iloc[x]], 80))
            rank = len(match_finder(entity.text, [matches['dutch_rank'].iloc[x]], 80))
            numships = len(match_finder(entity.text, [matches['shipOutward'].iloc[x], matches['shipReturn'].iloc[x]], 80))
            holder[vocid] = [name_ratio, name_count, day_dif, location, rank, numships]

    return holder

def NEL(entity, knowledgebase, model_dir):
    ''' Takes in a string containing an entity and returns either a match within the specified pandas 
        knowledgebase or None if no match is present.
    ''' 
    # Find and narrow down matches
    possible_matches = find_matches(entity.naam, knowledgebase, 'fullNameOriginal')
    name_dict = knowledgebase.fullNameNormalized.value_counts()
    converted_matches = convert_matches(entity, possible_matches, name_dict)
    if converted_matches == {}:
        return None
    ranked_matches = ranking(converted_matches, model_dir)
    return max(ranked_matches)

## Final Function

In [311]:
def Pipeline(row, knowledgebase, NER_model, NEL_model):
    holder = []
    entities = NER(row.text, NER_model)
    for x in entities:
        row['naam'] = x
        holder.append(NEL(row, knowledgebase, NEL_model))
    return holder


In [312]:
start_time = timeit.default_timer()
x = Pipeline(notary.iloc[85], voc, nlp2, 'ranking_crossvalidation/kfold5/export/1591180581')
elapsed = timeit.default_timer() - start_time
print('Seconds for single query: ' + str(elapsed))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


_______________________________________
Pieter Klijn
{764045: [96, 46, 18, 0, 0, 0], 784359: [100, 46, 7, 0, 0, 0], 802002: [100, 46, 55, 3, 0, 2], 814032: [100, 46, 38, 3, 0, 0]}
INFO:tensorflow:Restoring parameters from ranking_crossvalidation/kfold5/export/1591180581\variables\variables
_______________________________________
Jan Backer
{1150409: [90, 91, 88, 0, 0, 0], 1154910: [90, 91, 47, 3, 0, 0]}
INFO:tensorflow:Restoring parameters from ranking_crossvalidation/kfold5/export/1591180581\variables\variables
_______________________________________
Jan Backer
{1150409: [90, 91, 88, 0, 0, 0], 1154910: [90, 91, 47, 3, 0, 0]}
INFO:tensorflow:Restoring parameters from ranking_crossvalidation/kfold5/export/1591180581\variables\variables
Seconds for single query: 22.93101619999925


In [313]:
x

[(array([2.1179523], dtype=float32), 802002),
 (array([-17.653296], dtype=float32), 1150409),
 (array([-17.653296], dtype=float32), 1150409)]

In [298]:
voc.iloc[284049]['VOCOP_id']

802002

In [243]:
notary[notary.uuid == 'ee21c8bc-bfea-25ee-c6c1-ddf74644012b']

Unnamed: 0,uuid,rubriek,notaris,inventarisNr,akteNr,akteType,datering,taal,beschrijving,namen,urls,text
85,ee21c8bc-bfea-25ee-c6c1-ddf74644012b,358,JAN VERLEIJ,11890,12427,Obligatie,1744-09-12,nederlands,\nschip Diemen\n,"[{'voornaam': 'Pieter', 'tussenvoegsel': None,...","['KLAB05439000262.JPG', 'KLAB05439000263.JPG',...",Ragd werlke Getijnke Gepasseert Den 12 feptemb...


In [103]:
fuzz.ratio('slot van kapelle', 'slot capelle')

79