In [1]:
import pandas as pd
import numpy as np
from tqdm import notebook
import ast
import re
from fuzzywuzzy import fuzz
import os
import tensorflow as tf
import tensorflow_ranking as tfr
from tensorflow.compat.v1 import Session
from tensorflow.python.saved_model import loader
from sklearn.model_selection import KFold
tf.get_logger().setLevel('ERROR')

In [None]:
notebook.tqdm.pandas()
clean = pd.read_csv('../clean_data.csv')
voc = pd.read_csv('vocop-clustered-new.csv', sep='	')

In [None]:
uuid = []
name = []
for y, z in notebook.tqdm(clean.iterrows()):
    for x in ast.literal_eval(z.namen):
        if x['tussenvoegsel'] != None:
            name.append(x['voornaam'] + " " + x['tussenvoegsel'] + " " + x['achternaam'])
            uuid.append(z.uuid)
        elif x['voornaam'] and x['achternaam'] != None:
            name.append(x['voornaam'] + " " + x['achternaam'])
            uuid.append(z.uuid)
name_list = pd.DataFrame(data={'uuid':uuid, 'name':name}, columns=['uuid', 'name'])

In [None]:
name_df = clean.merge(name_list)

In [None]:
def fuzzy_search(name, distance):
    names = np.where((voc.fullNameNormalized.apply(fuzz.ratio, args=[name]) >= 90) | 
                     (voc.fullNameOriginal.dropna().apply(fuzz.ratio, args=[name]) >= 90))
    final = (name, names)
    return final

def find_matches(names, distance):
    name_list = {}
    final = []
    for x in notebook.tqdm(names):
        if x in name_list:
            final.append((x, name_list[x]))
        else:
            result = fuzzy_search(x, distance)
            name_list[x] = result
            final.append((x, result))
    return final

## Ranking

### Preparing the data

In [2]:
df = pd.read_csv('preranking.csv')
manar = []
menar = []
dinar = []
manac = []
menac = []
dinac = []
maday = []
meday = []
diday = []
maloc = []
meloc = []
diloc = []
maran = []
meran = []
diran = []
manum = []
menum = []
dinum = []
for not_id in df.notary_id.unique():
    subset = df[df.notary_id == not_id]
    for x in subset.itertuples():
        max_name_ratio = subset.name_ratio.max()
        mean_name_ratio = subset.name_ratio.mean()
        dif_name_ratio = max_name_ratio - mean_name_ratio
        manar.append(max_name_ratio)
        menar.append(mean_name_ratio)
        dinar.append(dif_name_ratio)
        
        max_name_count = subset.name_count.max()
        mean_name_count = subset.name_count.mean()
        dif_name_count = max_name_count - mean_name_count
        manac.append(max_name_count)
        menac.append(mean_name_count)
        dinac.append(dif_name_count)
        
        max_day_dif = subset.day_dif.max()
        mean_day_dif = subset.day_dif.mean()
        dif_day_dif = max_day_dif - mean_day_dif
        maday.append(max_day_dif)
        meday.append(mean_day_dif)
        diday.append(dif_day_dif)
        
        max_location = subset.location.max()
        mean_location = subset.location.mean()
        dif_location = max_location - mean_location
        maloc.append(max_location)
        meloc.append(mean_location)
        diloc.append(dif_location)
        
        max_rank = subset['rank'].max()
        mean_rank = subset['rank'].mean()
        dif_rank = max_rank - mean_rank
        maran.append(max_rank)
        meran.append(mean_rank)
        diran.append(dif_rank)
        
        max_numships = subset.numships.max()
        mean_numships = subset.numships.mean()
        dif_numships = max_numships - mean_numships
        manum.append(max_numships)
        menum.append(mean_numships)
        dinum.append(dif_numships)
        
df['max_name_ratio'] = manar
df['mean_name_ratio'] = menar
df['dif_name_ratio'] = dinar
df['max_name_count'] = manac
df['mean_name_count'] = menac
df['dif_name_count'] = dinac
df['max_day_dif'] = maday
df['mean_day_dif'] = meday
df['dif_day_dif'] = diday
df['max_location'] = maloc
df['mean_location'] = meloc
df['dif_location'] = diloc
df['max_rank'] = maran
df['mean_rank'] = meran
df['dif_rank'] = diran
df['max_numships'] = manum
df['mean_numships'] = menum
df['dif_numships'] = dinum


#     df2 = pd.DataFrame([[name_ratio, name_count, day_dif, location, rank, numships, match, not_id, 'NIL']],
#                         columns= ['name_ratio', 'name_count', 'day_dif', 'location', 'rank', 'numships',
#                                   'match', 'notary_id', 'voc_id'])
#     df = df.append(df2, ignore_index=True)

indexes = [x for x in np.random.choice(df.notary_id.unique(), int(len(df.notary_id.unique()) *0.7), replace=False)]
train = df[df.notary_id.isin(indexes)]
testval = df[df.notary_id.isin(indexes) == False]

test_indexes = [x for x in np.random.choice(testval.notary_id.unique(), len(testval.notary_id.unique()) // 2, replace=False)]
test = testval[testval.notary_id.isin(test_indexes)]
val = testval[testval.notary_id.isin(test_indexes) == False]


In [8]:
df[50:100]

Unnamed: 0,name_ratio,name_count,day_dif,location,rank,numships,match,notary_id,voc_id,max_name_ratio,...,dif_day_dif,max_location,mean_location,dif_location,max_rank,mean_rank,dif_rank,max_numships,mean_numships,dif_numships
50,100,1383,79,1,0,0,0,506,423058,100,...,21.25,1,0.25,0.75,1,0.083333,0.916667,0,0.0,0.0
51,96,46,18,0,0,0,0,509,271982,100,...,25.5,1,0.5,0.5,0,0.0,0.0,2,0.5,1.5
52,100,46,7,0,0,0,0,509,277221,100,...,25.5,1,0.5,0.5,0,0.0,0.0,2,0.5,1.5
53,100,46,55,1,0,2,1,509,284049,100,...,25.5,1,0.5,0.5,0,0.0,0.0,2,0.5,1.5
54,100,46,38,1,0,0,0,509,288576,100,...,25.5,1,0.5,0.5,0,0.0,0.0,2,0.5,1.5
55,95,95,55,1,0,0,0,510,280876,100,...,16.5,1,0.5,0.5,0,0.0,0.0,0,0.0,0.0
56,100,95,88,0,0,0,0,510,443833,100,...,16.5,1,0.5,0.5,0,0.0,0.0,0,0.0,0.0
57,90,91,88,0,0,0,0,511,431209,90,...,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
58,92,7,56,1,0,0,0,512,298637,92,...,0.0,1,1.0,0.0,0,0.0,0.0,0,0.0,0.0
59,100,6,31,0,0,0,0,513,428227,100,...,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0


In [3]:
train = pd.read_json('trainltr.json')

In [4]:
test = pd.read_json('testltr.json')

In [5]:
val = pd.read_json('valltr.json')

In [6]:
subcolumns = ['name_ratio', 'name_count', 'day_dif', 'location', 'rank', 'numships',] 
#               'max_name_ratio', 'mean_name_ratio', 'dif_name_ratio',
#               'max_name_count', 'mean_name_count', 'mean_name_count', 
#               'max_day_dif', 'mean_day_dif', 'dif_day_dif', 
#               'max_location', 'mean_location', 'dif_location', 
#               'max_rank', 'mean_rank', 'dif_rank', 
#               'max_numships', 'mean_numships', 'dif_numships']
# subcolumns = ['name_ratio', 'day_dif', 'location', 'rank', 'numships', 'name_count', 
#               'max_name_ratio', 'mean_name_ratio', 'dif_name_ratio',
#               'max_name_count', 'mean_name_count', 'mean_name_count', 
#               'max_day_dif', 'mean_day_dif', 'dif_day_dif', 
#               'max_location', 'mean_location', 'dif_location', 
#               'max_rank', 'mean_rank', 'dif_rank', 
#               'max_numships', 'mean_numships', 'dif_numships']

In [None]:
file = open('train.txt', 'w')
for x in train.itertuples():
    line = str(x.match) + ' qid:' + str(x.notary_id)
    for y in enumerate(subcolumns):
        if getattr(x, y[1]) != 0:
            line = line + ' ' + str(y[0] + 1) + ':' + str(getattr(x, y[1]))
    if x.Index + 1 == df.shape[0]:
        file.writelines(line)
    else:
        file.writelines(line + '\n')
file.close()

In [None]:
file = open('test.txt', 'w')
for x in test.itertuples():
    line = str(x.match) + ' qid:' + str(x.notary_id)
    for y in enumerate(subcolumns):
        if getattr(x, y[1]) != 0:
            line = line + ' ' + str(y[0] + 1) + ':' + str(getattr(x, y[1]))
    if x.Index + 1 == df.shape[0]:
        file.writelines(line)
    else:
        file.writelines(line + '\n')
file.close()

In [None]:
file = open('vali.txt', 'w')
for x in val.itertuples():
    line = str(x.match) + ' qid:' + str(x.notary_id)
    for y in enumerate(subcolumns):
        if getattr(x, y[1]) != 0:
            line = line + ' ' + str(y[0] + 1) + ':' + str(getattr(x, y[1]))
    if x.Index + 1 == df.shape[0]:
        file.writelines(line)
    else:
        file.writelines(line + '\n')
file.close()

In [None]:
# !saved_model_cli show \
#     --dir six_features/export/1590418714 \
#     --tag_set serve \
#     --signature_def predict 

In [7]:
def serialize_example(values, subcolumn):
    """
    Creates a tf.Example message ready to be written to a file.
    """
    def _float_feature(value):
        """Returns an float_list from a int/float."""
        return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
    
    # Create a dictionary mapping the feature name to the tf.Example-compatible
    # data type.
    feature = {}
    for x in enumerate(subcolumn):
        feature[str(x[0] + 1)] = _float_feature(values[x[1]])

    # Create a Features message using tf.train.Example.

    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()

def predict(rows, id_list, directory):
    tags=["serve"]
    signature_def_key = "predict"
    saved_model_dir = directory
    holder = {}
    with Session() as sess:
        loader.load(sess, tags, saved_model_dir)
        for x in id_list:
            values = rows[x]
            serialized_examples = []
            for i in range(len(values)):
                serialized_example = serialize_example(values.iloc[i], subcolumns)
                serialized_examples.append(serialized_example)
            inputs_feed_dict = {'input_example_tensor:0': serialized_examples}
            outputs = sess.run('groupwise_dnn_v2/accumulate_scores/div_no_nan:0', feed_dict=inputs_feed_dict)
            output = [(outputs[y][0], values.iloc[y].voc_id) for y in range(len(outputs))]
            holder[x] = output
    return holder

def test_ranker(df, directory, threshold):
    tp = 0
    fp = 0
    fn1 = 0
    fn2 = 0
    tn = 0
    id_list = df.notary_id.unique()
    holder = {}
    for x in id_list:
        rows = df[df.notary_id == x]
        nil = df
        holder[x] = rows
    
    ranking = predict(holder, id_list, directory)
    
    for x in id_list:
        try:
            predicted_match = max(ranking[x])
        except:
            for y in ranking[x]:
                if y[1] == -1:
                    predicted_match=y

        if predicted_match[0] <= threshold:
            if df[df.notary_id == x].match.mean() > 0:
                fn1 += 1
            else:
                tn += 1
        else:
            target = df[(df.voc_id == predicted_match[1]) & (df.notary_id == x)]
            if target.match.iloc[0] == 1:
                tp += 1
            else:
                fp += 1
                if df[df.notary_id == x].match.mean() > 0:
                    fn2 += 1
                    
    recall = tp / (tp + (fn1 + fn2))
    if tp == 0:
        precision = 0
    else:
        precision = tp / (tp + fp)
    print('Recall: ' + str(recall))
    print('Precision: ' + str(precision))
    if precision == 0:
        print('F1: 0')
    else:
        print('F1: ' + str(2*((precision*recall) / (precision + recall))))
    return {'true_positives':tp, 
            'false_positives':fp, 
            'false_negatives_threshold':fn1, 
            'false_negatives_ranker':fn2, 
            'true_negatives':tn, 
            'recall':recall, 
            'precision':precision}
    

In [None]:
for x in range(-5, 6, 1):
    print('Threshold: ' +  str(x))
    print(test_ranker(test, 'LTR_models/listwise_combined3/export/1591020106', x))
    print('_________________________________________________________________________')

## Results

#### Pointwise LTR_models/pointwise/export/1590928427
Threshold: -2  
Recall: 0.8333333333333334  
Precision: 0.8333333333333334  
F1: 0.8333333333333334

#### Pointwise lower learning LTR_models/pointwise_lower_learning/export/1590936596
Threshold: -1  
Recall: 0.7777777777777778  
Precision: 0.7777777777777778  
F1: 0.7777777777777778

#### Pointwise higher dropout LTR_models/pointwise_higher_dropout/export/1590938159
Threshold: -2  
Recall: 0.7222222222222222  
Precision: 0.6842105263157895  
F1: 0.7027027027027027

#### Pointwise lower dropout LTR_models/pointwise_lower_dropout/export/1591009109
Threshold: 0  
Recall: 0.7692307692307693  
Precision: 1.0  
F1: 0.8695652173913044

#### Pointwise higher learning LTR_models/pointwise_higher_learning/export/1591010427
Threshold: -5  
Recall: 0.9230769230769231  
Precision: 0.75  
F1: 0.8275862068965517

#### Pointwise combined1  LTR_models/pointwise_combined1/export/1591011492
Threshold: -1  
Recall: 0.7692307692307693  
Precision: 0.9090909090909091  
F1: 0.8333333333333333

#### Pairwise LTR_models/pairwise/export/1590929599
Threshold: -1  
Recall: 0.6111111111111112  
Precision: 0.7857142857142857  
F1: 0.6875000000000001

#### Listwise LTR_models/listwise/export/1590930860
Threshold: 0  
Recall: 0.5555555555555556  
Precision: 0.8333333333333334  
F1: 0.6666666666666667  

#### Listwise lower learning LTR_models/listwise_lower_learning/export/1590932665
Threshold: 0  
Recall: 0.6111111111111112  
Precision: 0.9166666666666666  
F1: 0.7333333333333334

#### Listwise higher dropout  LTR_models/listwise_higher_dropout/export/1590934369
Threshold: 0  
Recall: 0.6111111111111112  
Precision: 0.8461538461538461  
F1: 0.7096774193548387  

#### Listwise combined LTR_models/listwise_combined/export/1590935654
Threshold: -1  
Recall: 0.7777777777777778  
Precision: 0.8235294117647058  
F1: 0.7999999999999999

#### Listwise lower dropout LTR_models/listwise_lower_dropout/export/1591013445
Threshold: 0  
Recall: 0.7222222222222222  
Precision: 0.8666666666666667  
F1: 0.7878787878787877

#### Listwise higher learning LTR_models/listwise_higher_learning/export/1591015344
Threshold: 0  
Recall: 0.6666666666666666  
Precision: 0.9230769230769231  
F1: 0.7741935483870968

#### Listwise combined2 LTR_models/listwise_combined2/export/1591016667
Threshold: -1  
Recall: 0.7222222222222222  
Precision: 0.8125  
F1: 0.7647058823529411

#### Listwise combined3 LTR_models/listwise_combined3/export/1591020106
Threshold: 0  
Recall: 0.6666666666666666  
Precision: 1.0  
F1: 0.8

## Cross Validation

In [8]:
kf = KFold(n_splits = 10, shuffle=True)
c = 0
for train_index, test_index in kf.split(df):
    c += 1
#     train = df[df.notary_id.isin(df.notary_id.unique()[train_index])]
#     test = df[df.notary_id.isin(df.notary_id.unique()[test_index])]
    train = df.loc[train_index]
    test = df.loc[test_index]
    
    file = open('ranking_crossvalidation/train_files/train' + str(c) + '.txt', 'w')
    for x in train.itertuples():
        line = str(x.match) + ' qid:' + str(x.notary_id)
        for y in enumerate(subcolumns):
            if getattr(x, y[1]) != 0:
                line = line + ' ' + str(y[0] + 1) + ':' + str(getattr(x, y[1]))
        if x.Index + 1 == df.shape[0]:
            file.writelines(line)
        else:
            file.writelines(line + '\n')
    file.close()
    
    file = open('ranking_crossvalidation/train_files/test' + str(c) + '.txt', 'w')
    for x in test.itertuples():
        line = str(x.match) + ' qid:' + str(x.notary_id)
        for y in enumerate(subcolumns):
            if getattr(x, y[1]) != 0:
                line = line + ' ' + str(y[0] + 1) + ':' + str(getattr(x, y[1]))
        if x.Index + 1 == df.shape[0]:
            file.writelines(line)
        else:
            file.writelines(line + '\n')
    file.close()
    
    file = open('ranking_crossvalidation/train_files/vali' + str(c) + '.txt', 'w')
    for x in test.itertuples():
        line = str(x.match) + ' qid:' + str(x.notary_id)
        for y in enumerate(subcolumns):
            if getattr(x, y[1]) != 0:
                line = line + ' ' + str(y[0] + 1) + ':' + str(getattr(x, y[1]))
        if x.Index + 1 == df.shape[0]:
            file.writelines(line)
        else:
            file.writelines(line + '\n')
    file.close()
    
    test.to_json('ranking_crossvalidation/train_files/test' + str(c) + '.json')

In [48]:
df[df.match == 1]['day_dif'].mean()

31.405940594059405

In [49]:
print(len(test[test.match == 1]))

23.75

In [10]:
test = pd.read_json('ranking_crossvalidation/train_files/test1.json')
print(len(test[test.match == 1]))
print(test_ranker(test, 'ranking_crossvalidation/kfold1/export/1591177438', 0))

10
Recall: 0.6
Precision: 0.6666666666666666
F1: 0.631578947368421
{'true_positives': 6, 'false_positives': 3, 'false_negatives_threshold': 4, 'false_negatives_ranker': 0, 'true_negatives': 126, 'recall': 0.6, 'precision': 0.6666666666666666}


In [11]:
test = pd.read_json('ranking_crossvalidation/train_files/test2.json')
print(len(test[test.match == 1]))
print(test_ranker(test, 'ranking_crossvalidation/kfold2/export/1591178227', 0))

11
Recall: 0.18181818181818182
Precision: 0.6666666666666666
F1: 0.28571428571428575
{'true_positives': 2, 'false_positives': 1, 'false_negatives_threshold': 9, 'false_negatives_ranker': 0, 'true_negatives': 129, 'recall': 0.18181818181818182, 'precision': 0.6666666666666666}


In [12]:
test = pd.read_json('ranking_crossvalidation/train_files/test3.json')
print(len(test[test.match == 1]))
print(test_ranker(test, 'ranking_crossvalidation/kfold3/export/1591179017', 0))

8
Recall: 0.75
Precision: 0.8571428571428571
F1: 0.7999999999999999
{'true_positives': 6, 'false_positives': 1, 'false_negatives_threshold': 2, 'false_negatives_ranker': 0, 'true_negatives': 130, 'recall': 0.75, 'precision': 0.8571428571428571}


In [14]:
test = pd.read_json('ranking_crossvalidation/train_files/test4.json')
print(len(test[test.match == 1]))
print(test_ranker(test, 'ranking_crossvalidation/kfold4/export/1591179797', 0))

4
Recall: 0.75
Precision: 0.6
F1: 0.6666666666666665
{'true_positives': 3, 'false_positives': 2, 'false_negatives_threshold': 1, 'false_negatives_ranker': 0, 'true_negatives': 134, 'recall': 0.75, 'precision': 0.6}


In [15]:
test = pd.read_json('ranking_crossvalidation/train_files/test5.json')
print(len(test[test.match == 1]))
print(test_ranker(test, 'ranking_crossvalidation/kfold5/export/1591180581', 0))

11
Recall: 0.7272727272727273
Precision: 0.8888888888888888
F1: 0.7999999999999999
{'true_positives': 8, 'false_positives': 1, 'false_negatives_threshold': 3, 'false_negatives_ranker': 0, 'true_negatives': 130, 'recall': 0.7272727272727273, 'precision': 0.8888888888888888}


In [16]:
test = pd.read_json('ranking_crossvalidation/train_files/test6.json')
print(len(test[test.match == 1]))
print(test_ranker(test, 'ranking_crossvalidation/kfold6/export/1591181359', 0))

10
Recall: 0.5
Precision: 0.8333333333333334
F1: 0.625
{'true_positives': 5, 'false_positives': 1, 'false_negatives_threshold': 5, 'false_negatives_ranker': 0, 'true_negatives': 129, 'recall': 0.5, 'precision': 0.8333333333333334}


In [24]:
test = pd.read_json('ranking_crossvalidation/train_files/test7.json')
print(len(test[test.match == 1]))
print(test_ranker(test, 'ranking_crossvalidation/kfold7/export/1591098103', 0))

9
Recall: 0.8888888888888888
Precision: 0.7272727272727273
F1: 0.7999999999999999
{'true_positives': 8, 'false_positives': 3, 'false_negatives_threshold': 1, 'false_negatives_ranker': 0, 'true_negatives': 65, 'recall': 0.8888888888888888, 'precision': 0.7272727272727273}


In [23]:
test = pd.read_json('ranking_crossvalidation/train_files/test8.json')
print(len(test[test.match == 1]))
print(test_ranker(test, 'ranking_crossvalidation/kfold8/export/1591098103', 0))

2


OSError: SavedModel file does not exist at: ranking_crossvalidation/kfold8/export/1591098103/{saved_model.pbtxt|saved_model.pb}

In [16]:
test = pd.read_json('ranking_crossvalidation/train_files/test9.json')
print(len(test[test.match == 1]))
print(test_ranker(test, 'ranking_crossvalidation/kfold9/export/1591035541', 0))

Recall: 0.6363636363636364
Precision: 1.0
F1: 0.7777777777777778
[7, 0, 4, 0, 73, 0.6363636363636364, 1.0]


In [17]:
test = pd.read_json('ranking_crossvalidation/train_files/test10.json')
print(len(test[test.match == 1]))
print(test_ranker(test, 'ranking_crossvalidation/kfold10/export/1591036489', 0))

Recall: 0.5384615384615384
Precision: 0.7777777777777778
F1: 0.6363636363636364
[7, 2, 6, 0, 69, 0.5384615384615384, 0.7777777777777778]
