# 4.- Model evaluation

## Notebook setup

In [1]:
import os
import pandas as pd
import pickle

from tqdm.auto import tqdm

In [2]:
from rdflib import Namespace, Graph, URIRef, Literal, BNode
from rdflib.namespace import FOAF, RDF, RDFS

geo = Namespace("http://www.opengis.net/ont/geosparql#")
uo = Namespace("https://purl.org/uniovi/wd-edit-history#")
wd = Namespace("http://www.wikidata.org/entity/")
wdt = Namespace("http://www.wikidata.org/prop/direct/")
wdno = Namespace("http://www.wikidata.org/prop/novalue/")

In [3]:
REFINEMENT_SYSTEMS_OUTPUT_DIR = os.path.join('output', '3_edit_history_systems')
TRAINED_MODELS_DIR = os.path.join('output', '3_edit_history_systems', 'models')
OUTPUT_DIR = os.path.join('output', '4_model_evaluation')

RANDOM_SEED = 42

WIKIDATA_BASE = "https://www.wikidata.org/w/api.php"

In [4]:
import logging

log_format = "%(levelname)s %(asctime)s - %(message)s"
logging.basicConfig(filename = "evaluation_logs.log",
                    filemode = "w",
                    format = log_format, 
                    level = logging.INFO)
logger = logging.getLogger()

In [5]:
def load_df(filename):
    with open(os.path.join(OUTPUT_DIR, f"{filename}.pkl"), 'rb') as f:
        return pickle.load(f)

def save_df(df, filename):
    with open(os.path.join(OUTPUT_DIR, f"{filename}.pkl"), 'wb') as f:
        pickle.dump(df, f)

In [6]:
train_graph_static = Graph().parse(os.path.join(REFINEMENT_SYSTEMS_OUTPUT_DIR, 'complete_train_graph_static.ttl'), format='ttl')
test_graph_static = Graph().parse(os.path.join(REFINEMENT_SYSTEMS_OUTPUT_DIR, 'test_graph_sample_static.ttl'), format='ttl')

In [7]:
import pdb
import numpy as np
import torch

from pykeen.triples import TriplesFactory

known_entities = set()
known_relations = set()

def entity_hashed_id(entity_uri):
    return hash(entity_uri.split('/')[-1])

def build_triples_factory_train(graph):
    triples = []
    for t in graph:
        if not t[2].startswith(str(wdt)) and not t[2].startswith(str(wd)):
            continue
            
        triple_str = [str(t[0]), str(t[1]), str(t[2])]
        if triple_str[0] not in known_entities:
            known_entities.add(triple_str[0])
        
        if triple_str[1] not in known_relations:
            known_relations.add(triple_str[1])
        
        if triple_str[2] not in known_entities:
            known_entities.add(triple_str[2])

        triples.append(triple_str)
    return TriplesFactory.from_labeled_triples(np.array(triples))

def build_triples_factory_test(graph, train_tf):
    triples = []
    for t in graph:
        if not t[2].startswith(str(wdt)) and not t[2].startswith(str(wd)):
            continue
        
        triple_str = [str(t[0]), str(t[1]), str(t[2])]
        if triple_str[0] not in known_entities or triple_str[1] not in known_relations or triple_str[2] not in known_entities:
            continue
        triples.append(triple_str)

    return TriplesFactory.from_labeled_triples(np.array(triples), entity_to_id=train_tf.entity_to_id, relation_to_id=train_tf.relation_to_id)

tf_train = build_triples_factory_train(train_graph_static)
tf_test = build_triples_factory_test(test_graph_static, tf_train)

## Defining the evaluation metrics

In [8]:
def hits_at_n(ranks, n):
    return sum([1 if r <= n else 0 for r in ranks]) / len(ranks)

def mr_score(ranks):
    mr_score = 0
    for r in ranks:
        mr_score += r
    return mr_score / len(ranks)

def mrr_score(ranks):
    mr_score = 0
    for r in ranks:
        mr_score += 1 / r
    return mr_score / len(ranks)

## Testing functions

In [9]:
import pdb

def get_testing_entities(test_graph_static, prop = None):
    # get all entities that have a new P31 value in the test data
    entities_test = set()
    for t in test_graph_static.triples((None, prop, None)):
        entities_test.add((str(t[0]), str(t[1]), str(t[2])))
    return entities_test

In [16]:
def test_supervised(model, entities_test, entity_2_embeddings, all_classes):
    ranks = []
    misses = 0
    _all = 0
    
    # generic model useful for any property, embeddings = subj + prop + obj
    embedding_size = len(list(entity_2_embeddings.values())[0])
    X = np.zeros((len(all_classes), embedding_size * 3))
    for i, kg_class in enumerate(all_classes):
        X[i, 2*embedding_size:] = entity_2_embeddings[kg_class]
        
    
    for entity, prop, true_class in tqdm(entities_test):
        _all += 1

        X[:, :embedding_size] = entity_2_embeddings[entity]
        X[:, embedding_size:2*embedding_size] = entity_2_embeddings[prop]

        pred = model.predict_proba(X)
        entity_results = [(kg_class, pred[idx][1]) for idx, kg_class in enumerate(all_classes)]
        entity_results.sort(key=lambda item: item[1], reverse=True)
        sorted_predictions = [e[0] for e in entity_results]
        if true_class not in sorted_predictions:
            misses += 1
            continue
        idx = sorted_predictions.index(true_class)
        ranks.append(idx + 1)
    return ranks, misses, _all

In [11]:
from pykeen.models import predict

def test_unsupervised(model, entities_test, triples_factory, train_graph_static):
    ranks = []
    misses = 0
    _all = 0
    for entity, prop, true_class in tqdm(entities_test):
        _all += 1
        if entity not in triples_factory.entity_to_id or prop not in triples_factory.relation_to_id:
            misses += 1
            continue
        
        pred_df = predict.get_tail_prediction_df(model, entity, str(prop), triples_factory=triples_factory, add_novelties=False)
        pred_rank = 1
        for row in pred_df.itertuples():
            if row[2] == true_class:
                ranks.append(pred_rank)
                break
            else:
                # manual filtering of known triples
                if (URIRef(entity), URIRef(prop), URIRef(row[2])) not in train_graph_static:
                    pred_rank += 1
    return ranks, misses, _all

In [12]:
def test_sdtype(sdtype_result_graph, entities_test, train_graph_static):
    ranks = []
    misses = 0
    _all = 0
    for entity, prop, true_class in tqdm(entities_test):
        _all += 1
        if _all == 100:
            break
        
        preds = list(sdtype_result_graph.predicate_objects(subject=URIRef(entity)))
        print(len(preds))
        if len(preds) == 0:
            misses += 1
            continue
        
        preds.sort(key=lambda x: float(x[1]), reverse=True)
        pred_rank = 1
        for obj, _ in preds:
            if str(obj) == true_class:
                ranks.append(pred_rank)
                break
            else:
                # manual filtering of known triples
                if (URIRef(entity), URIRef(prop), obj) not in train_graph_static:
                    pred_rank += 1
    return ranks, misses, _all

## Evaluating the models

In [13]:
props_to_evaluate = [wdt.P31,
                     wdt.P106, wdt.P27, # human: occupation; country of citizenship
                     wdt.P36, wdt.P30, wdt.P37, wdt.P131, # 'places': capital; continent; legislative body; official language; located in
                     wdt.P57, wdt.P136 # literary work: director; genre; author
                    ]

In [37]:
for prop in props_to_evaluate:
    count = len(list(test_graph_static.triples((None, prop, None))))
    print(f"{str(prop)} - count={count}")

http://www.wikidata.org/prop/direct/P31 - count=4186


### Unsupervised models

In [None]:

data = {
    'model': [],
    'sampler': [],
    'prop': [],
    'MR': [],
    'MRR': [],
    'hits@1': [],
    'hits@5': [],
    'hits@10': []
}

unsupervised_model_directories = [f for f in os.listdir(TRAINED_MODELS_DIR) if f != 'supervised' and os.path.isdir(os.path.join(TRAINED_MODELS_DIR, f))]

for prop in props_to_evaluate:
    logger.info(str(prop))
    test_entities = get_testing_entities(test_graph_static, prop)
    for model_dir in unsupervised_model_directories:
        logger.info(model_dir)
        samplers_directories = [f for f in os.listdir(os.path.join(TRAINED_MODELS_DIR, model_dir)) if os.path.isdir(os.path.join(TRAINED_MODELS_DIR, model_dir, f))]
        for sampler in samplers_directories:
            logger.info(sampler)
            path = os.path.join(TRAINED_MODELS_DIR, model_dir, sampler)
            with open(os.path.join(path, 'model.pkl'), 'rb') as f:
                model = torch.load(f, map_location=torch.device('cpu'))
            ranks, misses, _all = test_unsupervised(model, test_entities, tf_train, train_graph_static)
            data['model'].append(model_dir)
            data['sampler'].append(sampler)
            data['prop'].append(str(prop))
            data['MR'].append(mr_score(ranks))
            data['MRR'].append(mrr_score(ranks))
            data['hits@1'].append(hits_at_n(ranks, 1))
            data['hits@5'].append(hits_at_n(ranks, 5))
            data['hits@10'].append(hits_at_n(ranks, 10))
    logger.info("\n\n")

evaluation_unsupervised_df = pd.DataFrame(data)
save_df(evaluation_unsupervised_df, 'evaluation_unsupervised')

  0%|          | 0/32 [00:00<?, ?it/s]



  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/54 [00:00<?, ?it/s]

  0%|          | 0/54 [00:00<?, ?it/s]

  0%|          | 0/54 [00:00<?, ?it/s]

  0%|          | 0/54 [00:00<?, ?it/s]

  0%|          | 0/54 [00:00<?, ?it/s]

  0%|          | 0/54 [00:00<?, ?it/s]

  0%|          | 0/54 [00:00<?, ?it/s]

  0%|          | 0/54 [00:00<?, ?it/s]

  0%|          | 0/54 [00:00<?, ?it/s]

  0%|          | 0/54 [00:00<?, ?it/s]

  0%|          | 0/54 [00:00<?, ?it/s]

  0%|          | 0/54 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

  0%|          | 0/820 [00:00<?, ?it/s]

  0%|          | 0/820 [00:00<?, ?it/s]

  0%|          | 0/820 [00:00<?, ?it/s]

In [72]:
evaluation_unsupervised_df = load_df('evaluation_unsupervised')
evaluation_unsupervised_df.head(n=12)

Unnamed: 0,model,sampler,prop,MR,MRR,hits@1,hits@5,hits@10
0,RotatE,EditHistoryNegativeSampler,http://www.wikidata.org/prop/direct/P106,17554.5,0.116244,0.071429,0.142857,0.142857
1,RotatE,InverseEditHistoryNegativeSampler,http://www.wikidata.org/prop/direct/P106,23370.035714,0.127579,0.071429,0.178571,0.178571
2,RotatE,EditHistoryNegativeSampler (no edit wars),http://www.wikidata.org/prop/direct/P106,25222.464286,0.122226,0.107143,0.142857,0.142857
3,RotatE,BasicNegativeSampler,http://www.wikidata.org/prop/direct/P106,24825.178571,0.118798,0.071429,0.142857,0.178571
4,TransE,EditHistoryNegativeSampler,http://www.wikidata.org/prop/direct/P106,7109.714286,0.097682,0.071429,0.107143,0.214286
5,TransE,InverseEditHistoryNegativeSampler,http://www.wikidata.org/prop/direct/P106,12258.5,0.085655,0.0,0.178571,0.25
6,TransE,EditHistoryNegativeSampler (no edit wars),http://www.wikidata.org/prop/direct/P106,10751.964286,0.128833,0.071429,0.178571,0.178571
7,TransE,BasicNegativeSampler,http://www.wikidata.org/prop/direct/P106,22702.321429,0.021345,0.0,0.035714,0.035714
8,MuRE,EditHistoryNegativeSampler,http://www.wikidata.org/prop/direct/P106,14729.821429,0.054689,0.0,0.071429,0.107143
9,MuRE,InverseEditHistoryNegativeSampler,http://www.wikidata.org/prop/direct/P106,10875.642857,0.132228,0.107143,0.107143,0.25


In [73]:
evaluation_unsupervised_df.to_csv(os.path.join(OUTPUT_DIR, 'evaluation_unsupervised_2.csv'), sep=',', index=False)

### Supervised model

In [14]:
with open(os.path.join(TRAINED_MODELS_DIR, 'supervised', 'model.pkl'), 'rb') as f:
    supervised_model = pickle.load(f)

with open(os.path.join(TRAINED_MODELS_DIR, 'supervised', 'embeddings.pkl'), 'rb') as f: 
    entity_2_embeddings = pickle.load(f)

In [None]:
data = {
    'prop': [],
    'MR': [],
    'MRR': [],
    'hits@1': [],
    'hits@5': [],
    'hits@10': []
}

possible_entities = list(set([str(s) for s in train_graph_static.subjects(None, None) if isinstance(s, URIRef)] + \
                             [str(s) for s in train_graph_static.objects(None, None) if isinstance(s, URIRef)]))

for prop in props_to_evaluate:
    logger.info(str(prop))
    entities_test = get_testing_entities(test_graph_static, prop)
    ranks, misses, _all = test_supervised(supervised_model, entities_test, entity_2_embeddings, possible_entities)
    if len(ranks) > 0:
        data['prop'].append(str(prop))
        data['MR'].append(mr_score(ranks))
        data['MRR'].append(mrr_score(ranks))
        data['hits@1'].append(hits_at_n(ranks, 1))
        data['hits@5'].append(hits_at_n(ranks, 5))
        data['hits@10'].append(hits_at_n(ranks, 10))
    
evaluation_supervised_df = pd.DataFrame(data)
save_df(evaluation_supervised_df, 'evaluation_supervised')

  0%|          | 0/4186 [00:00<?, ?it/s]

In [31]:
evaluation_supervised_df = load_df('evaluation_supervised')
evaluation_supervised_df.head()

Unnamed: 0,prop,MR,MRR,hits@1,hits@5,hits@10
0,http://www.wikidata.org/prop/direct/P31,13923.703226,0.111846,0.045161,0.165591,0.247312


In [19]:
evaluation_supervised_df.to_csv(os.path.join(OUTPUT_DIR, 'evaluation_supervised.csv'), sep=',', index=False)

### SDType model

In [55]:
sdtype_graph = Graph().parse(os.path.join(TRAINED_MODELS_DIR, 'sdtype_final_output.ttl'), format='ttl')

In [69]:
data = {
    'MR': [],
    'MRR': [],
    'hits@1': [],
    'hits@5': [],
    'hits@10': []
}

entities_test = get_testing_entities(test_graph_static, wdt.P31)
ranks, misses, _all = test_sdtype(sdtype_graph, entities_test, train_graph_static)
data['MR'].append(mr_score(ranks))
data['MRR'].append(mrr_score(ranks))
data['hits@1'].append(hits_at_n(ranks, 1))
data['hits@5'].append(hits_at_n(ranks, 5))
data['hits@10'].append(hits_at_n(ranks, 10))

evaluation_sdtype_df = pd.DataFrame(data)
save_df(evaluation_sdtype_df, 'evaluation_sdtype')

  0%|          | 0/4186 [00:00<?, ?it/s]

1105
1158
960
380
285
1359
1228
1102
1419
1410
1079
1158
1093
1458
1158
1158
1110
1080
1433
1441
1275
1470
1110
1359
1110
1079
981
1470
1091
379
1423
1103
1091
1433
1241
1471
1158
275
1158
379
1158
1483
1103
1134
1205
1110
647
1465
1417
1158
1205
699
1110
1442
1051
285
1064
1448
1080
1157
379
956
76
1054
1158
1110
1231
1158
304
1079
1336
1110
722
1095
1054
1411
205
420
1205
1433
1110
1314
1415
1506
1158
1336
1241
353
1433
1110
1458
266
204
258
1171
1433
1110
1087
1079


In [65]:
evaluation_sdtype_df = load_df('evaluation_sdtype')
evaluation_sdtype_df.head()

Unnamed: 0,MR,MRR,hits@1,hits@5,hits@10
0,62.683266,0.303276,0.202142,0.401071,0.517537
