# 4.- Model evaluation

## Notebook setup

In [1]:
import os

from tqdm.auto import tqdm

In [2]:
from rdflib import Namespace, Graph, URIRef, Literal, BNode
from rdflib.namespace import FOAF, RDF, RDFS

geo = Namespace("http://www.opengis.net/ont/geosparql#")
uo = Namespace("https://purl.org/uniovi/wd-edit-history#")
wd = Namespace("http://www.wikidata.org/entity/")
wdt = Namespace("http://www.wikidata.org/prop/direct/")
wdno = Namespace("http://www.wikidata.org/prop/novalue/")

In [3]:
REFINEMENT_SYSTEMS_OUTPUT_DIR = os.path.join('output', '3_edit_history_systems')
OUTPUT_DIR = os.path.join('output', '4_model_evaluation')

RANDOM_SEED = 42

WIKIDATA_BASE = "https://www.wikidata.org/w/api.php"

In [4]:
def load_df(filename):
    with open(os.path.join(OUTPUT_DIR, f"{filename}.pkl"), 'rb') as f:
        return pickle.load(f)

def save_df(df, filename):
    with open(os.path.join(OUTPUT_DIR, f"{filename}.pkl"), 'wb') as f:
        pickle.dump(df, f)

In [7]:
train_graph_static = Graph().parse(os.path.join(REFINEMENT_SYSTEMS_OUTPUT_DIR, 'complete_train_graph_static.ttl'), format='ttl')
test_graph_static = Graph().parse(os.path.join(REFINEMENT_SYSTEMS_OUTPUT_DIR, 'test_graph_sample_static.ttl'), format='ttl')

In [8]:
import pdb
import numpy as np
import torch

from pykeen.triples import TriplesFactory

known_entities = set()
known_relations = set()

def entity_hashed_id(entity_uri):
    return hash(entity_uri.split('/')[-1])

def build_triples_factory_train(graph):
    triples = []
    for t in graph:
        if not t[2].startswith(str(wdt)) and not t[2].startswith(str(wd)):
            continue
            
        triple_str = [str(t[0]), str(t[1]), str(t[2])]
        if triple_str[0] not in known_entities:
            known_entities.add(triple_str[0])
        
        if triple_str[1] not in known_relations:
            known_relations.add(triple_str[1])
        
        if triple_str[2] not in known_entities:
            known_entities.add(triple_str[2])

        triples.append(triple_str)
    return TriplesFactory.from_labeled_triples(np.array(triples))

def build_triples_factory_test(graph, train_tf):
    triples = []
    for t in graph:
        if not t[2].startswith(str(wdt)) and not t[2].startswith(str(wd)):
            continue
        
        triple_str = [str(t[0]), str(t[1]), str(t[2])]
        if triple_str[0] not in known_entities or triple_str[1] not in known_relations or triple_str[2] not in known_entities:
            continue
        triples.append(triple_str)

    return TriplesFactory.from_labeled_triples(np.array(triples), entity_to_id=train_tf.entity_to_id, relation_to_id=train_tf.relation_to_id)

tf_train = build_triples_factory_train(train_graph_static)
tf_test = build_triples_factory_test(test_graph_static, tf_train)

## Defining the evaluation metrics

In [9]:
def hits_at_n(ranks, n):
    return sum([1 if r <= n else 0 for r in ranks]) / len(ranks)

def mr_score(ranks):
    mr_score = 0
    for r in ranks:
        mr_score += r
    return mr_score / len(ranks)

def mrr_score(ranks):
    mr_score = 0
    for r in ranks:
        mr_score += 1 / r
    return mr_score / len(ranks)

## Testing functions

In [9]:
import pdb

def get_testing_entities(test_graph_static, prop = None):
    # get all entities that have a new P31 value in the test data
    entities_test = set()
    for t in test_graph_static.triples((None, prop, None)):
        entities_test.add((str(t[0]), str(t[1]), str(t[2])))
    return entities_test

In [10]:
def test_supervised(model, entities_test, entity_2_embeddings, all_classes):
    ranks = []
    misses = 0
    _all = 0
    
    # generic model useful for any property, embeddings = subj + prop + obj
    embedding_size = len(list(entity_2_embeddings.values())[0])
    X = np.zeros((len(all_classes), embedding_size * 3))
    for i, kg_class in enumerate(all_classes):
        X[i, 2*embedding_size:] = entity_2_embeddings[kg_class]
        
    
    for entity, prop, true_class in tqdm(entities_test):
        _all += 1

        X[:, :embedding_size] = entity_2_embeddings[entity]
        X[:, embedding_size:2*embedding_size] = entity_2_embeddings[prop]

        pred = model.predict_proba(X)
        entity_results = [(kg_class, pred[idx][1]) for idx, kg_class in enumerate(all_classes)]
        entity_results.sort(key=lambda item: item[1], reverse=True)
        sorted_predictions = [e[0] for e in entity_results]
        if true_class not in sorted_predictions:
            misses += 1
            continue
        idx = sorted_predictions.index(true_class)
        ranks.append(idx + 1)
    return ranks, misses, _all

In [29]:
from pykeen.models import predict

def test_unsupervised(model, entities_test, triples_factory, train_graph_static):
    ranks = []
    misses = 0
    _all = 0
    for entity, prop, true_class in tqdm(entities_test):
        _all += 1
        if entity not in triples_factory.entity_to_id or prop not in triples_factory.relation_to_id:
            misses += 1
            continue
        
        pred_df = predict.get_tail_prediction_df(model, entity, str(prop), triples_factory=triples_factory, add_novelties=False)
        pred_rank = 1
        for row in pred_df.itertuples():
            if row[2] == true_class:
                ranks.append(pred_rank)
                break
            else:
                # manual filtering of known triples
                if (URIRef(entity), URIRef(prop), URIRef(row[2])) not in train_graph_static:
                    pred_rank += 1
    return ranks, misses, _all

## Evaluating the models

In [None]:
props_to_evaluate = [wdt.P31]

### Unsupervised models

In [13]:
import torch

with open('model.pkl', 'rb') as f:
    model = torch.load(f, map_location=torch.device('cpu'))

In [30]:
ranks, misses, _all = test_system_unsupervised(model, test_entities, tf_train, train_graph_static)

  0%|          | 0/4186 [00:00<?, ?it/s]

MR score: 10819.282152230971
MRR score: 0.1774430164555107
hits@1: 0.08582677165354331
hits@5: 0.3
hits@10: 0.38188976377952755
9
4186


In [15]:
ranks[:5]

[1319, 190, 3846, 5, 13]

In [None]:
data = {
    'model': [],
    'sampler': [],
    'prop': [],
    'MR': [],
    'MRR': [],
    'hits@1': [],
    'hits@5': [],
    'hits@10': []
}

TRAINED_MODELS_DIR = os.path.join(REFINEMENT_SYSTEMS_OUTPUT_DIR, 'models')
model_directories = [f for f in os.listdir(TRAINED_MODELS_DIR) if os.path.isdir(os.path.join(TRAINED_MODELS_DIR, f))]

for prop in props_to_evaluate:
    logger.info(str(prop))
    test_entities = get_testing_entities(test_graph_static, prop)
    for model_dir in model_directories:
        logger.info(model_dir)
        samplers_directories = [f for f in os.listdir(os.path.join(TRAINED_MODELS_DIR, model_dir)) if os.path.isdir(os.path.join(TRAINED_MODELS_DIR, model_dir, f))]
        for sampler in samplers_directories:
            logger.info(sampler)
                path = os.path.join(MODEL_DIR, model_dir, sampler)
                with open(os.path.join(path, 'model.pkl'), 'rb') as f:
                    model = torch.load(f, map_location=torch.device('cpu'))
                ranks, misses, _all = test_system_unsupervised(model, test_entities, tf_train, train_graph_static)
                data['model'].append(model_dir)
                data['sampler'].append(sampler)
                data['prop'].append(str(prop))
                data['MR'].append(mr_score(ranks))
                data['MRR'].append(mrr_score(ranks))
                data['hits@1'].append(hits_at_n(ranks, 1))
                data['hits@5'].append(hits_at_n(ranks, 5))
                data['hits@10'].append(hits_at_n(ranks, 10))
    logger.info("\n\n")

evaluation_unsupervised_df = pd.DataFrame(data)
save_df(evaluation_unsupervised_df, 'evaluation_unsupervised')

In [None]:
evaluation_unsupervised_df = load_df('evaluation_unsupervised')
evaluation_unsupervised_df.head()

### Supervised model

In [None]:
with open(os.path.join(TRAINED_MODELS_DIR, 'supervised', 'model.pkl'), 'rb') as f:
    supervised_model = pickle.load(f)

with open(os.path.join(TRAINED_MODELS_DIR, 'supervised', 'embeddings.pkl'), 'rb') as f: 
    entity_2_embeddings = pickle.dump(f)

In [None]:
data = {
    'prop': [],
    'MR': [],
    'MRR': [],
    'hits@1': [],
    'hits@5': [],
    'hits@10': []
}

possible_entities = list(set([str(s) for s in train_graph_static.subjects(None, None) if isinstance(s, URIRef)] + \
                             [str(s) for s in train_graph_static.objects(None, None) if isinstance(s, URIRef)]))

for prop in props_to_evaluate:
    logger.info(str(prop))
    entities_test = get_testing_entities(test_graph_static, prop)
    ranks, misses, _all = test_supervised(supervised_model, entities_test, entity_2_embeddings, possible_entities)
    data['prop'].append(str(prop))
    data['MR'].append(mr_score(ranks))
    data['MRR'].append(mrr_score(ranks))
    data['hits@1'].append(hits_at_n(ranks, 1))
    data['hits@5'].append(hits_at_n(ranks, 5))
    data['hits@10'].append(hits_at_n(ranks, 10))
    
evaluation_supervised_df = pd.DataFrame(data)
save_df(evaluation_supervised_df, 'evaluation_supervised')

In [None]:
evaluation_supervised_df = load_df('evaluation_supervised')
evaluation_supervised_df.head()