# 3. Refinement Systems
In this notebook we build the RDF datasets (static and dynamic) from the Wikidata edit history that the refinement systems will be trained and evaluated on.

## Notebook setup

### Imports

In [119]:
import json
import math
import os
import pickle
import random
import requests

from collections import defaultdict

In [120]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

from tqdm.auto import tqdm

# Operation kinds and the colour assigned to each one: the i-th operation
# gets the i-th colour of Plotly's default qualitative palette.
op_types = ['add', 'remove', 'replace']
op_colors = dict(zip(op_types, px.colors.qualitative.Plotly))

# Default template for every Plotly figure created afterwards.
pio.templates.default = "plotly_white"

### Loading top classes data

In [285]:
# Seed used wherever this notebook needs reproducible randomness.
RANDOM_SEED = 42

# Endpoint of the Wikidata API.
WIKIDATA_BASE = "https://www.wikidata.org/w/api.php"

# Filesystem layout (all paths are relative to the notebook's working directory).
_OUTPUT_ROOT = 'output'
DATA_DIR = os.path.join('..', 'data')

# Output folder of notebook 1 and the pickled top classes stored in it.
NOTEBOOK1_OUTPUT_DIR = os.path.join(_OUTPUT_ROOT, '1_data_fetching')
CLASSES_FILE = os.path.join(NOTEBOOK1_OUTPUT_DIR, 'top_classes.pkl')

# Folder where this notebook writes its own results.
OUTPUT_DIR = os.path.join(_OUTPUT_ROOT, '3_edit_history_systems')

In [125]:
from dataclasses import dataclass
from typing import List

@dataclass
class KGEntity:
    """A single knowledge-graph entity, identified by its QID.

    NOTE(review): instances of this class appear to be stored inside the
    pickled top-classes file, so its name and fields presumably have to stay
    in sync with the notebook that wrote that file - confirm before changing.
    """
    qid: str  # entity identifier; used to build wd: URIs and to look up revisions
    pagerank_score: float  # presumably the PageRank score of the entity - TODO confirm

@dataclass
class KGClass:
    """A knowledge-graph class together with the entities that are its instances.

    NOTE(review): objects of this class appear to be loaded from the pickled
    top-classes file, so the definition presumably has to match the one used
    when that file was written - confirm before changing.
    """
    name: str  # human-readable name of the class
    qid: str  # identifier of the class; used to build its wd: URI
    classrank_score: float  # presumably the ClassRank score of the class - TODO confirm
    instances: List[KGEntity]  # entities belonging to this class


In [274]:
# Load the top classes from CLASSES_FILE.
# NOTE: unpickling can execute arbitrary code, so only load a file that you
# generated yourself.
with open(CLASSES_FILE, mode='rb') as classes_fh:
    top_classes = pickle.load(classes_fh)

### Connecting to the database
We will now make a connection to the Mongo database where this data is stored. If everything was set up from the docker-compose file, this connection will be configured automatically:

In [7]:
import getpass
import os
import pprint


def env_or_callback(env_name, callback, *args):
    """Return the value of an environment variable, or a fallback if it is unset.

    Args:
        env_name: name of the environment variable to read.
        callback: callable invoked only when the variable is not set.
        *args: positional arguments forwarded to ``callback``.

    Returns:
        The variable's value (an empty string counts as set), otherwise the
        result of ``callback(*args)``.
    """
    env_value = os.getenv(env_name)
    if env_value is None:
        return callback(*args)
    return env_value

In [8]:
from pymongo import MongoClient
import pymongo

# Connection settings: each value is read from the environment variable of the
# same name and, when that variable is not set, requested interactively.
MONGO_USERNAME = env_or_callback("MONGO_USERNAME", input, "Mongo username: ")
# getpass is used for the fallback prompt so the password is not echoed while typing.
MONGO_PASSWORD = env_or_callback("MONGO_PASSWORD", getpass.getpass, "Mongo password: ")
# NOTE(review): MONGO_URL does not appear to be referenced by get_database() in
# this notebook - confirm whether the connection should use it.
MONGO_URL = env_or_callback("MONGO_URL", input, "Mongo url: ")
# Name of the database used both in the connection string and as the handle returned by get_database().
MONGO_DATABASE = "wd_diff"

def get_database():
    """Connect to MongoDB and return a handle to the ``MONGO_DATABASE`` database.

    The connection uses ``MONGO_USERNAME`` / ``MONGO_PASSWORD`` against the
    server at ``127.0.0.1:27017``.

    Returns:
        The pymongo database object for ``MONGO_DATABASE``.
    """
    # Local import so that this cell does not depend on another import cell.
    from urllib.parse import quote_plus

    # Credentials embedded in a MongoDB URI must be percent-escaped; otherwise
    # a username or password containing ':', '/', '@' or '%' yields an invalid
    # connection string.
    username = quote_plus(MONGO_USERNAME)
    password = quote_plus(MONGO_PASSWORD)

    # NOTE(review): the host and port are hard-coded and MONGO_URL is not used
    # here - confirm whether the connection should honour MONGO_URL instead.
    connection_string = f"mongodb://{username}:{password}@127.0.0.1:27017/{MONGO_DATABASE}"

    client = MongoClient(connection_string)
    return client[MONGO_DATABASE]

In [9]:
# Database handle and the two collections used throughout the notebook.
db = get_database()
wd_entities = db["wd_entities"]
wd_revisions = db["wd_revisions"]

In [163]:
# Regular expressions used to filter operations by their `entity_diff.path`.
# They are raw strings so that `\/` is passed through unchanged instead of
# being parsed as an (invalid) Python string escape; the resulting patterns
# are character-for-character the same as before.

# Path ends in /claims/P<digits>.
PURE_PROPERTY_EDIT_REGEX = r"\/claims\/P([0-9]*)$"
# Path starts with /claims (optionally /P<digits>) and contains none of
# /hash, <digit>/id, /references or /qualifiers.
PROPERTY_STATEMENTS_EDIT_REGEX = r"^(?!.*(\/hash|[0-9]\/id|\/references|\/qualifiers))\/claims(\/P([0-9]*))?.*$"
# Path of the form /claims/P<digits>/<digits>/references..., excluding /hash and <digit>/id.
PROPERTY_REFERENCES_EDIT_REGEX = r"^(?!.*(\/hash|[0-9]\/id))\/claims\/P([0-9]*)\/[0-9]*\/references.*$"
# Path of the form /claims/P<digits>/<digits>/qualifiers..., excluding /hash and <digit>/id.
PROPERTY_QUALIFIERS_EDIT_REGEX = r"^(?!.*(\/hash|[0-9]\/id))\/claims\/P([0-9]*)\/[0-9]*\/qualifiers.*$"
# Any path starting with /claims (optionally /P<digits>), excluding /hash and <digit>/id.
PROPERTY_ANY_EDIT_REGEX = r"^(?!.*(\/hash|[0-9]\/id))\/claims(\/P([0-9]*))?.*$"

In [71]:
def get_ops_of_entity(collection, entity_id, path_regex, match_prop=True):
    """Return the diff operations of one entity whose path matches ``path_regex``.

    Every revision document of the entity is unwound into one document per
    element of its ``entity_diff`` array, and only the elements whose ``path``
    matches the regular expression are kept.

    Args:
        collection: collection object exposing ``aggregate(pipeline)``.
        entity_id: value of the ``entity_id`` field to select.
        path_regex: regular expression applied to ``entity_diff.path``.
        match_prop: when true, add a ``prop`` field holding the ``$regexFind``
            result of ``P([0-9]*)`` on the operation path.

    Returns:
        A list with the documents produced by the aggregation, ordered by
        ``timestamp`` and then ``id``.
    """
    pipeline = [
        {"$match": {"entity_id": entity_id}},
        {"$project": {"entity_diff": 1, "entity_id": 1, "id": 1, "timestamp": 1}},
        # The previous sort key ("borough") is not among the projected fields,
        # so it left the order undefined. Callers replay these operations in
        # sequence, so sort by revision time, with the revision id as a
        # tie-breaker.
        # NOTE(review): assumes `timestamp` / `id` increase with revision age - confirm.
        {"$sort": {"timestamp": 1, "id": 1}},
        {"$unwind": "$entity_diff"},
        {"$match": {"entity_diff.path": {"$regex": path_regex}}}
    ]

    if match_prop:
        pipeline.append({"$addFields": {"prop": {"$regexFind": {"input": "$entity_diff.path", "regex": "P([0-9]*)"}}}})

    return list(collection.aggregate(pipeline))

## Building the static entity rdf dataset

In [238]:
import jsonpatch
import pdb

def rebuild_entity_statements(qid, revisions_percentage=0.8, min_ops=10):
    """Rebuild the claims of an entity by replaying part of its statement operations.

    The statement operations of the entity are fetched from ``wd_revisions``
    and the first ``revisions_percentage`` of them are applied, as a JSON
    patch, to an empty ``{"claims": {}}`` document.

    Args:
        qid: identifier of the entity to rebuild.
        revisions_percentage: fraction (0-1) of the operations to replay,
            counted from the start of the list.
        min_ops: minimum number of operations an entity must have; entities
            with fewer are skipped.

    Returns:
        The patched document, or ``None`` when the entity has fewer than
        ``min_ops`` operations.
    """
    entity_ops = get_ops_of_entity(wd_revisions, qid, PROPERTY_STATEMENTS_EDIT_REGEX, match_prop=False)
    if len(entity_ops) < min_ops:
        return None

    split_idx = round(len(entity_ops) * revisions_percentage)

    diffs = []
    for op in entity_ops[:split_idx]:
        diff = op['entity_diff']
        # An 'add' of an empty list at /claims is turned into an empty mapping,
        # presumably so that later operations addressing /claims/P... can be
        # applied. A copy is patched so the fetched operation is left untouched.
        if diff['path'] == '/claims' and diff['op'] == 'add' and diff['value'] == []:
            diff = {**diff, 'value': {}}
        diffs.append(diff)

    return jsonpatch.JsonPatch(diffs).apply({"claims": {}})

In [109]:
from rdflib import Namespace, Graph, URIRef, Literal, BNode
from rdflib.namespace import FOAF, RDF, RDFS

# Namespaces used to build the URIs of the generated triples.
# Wikidata: entities, direct ("truthy") properties and no-value markers.
wd = Namespace("http://www.wikidata.org/entity/")
wdt = Namespace("http://www.wikidata.org/prop/direct/")
wdno = Namespace("http://www.wikidata.org/prop/novalue/")
# GeoSPARQL, used for the datatype of coordinate literals.
geo = Namespace("http://www.opengis.net/ont/geosparql#")

In [253]:
def parse_wikibase_entityid(val):
    """Convert a ``wikibase-entityid`` value into an RDF term.

    Args:
        val: mapping with an ``entity-type`` key and, for items and
            properties, a ``numeric-id`` key.

    Returns:
        ``wd:Q<numeric-id>`` for items, ``wdt:P<numeric-id>`` for properties,
        and ``Literal(val)`` for any other entity type.
    """
    entity_type = val['entity-type']

    if entity_type == 'item':
        return wd[f"Q{val['numeric-id']}"]

    if entity_type == 'property':
        return wdt[f"P{val['numeric-id']}"]

    # Any other entity type is kept as a plain literal of the raw value.
    return Literal(val)

def _globecoordinate_to_wkt(val):
    """Build a ``geo:wktLiteral`` for a coordinate value.

    A wktLiteral must contain WKT text, so the point is written as
    ``Point(<longitude> <latitude>)`` (longitude first) rather than as a
    ``"lat,lon"`` pair, which is not valid WKT.
    """
    return Literal(f"Point({val['longitude']} {val['latitude']})", datatype=geo.wktLiteral)


# Maps the `type` of a snak datavalue to a function that converts its `value`
# into an RDF term. The converters are resolved lazily (inside lambdas /
# helpers) so this cell only needs its dependencies when a value is converted.
datavalue_to_rdf = {
    'string': lambda val: Literal(val),
    'wikibase-entityid': lambda val: parse_wikibase_entityid(val),
    'globecoordinate': _globecoordinate_to_wkt,
    'commonsMedia': lambda val: Literal(val),
    'url': lambda val: Literal(val),
    'external-id': lambda val: Literal(val),
    # Text literal tagged with its language.
    'monolingualtext': lambda val: Literal(val['text'], lang=val['language']),
    # Only the amount is kept, converted to float.
    'quantity': lambda val: Literal(float(val['amount'])),
    # Only the `time` field of the value is kept, as a plain literal.
    'time': lambda val: Literal(val['time'])
}

def snak_json_to_simple_object(snak_json, prop_id):
    """Convert a snak into the RDF term used as the object of a triple.

    Args:
        snak_json: mapping with a ``snaktype`` key and, for snaks that carry
            a value, a ``datavalue`` mapping with ``type`` and ``value``.
        prop_id: identifier of the property the snak belongs to.

    Returns:
        ``wdno[prop_id]`` for ``novalue`` snaks, a fresh blank node for
        ``somevalue`` snaks and for snaks without a ``datavalue``, and
        otherwise the term built by the ``datavalue_to_rdf`` converter
        registered for the datavalue type.
    """
    kind = snak_json['snaktype']

    if kind == 'novalue':
        return wdno[prop_id]

    # Unknown values, and value snaks that carry no datavalue, become blank nodes.
    if kind == 'somevalue' or 'datavalue' not in snak_json:
        return BNode()

    datavalue = snak_json['datavalue']
    value_type = datavalue['type']
    raw_value = datavalue['value']
    converter = datavalue_to_rdf[value_type]
    return converter(raw_value)

def entity_json_to_rdf(entity_qid, entity_json):
    """Build an RDF graph from the claims of one entity.

    The graph always contains ``wd:<entity_qid> rdf:type wd:entity``. For
    every claim found under ``entity_json['claims']`` it also contains one
    triple ``wd:<entity_qid> wdt:<prop_id> <object>``, where the object is
    derived from the claim's main snak.

    Args:
        entity_qid: identifier of the entity.
        entity_json: mapping that may contain a ``claims`` mapping from
            property id to a list of claims.

    Returns:
        A new graph holding the triples described above.
    """
    subject = wd[entity_qid]

    graph = Graph()
    graph.add((subject, RDF.type, wd.entity))

    for prop_id, claims in entity_json.get('claims', {}).items():
        predicate = wdt[prop_id]
        for claim in claims:
            obj = snak_json_to_simple_object(claim['mainsnak'], prop_id)
            graph.add((subject, predicate, obj))

    return graph


In [302]:
# Number of entities requested when, for every class, we ask for 10% of its
# instances but never fewer than 2000.
num_entities = sum(
    max(2000, round(len(kg_class.instances) * 0.1))
    for kg_class in top_classes
)

num_entities

1089219

In [312]:
def build_static_rdf_dataset(train_rev_split=0.8, num_entities_per_class=lambda num_instances: max(2000, round(num_instances * 0.1))):
    """Build the train and test RDF graphs from a sample of entities of every top class.

    For each class in ``top_classes`` a seeded random sample of its instances
    is taken. Every sampled entity is rebuilt twice: from the first
    ``train_rev_split`` fraction of its statement operations (train state)
    and from all of them (complete state). The train state is added to the
    train graph; the triples of the complete state that are not in the train
    graph are added to the test graph. Entities for which
    ``rebuild_entity_statements`` returns ``None`` are skipped.

    Args:
        train_rev_split: fraction (0-1) of each entity's operations used to
            build its train state.
        num_entities_per_class: callable receiving the number of instances of
            a class and returning how many of them to sample at most.

    Returns:
        A ``(train_graph, test_graph)`` tuple.
    """
    train_graph = Graph()
    test_graph = Graph()
    graphs = [train_graph, test_graph]

    for g in graphs:
        g.bind("geo", geo)
        g.bind("wd", wd)
        g.bind("wdt", wdt)
        g.bind("wdno", wdno)

    rng = random.Random(RANDOM_SEED)
    for kg_class in tqdm(top_classes):
        for g in graphs:
            g.add((wd[kg_class.qid], RDF.type, wd.kg_class))

        # Shuffle a copy: shuffling `kg_class.instances` itself would reorder
        # the shared `top_classes` data, so a second call would start from a
        # different order and select a different sample despite the fixed seed.
        instances = list(kg_class.instances)
        rng.shuffle(instances)
        end_idx = num_entities_per_class(len(instances))
        for entity in tqdm(instances[:end_idx]):
            entity_qid = entity.qid
            rebuilt_train_entity = rebuild_entity_statements(entity_qid, revisions_percentage=train_rev_split)
            if rebuilt_train_entity is None:
                # Skipped entity: do not spend a second query on its complete state.
                continue

            rebuilt_complete_entity = rebuild_entity_statements(entity_qid, revisions_percentage=1.0)
            train_entity_graph = entity_json_to_rdf(entity_qid, rebuilt_train_entity)
            complete_entity_graph = entity_json_to_rdf(entity_qid, rebuilt_complete_entity)

            train_graph += train_entity_graph
            # Only what is new with respect to the train graph goes to the test graph.
            test_graph += (complete_entity_graph - train_graph)

    return train_graph, test_graph


In [313]:
# Build a sample dataset: 70% of each entity's operations for training and at
# most 5000 entities per class.
train_graph, test_graph = build_static_rdf_dataset(train_rev_split=0.7, num_entities_per_class=lambda num_instances: 5000)

# Make sure the output folder exists before writing, and state the format
# explicitly so the .ttl files contain Turtle whatever the library default is.
os.makedirs(OUTPUT_DIR, exist_ok=True)
train_graph.serialize(destination=os.path.join(OUTPUT_DIR, 'train_graph_sample.ttl'), format='turtle')
test_graph.serialize(destination=os.path.join(OUTPUT_DIR, 'test_graph_sample.ttl'), format='turtle');

  0%|          | 0/91 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

KeyboardInterrupt: 

Quick check to see how many new 'instance of' values have appeared:

In [None]:
# `complete_graph` is not defined anywhere in this notebook; the graph holding
# the complete-state triples is `test_graph`. Subtracting the train graph
# leaves only the triples that do not appear in the training data.
diff_graph = test_graph - train_graph

In [None]:
# Count the triples of the diff whose predicate is wdt:P31 ('instance of').
num_new_instance_of = sum(1 for _ in diff_graph.triples((None, wdt.P31, None)))
num_new_instance_of

**TODO: maybe we should remove from the dataset the entities that have no new 'instance of' values in the test dataset**

**Option 2: we keep those entities but don't take them into account when evaluating the model**

## Building the dynamic entity rdf dataset

In [None]:
def entity_rev_data_to_rdf():
    """Placeholder: not implemented yet; takes no arguments and returns ``None``."""
    return None


In [None]:
def build_dynamic_rdf_dataset():
    """Placeholder: not implemented yet; takes no arguments and returns ``None``."""
    return None


## Testing zone

Clasificador:
* Entrada:
    * 'Representación' de la entidad (e.g. embedding con info temporal o sin ella dependiendo del clasificador).
    * 'Representación' de la propiedad.
    * Tipo de operación nueva.
    * Path de la operación.
    * Valor nuevo de la operación.
* Salida:
    * Se acepta o no (0/1).
    * Dependiendo del clasificador, umbral de confianza.
    
Podríamos tener los siguientes clasificadores:
* a) Baseline: utilizando solo info de la entidad actual (sin edit history).
* b) El mismo que antes, pero utilizando información del historial de ediciones. Este se podría dividir en varios en función de la información utilizada (p.ej. uno que analiza el historial global vs otro que utiliza info de cada decil).
* c) Mirando si esa operación fue eliminada antes (lo rechaza directamente), y si no lo pasa a uno de los clasificadores anteriores.

Podríamos probar las siguientes cosas:
* Clasificadores de antes con la info hasta noviembre (lo que tenemos indexado), y ver el rendimiento en las operaciones que pasaron desde entonces hasta 1 de marzo p.ej.
* Clasificadores con historial de ediciones. Vamos viendo cómo mejora el rendimiento cuanta más info tenemos (p.ej. tiramos solo del 20% de ediciones para predecir lo siguiente, luego del 40%, del 60%...).
* Diferencia en los resultados entre clases (en cuáles hay más precisión, menos...). Esto lo podemos encadenar con el classrank, o las métricas de conflicto que saquemos para ver la relación existente. Esto puede ser útil para saber qué clases tienen mayor potencial de aplicar estas técnicas y cuáles no.