In [2]:
from os.path import expanduser

# Base directories of the Wikidata and Wikipedia data under ~/kgdata, with the
# leading "~" resolved by expanduser.
WIKIDATA_DIR, WIKIPEDIA_DIR = map(
    expanduser, ("~/kgdata/wikidata", "~/kgdata/wikipedia")
)

In [3]:
import deepdiff, serde.jl, pandas as pd
from loguru import logger
from kgdata.wikidata.db import (
    WikidataDB,
    get_class_db,
    get_prop_db,
    get_entity_db,
    WDClass,
    WDProperty,
)
from dataclasses import dataclass
from typing import *
from pathlib import Path
from kgdata.misc.modification import Modification

In [4]:
# Open the Wikidata databases stored under the 20230619 directory.
wikidata_db_dir = expanduser("~/kgdata/databases/wikidata/20230619")
db = WikidataDB(wikidata_db_dir)

### Cycle breaker

Figure out the incorrect relationships that should be removed to make the type graph acyclic.

In [12]:
# Cached views of the class and property stores, used for the lookups in the
# following cells.
classes, props = db.classes.cache(), db.props.cache()

In [14]:
# Endpoints of every two-node cycle found in the type graph.
_cycle_endpoints = [
    ("Q11606300", "Q55485"),
    ("Q118422967", "Q12011063"),
    ("Q118455743", "Q1371037"),
    ("Q12140", "Q28885102"),
    ("Q1371037", "Q346549"),
    ("Q1371037", "Q845392"),
    ("Q155076", "Q43229"),
    ("Q16023913", "Q189533"),
    ("Q16023913", "Q196756"),
    ("Q189533", "Q3529618"),
    ("Q49371", "Q820655"),
    ("Q56827005", "Q77115"),
]

# Each cycle is stored as the pair of id tuples (a, b) and (b, a).
cycles = [((a, b), (b, a)) for a, b in _cycle_endpoints]

In [54]:
# Show the parents recorded for class Q16023913.
inspected_class = classes["Q16023913"]
inspected_class.parents

['Q196756']

In [53]:
# Index into `cycles` of the cycle to inspect.
i = 8
# Distinct node ids taking part in the selected cycle. They are sorted so that
# the iteration order (and therefore the printed output) is the same on every
# kernel restart; iterating over a bare set of strings is not stable.
cycle = sorted({u for nodes in cycles[i] for u in nodes})

for cid in cycle:
    c = classes[cid]
    print(c.label, c.id)
    # Restrict this class's `ancestors` mapping to the members of the cycle.
    print({k: c.ancestors[k] for k in cycle if k in c.ancestors})

professional certification Q16023913
{'Q196756': 1}
certificate Q196756
{'Q16023913': 3}


In [7]:
# Normalize every cycle (sort its edge tuples) and deduplicate them in a set.
# Built from `cycles` rather than from a pre-existing `lst`: no visible cell
# creates `lst`, so the self-referential form fails on a fresh kernel.
lst = {tuple(sorted(x)) for x in cycles}

In [8]:
# Display the normalized set built in the previous cell.
lst

{(('Q11606300', 'Q55485'), ('Q55485', 'Q11606300')),
 (('Q118422967', 'Q12011063'), ('Q12011063', 'Q118422967')),
 (('Q118455743', 'Q1371037'), ('Q1371037', 'Q118455743')),
 (('Q12140', 'Q28885102'), ('Q28885102', 'Q12140')),
 (('Q1371037', 'Q346549'), ('Q346549', 'Q1371037')),
 (('Q1371037', 'Q845392'), ('Q845392', 'Q1371037')),
 (('Q155076', 'Q43229'), ('Q43229', 'Q155076')),
 (('Q16023913', 'Q189533'), ('Q189533', 'Q16023913')),
 (('Q16023913', 'Q196756'), ('Q196756', 'Q16023913')),
 (('Q189533', 'Q3529618'), ('Q3529618', 'Q189533')),
 (('Q49371', 'Q820655'), ('Q820655', 'Q49371')),
 (('Q56827005', 'Q77115'), ('Q77115', 'Q56827005'))}

### Modify classes

In [95]:
# Class database, opened read-only.
olddb = get_class_db(db.database_dir / "classes.db", read_only=True)

# Index the records of classes.fixed.jl.old by their 'id' field
# (a later record with the same id replaces an earlier one).
id2obj = {
    record['id']: record
    for record in serde.jl.deser(db.database_dir / "classes.fixed.jl.old")
}

def get_label(id):
    """Return "<label> (<id>)" for `id`, reading the label from `olddb[id]`."""
    label = olddb[id].label
    return "{} ({})".format(label, id)

In [96]:
# Load the manual interventions recorded for classes.
# - `Intervention` is never imported or defined in this notebook, whereas the
#   imported `Modification` is otherwise unused, so `Modification` is used here.
#   NOTE(review): assumes `Modification.from_tsv` returns a mapping from object
#   id to a list of modifications (the result is indexed by id and each element
#   is applied with `.apply`) — confirm against kgdata.
# - The "~" is expanded explicitly so the path does not depend on the callee
#   doing it.
id2interven = Modification.from_tsv(
    expanduser("~/kgdata/wikidata/intervention/classes.tsv")
)

In [97]:
# Compare every record of `id2obj` with the entry stored in `olddb` and check
# that all differences are accounted for by the interventions in `id2interven`.
for class_id, obj in id2obj.items():
    if class_id not in olddb:
        # Log the bare id: get_label() indexes `olddb` and would fail for
        # exactly the ids this branch reports.
        logger.error("{} is not in db", class_id)
        continue

    print(">>> Process", get_label(class_id))
    oldobj = olddb[class_id].to_dict()
    # 'ancestors' is left out of the comparison on both sides.
    obj.pop('ancestors', None)
    oldobj.pop('ancestors')

    diff = deepdiff.diff.DeepDiff(oldobj, obj)
    print('diff', diff)
    print([get_label(x) for x in oldobj['parents']])
    print([get_label(x) for x in obj['parents']])

    if class_id in id2interven:
        # Apply the recorded modifications to the old object, then diff again.
        # NOTE(review): relies on `mod.apply` updating `oldobj` in place (its
        # return value is ignored, as in the original) — confirm.
        for mod in id2interven[class_id]:
            mod.apply(oldobj)
        diff = deepdiff.diff.DeepDiff(oldobj, obj)
    assert diff == {}, f"unexplained difference for {get_label(class_id)}: {diff}"

>>> Process military unit (Q176799)
diff {'iterable_item_removed': {"root['parents'][2]": 'Q781132'}}
['military organization (Q15627509)', 'armed organization (Q17149090)', 'military branch (Q781132)', 'organizational subdivision (Q9261468)']
['military organization (Q15627509)', 'armed organization (Q17149090)', 'organizational subdivision (Q9261468)']
>>> Process house (Q3947)
diff {'values_changed': {"root['parents'][0]": {'new_value': 'Q41176', 'old_value': 'Q11755880'}}}
['residential building (Q11755880)']
['building (Q41176)']
>>> Process polysaccharides (Q134219)
diff {'iterable_item_removed': {"root['parents'][2]": 'Q2553138'}}
['carbohydrate (Q11358)', 'macromolecule (Q178593)', 'glycan (Q2553138)']
['carbohydrate (Q11358)', 'macromolecule (Q178593)']
>>> Process financial services (Q837171)
diff {'iterable_item_removed': {"root['parents'][3]": 'Q806750'}}
['product (Q2424752)', 'business service (Q25351891)', 'service (Q7406919)', 'banking services (Q806750)']
['product (Q2

### Modify properties

In [100]:
# Property database, opened read-only.
olddb = get_prop_db(db.database_dir / "props.db", read_only=True)

# Index the records of props.fixed.jl.old by their 'id' field
# (a later record with the same id replaces an earlier one).
id2obj = {
    record['id']: record
    for record in serde.jl.deser(db.database_dir / "props.fixed.jl.old")
}

def get_label(id):
    """Return "<label> (<id>)" for `id`, reading the label from `olddb[id]`."""
    entry = olddb[id]
    return "{} ({})".format(entry.label, id)

In [101]:
# Load the manual interventions recorded for properties.
# - `Intervention` is never imported or defined in this notebook, whereas the
#   imported `Modification` is otherwise unused, so `Modification` is used here.
#   NOTE(review): assumes `Modification.from_tsv` returns a mapping from object
#   id to a list of modifications (the result is indexed by id and each element
#   is applied with `.apply`) — confirm against kgdata.
# - The "~" is expanded explicitly so the path does not depend on the callee
#   doing it.
id2interven = Modification.from_tsv(
    expanduser("~/kgdata/wikidata/intervention/props.tsv")
)

In [103]:
# Compare every record of `id2obj` with the entry stored in `olddb` and check
# that all differences are accounted for by the interventions in `id2interven`.
for prop_id, obj in id2obj.items():
    if prop_id not in olddb:
        # Log the bare id: get_label() indexes `olddb` and would fail for
        # exactly the ids this branch reports.
        logger.error("{} is not in db", prop_id)
        continue

    print(">>> Process", get_label(prop_id))
    oldobj = olddb[prop_id].to_dict()
    # 'ancestors' is left out of the comparison on both sides.
    obj.pop('ancestors', None)
    oldobj.pop('ancestors')

    diff = deepdiff.diff.DeepDiff(oldobj, obj)
    print('diff', diff)
    print([get_label(x) for x in oldobj['parents']])
    print([get_label(x) for x in obj['parents']])

    if prop_id in id2interven:
        # Apply the recorded modifications to the old object, then diff again.
        # NOTE(review): relies on `mod.apply` updating `oldobj` in place (its
        # return value is ignored, as in the original) — confirm.
        for mod in id2interven[prop_id]:
            mod.apply(oldobj)
        diff = deepdiff.diff.DeepDiff(oldobj, obj)
    assert diff == {}, f"unexplained difference for {get_label(prop_id)}: {diff}"

>>> Process locator map image (P242)
diff {'iterable_item_removed': {"root['parents'][1]": 'P927'}}
['image (P18)', 'anatomical location (P927)']
['image (P18)']
>>> Process detail map (P1621)
diff {}
['locator map image (P242)']
['locator map image (P242)']
>>> Process route map (P15)
diff {}
['locator map image (P242)']
['locator map image (P242)']
>>> Process plan view image (P3311)
diff {}
['detail map (P1621)']
['detail map (P1621)']
>>> Process location (P276)
diff {'iterable_item_removed': {"root['parents'][0]": 'P7153'}}
['significant place (P7153)']
[]
>>> Process located in the administrative territorial entity (P131)
diff {}
['location (P276)', 'part of (P361)']
['location (P276)', 'part of (P361)']
>>> Process significant place (P7153)
diff {}
['location (P276)']
['location (P276)']
>>> Process country (P17)
diff {}
['located in the administrative territorial entity (P131)']
['located in the administrative territorial entity (P131)']
>>> Process country of citizenship (P27)