In [1]:
from os.path import expanduser

# Local data directories under the current user's home; "~" is expanded once,
# when this cell runs.
# NOTE(review): neither constant is referenced in the cells visible here —
# confirm they are still needed (the intervention TSV paths below are
# spelled out by hand instead of being built from WIKIDATA_DIR).
WIKIDATA_DIR = expanduser("~/kgdata/wikidata")
WIKIPEDIA_DIR = expanduser("~/kgdata/wikipedia")

In [2]:
import deepdiff, serde.jl, pandas as pd
from loguru import logger
from kgdata.wikidata.db import WikidataDB, get_class_db, get_prop_db, get_entity_db, WDClass, WDProperty
from dataclasses import dataclass
from typing import *
from pathlib import Path

In [5]:
@dataclass
class Intervention:
    objid: str
    action: Literal["list:remove", "list:add"]
    attrpath: list[str]
    value: str | int
    
    def apply(self, obj: dict):
        if self.action == 'list:remove':
            lst = self.get_item(obj, self.attrpath)
            lst.remove(self.value)
        elif self.action == 'list:add':
            lst = self.get_item(obj, self.attrpath)
            lst.append(self.value)
        else:
            raise NotImplementedError()
    
    def get_item(self, obj: dict, attrpath: list[str]):
        for attr in attrpath:
            obj = obj[attr]
        return obj

    def to_dict(self):
        return {'objid': self.objid, 'action': self.action, 'attrpath': ".".join(self.attrpath), 'value': self.value}
        
    @staticmethod
    def from_dict(obj: dict):
        obj['attrpath'] = obj['attrpath'].split(".")
        return Intervention(**obj)
        
    @staticmethod
    def from_tsv(file: Path | str) -> dict[str, list['Intervention']]:
        records = pd.read_csv(file, comment='#', delimiter='\t').to_dict('records')
        lst = list(map(Intervention.from_dict, records))
        idmap = {}
        for item in lst:
            if item.objid not in idmap:
                idmap[item.objid] = []
            idmap[item.objid].append(item)
        return idmap

In [81]:
# Construct the WikidataDB for the directory ~/kgdata/databases/wikidata/20211213
# ("~" expanded here). Later cells use `db.database_dir` to locate the *.db
# and *.fixed.jl.old files they read.
# NOTE(review): 20211213 is presumably the dump date — confirm.
db = WikidataDB(expanduser("~/kgdata/databases/wikidata/20211213"))

## Modify classes

In [95]:
# Read-only handle on the class database stored next to the other DB files.
olddb = get_class_db(db.database_dir / "classes.db", read_only=True)

# Index every record of the old "fixed" classes file by its 'id' field.
id2obj = {
    record['id']: record
    for record in serde.jl.deser(db.database_dir / "classes.fixed.jl.old")
}

def get_label(id):
    """Return ``"<label> (<id>)"`` using the label ``olddb`` stores for ``id``."""
    label = olddb[id].label
    return f"{label} ({id})"

In [96]:
# Load the interventions recorded for classes, grouped by object id
# (Intervention.from_tsv returns {objid: [Intervention, ...]}).
# NOTE(review): the path keeps a literal "~" and is handed to pd.read_csv
# unchanged — confirm it is expanded by the pandas version in use.
id2interven = Intervention.from_tsv("~/kgdata/wikidata/intervention/classes.tsv")

In [97]:
# Check every record of the old "fixed" classes file against the class DB:
# once the recorded interventions are applied to the DB version, the two
# must be identical (with 'ancestors' left out of the comparison).
for id, obj in id2obj.items():
    if id not in olddb:
        # get_label() reads olddb[id], which would fail for an id that is
        # absent from the DB, so log the bare id instead.
        logger.error("{} is not in db", id)
        continue

    print(">>> Process", get_label(id))
    oldobj = olddb[id].to_dict()
    # 'ancestors' is dropped from both sides before comparing.
    obj.pop('ancestors', None)
    oldobj.pop('ancestors')

    diff = deepdiff.diff.DeepDiff(oldobj, obj)
    print('diff', diff)
    print([get_label(x) for x in oldobj['parents']])
    print([get_label(x) for x in obj['parents']])

    if id in id2interven:
        # Plain loop: apply() is called only for its in-place effect on oldobj.
        for mod in id2interven[id]:
            mod.apply(oldobj)
        diff = deepdiff.diff.DeepDiff(oldobj, obj)
    # Any remaining difference is not explained by the interventions.
    assert diff == {}, f"unresolved differences for {get_label(id)}: {diff}"

>>> Process military unit (Q176799)
diff {'iterable_item_removed': {"root['parents'][2]": 'Q781132'}}
['military organization (Q15627509)', 'armed organization (Q17149090)', 'military branch (Q781132)', 'organizational subdivision (Q9261468)']
['military organization (Q15627509)', 'armed organization (Q17149090)', 'organizational subdivision (Q9261468)']
>>> Process house (Q3947)
diff {'values_changed': {"root['parents'][0]": {'new_value': 'Q41176', 'old_value': 'Q11755880'}}}
['residential building (Q11755880)']
['building (Q41176)']
>>> Process polysaccharides (Q134219)
diff {'iterable_item_removed': {"root['parents'][2]": 'Q2553138'}}
['carbohydrate (Q11358)', 'macromolecule (Q178593)', 'glycan (Q2553138)']
['carbohydrate (Q11358)', 'macromolecule (Q178593)']
>>> Process financial services (Q837171)
diff {'iterable_item_removed': {"root['parents'][3]": 'Q806750'}}
['product (Q2424752)', 'business service (Q25351891)', 'service (Q7406919)', 'banking services (Q806750)']
['product (Q2

## Modify properties

In [100]:
# Read-only handle on the property database stored next to the other DB files.
olddb = get_prop_db(db.database_dir / "props.db", read_only=True)

# Index every record of the old "fixed" properties file by its 'id' field.
id2obj = {
    record['id']: record
    for record in serde.jl.deser(db.database_dir / "props.fixed.jl.old")
}

def get_label(id):
    """Return ``"<label> (<id>)"`` using the label ``olddb`` stores for ``id``."""
    label = olddb[id].label
    return f"{label} ({id})"

In [101]:
# Load the interventions recorded for properties, grouped by object id
# (Intervention.from_tsv returns {objid: [Intervention, ...]}).
# NOTE(review): the path keeps a literal "~" and is handed to pd.read_csv
# unchanged — confirm it is expanded by the pandas version in use.
id2interven = Intervention.from_tsv("~/kgdata/wikidata/intervention/props.tsv")

In [103]:
# Check every record of the old "fixed" properties file against the property
# DB: once the recorded interventions are applied to the DB version, the two
# must be identical (with 'ancestors' left out of the comparison).
for id, obj in id2obj.items():
    if id not in olddb:
        # get_label() reads olddb[id], which would fail for an id that is
        # absent from the DB, so log the bare id instead.
        logger.error("{} is not in db", id)
        continue

    print(">>> Process", get_label(id))
    oldobj = olddb[id].to_dict()
    # 'ancestors' is dropped from both sides before comparing.
    obj.pop('ancestors', None)
    oldobj.pop('ancestors')

    diff = deepdiff.diff.DeepDiff(oldobj, obj)
    print('diff', diff)
    print([get_label(x) for x in oldobj['parents']])
    print([get_label(x) for x in obj['parents']])

    if id in id2interven:
        # Plain loop: apply() is called only for its in-place effect on oldobj.
        for mod in id2interven[id]:
            mod.apply(oldobj)
        diff = deepdiff.diff.DeepDiff(oldobj, obj)
    # Any remaining difference is not explained by the interventions.
    assert diff == {}, f"unresolved differences for {get_label(id)}: {diff}"

>>> Process locator map image (P242)
diff {'iterable_item_removed': {"root['parents'][1]": 'P927'}}
['image (P18)', 'anatomical location (P927)']
['image (P18)']
>>> Process detail map (P1621)
diff {}
['locator map image (P242)']
['locator map image (P242)']
>>> Process route map (P15)
diff {}
['locator map image (P242)']
['locator map image (P242)']
>>> Process plan view image (P3311)
diff {}
['detail map (P1621)']
['detail map (P1621)']
>>> Process location (P276)
diff {'iterable_item_removed': {"root['parents'][0]": 'P7153'}}
['significant place (P7153)']
[]
>>> Process located in the administrative territorial entity (P131)
diff {}
['location (P276)', 'part of (P361)']
['location (P276)', 'part of (P361)']
>>> Process significant place (P7153)
diff {}
['location (P276)']
['location (P276)']
>>> Process country (P17)
diff {}
['located in the administrative territorial entity (P131)']
['located in the administrative territorial entity (P131)']
>>> Process country of citizenship (P27)