In [1]:
from pathlib import Path
import os, sys

import kgdata

# Make the repository's "scripts" directory (two levels above the kgdata
# package file) importable; presumably this is where `cycle_breaker`, imported
# in the next cell, lives.
scripts_dir = str(Path(kgdata.__file__).parent.parent / "scripts")
# Guard against duplicates so that re-running this cell keeps sys.path clean.
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

In [2]:
from cycle_breaker import setup_env

# Prepare the environment through the helper imported from `cycle_breaker`.
# Per the captured log line below, this initialises kgdata's Wikidata config,
# which reports the Wikidata directory in use.
setup_env()

[32m2024-08-16 22:26:06.735[0m | [1mINFO    [0m | [36mkgdata.wikidata.config[0m:[36minit[0m:[36m135[0m - [1mWikidata directory: /workspace/sm-research/data/kgdata/wikidata/20240320[0m


In [3]:
from kgdata.wikidata.config import WikidataDirCfg
from kgdata.wikidata.datasets.classes import classes

# Wikidata directory configuration; later cells read `cfg.classes` and
# `cfg.modification` from it.
cfg = WikidataDirCfg.get_instance()

In [4]:
from functools import partial
from kgdata.dataset import Dataset
from kgdata.db import deser_from_dict
from kgdata.wikidata.models.wdclass import WDClass


# Dataset over the gzip files matched by "full-en/*.gz" under `cfg.classes`;
# every record is deserialised into a WDClass via `deser_from_dict`.
# The string literals carry no placeholders, so they are plain strings rather
# than f-strings.
cls_datasets = Dataset(
    cfg.classes / "full-en/*.gz",
    deserialize=partial(deser_from_dict, WDClass),
    name="classes/full/en",
    dependencies=[],
)

In [6]:
# Materialise the whole class dataset as an in-memory list. The captured
# progress bar counts 65 items, presumably the matched .gz files.
all_clses = cls_datasets.get_list()

read dataset:   0%|          | 0/65 [00:00<?, ?it/s]

In [7]:
# Alias for the loaded classes; the graph-building cell iterates over `records`.
records = all_clses

In [8]:
import rustworkx

from graph.retworkx import BaseEdge, BaseNode, RetworkXStrDiGraph
from graph.retworkx import api as retworkx_api

# Directed class graph: one node per class id, one edge per class -> parent link.
# Cycle checking is switched off because finding cycles is the point of this
# notebook.
g = RetworkXStrDiGraph(check_cycle=False, multigraph=False)

# First pass: register every class as a node so that all edge endpoints exist.
for wdclass in records:
    g.add_node(BaseNode(wdclass.id))

# Second pass: connect each class to each of its listed parents.
for wdclass in records:
    child_id = wdclass.id
    for parent_id in wdclass.parents:
        g.add_edge(BaseEdge(-1, child_id, parent_id, 1))

In [15]:
from tqdm import tqdm

In [28]:
def all_cycles(g):
    """Return every simple cycle of ``g`` as a list of node ids.

    Cycles are enumerated with ``rustworkx.simple_cycles`` on the wrapped
    ``g._graph``. Each node index it yields is translated to the ``id``
    attribute of the node data stored at that index.

    Args:
        g: graph wrapper exposing the underlying rustworkx graph as ``_graph``.

    Returns:
        A list containing one list of node ids per cycle, in the order in
        which ``rustworkx.simple_cycles`` produced them.
    """
    graph = g._graph
    return [
        [graph.get_node_data(node_index).id for node_index in index_cycle]
        for index_cycle in rustworkx.simple_cycles(graph)
    ]

In [29]:
# Enumerate all simple cycles of the class graph as lists of class ids.
cycles = all_cycles(g)

In [43]:
# Number of cycles found.
len(cycles)

38

In [35]:
from collections import Counter

# Distribution of cycle lengths: cycle length -> number of cycles of that length.
Counter(map(len, cycles))

Counter({2: 31, 3: 6, 4: 1})

In [40]:
# Collect (class, parent) edges that exist in the graph but whose parent is no
# longer listed for that class in `clsdb`.
# NOTE(review): `clsdb` is only created in a cell further down this file, so a
# top-to-bottom run fails here with NameError until that cell is moved up.
del_edges = []
for cycle in cycles:
    # Look up the class-DB entry for every node taking part in this cycle.
    id2c = {uid: clsdb[uid] for uid in cycle}
    for cid, c in id2c.items():
        # Parents according to the graph (targets of the node's outgoing edges).
        graph_parents = {e.target for e in g.out_edges(c.id)}
        # Whatever the class DB does not list as a parent is marked for removal.
        for cpid in graph_parents.difference(c.parents):
            del_edges.append((cid, cpid))

In [41]:
# Inspect the (class, parent) pairs selected for removal.
del_edges

[('Q1144915', 'Q10925'),
 ('Q181296', 'Q83478'),
 ('Q154954', 'Q5'),
 ('Q114836842', 'Q2566598'),
 ('Q1143413', 'Q1083980'),
 ('Q154', 'Q266143'),
 ('Q6559431', 'Q111907435'),
 ('Q2295790', 'Q107596121'),
 ('Q11028', 'Q9081'),
 ('Q6500773', 'Q55645123'),
 ('Q1354775', 'Q5003624'),
 ('Q811430', 'Q811979')]

In [42]:
# Drop every flagged class -> parent edge from the graph (mutates `g`).
for source_id, target_id in del_edges:
    g.remove_edges_between_nodes(source_id, target_id)

In [44]:
# Re-enumerate the cycles now that the flagged edges have been removed from `g`.
new_cycles = all_cycles(g)

In [47]:
# Cycles that remain after the edge removal (the captured output lists 28).
new_cycles

[['Q57979447', 'Q57979691'],
 ['Q5767753', 'Q1326966'],
 ['Q18848', 'Q181296'],
 ['Q11946197', 'Q699', 'Q3306693'],
 ['Q63177917', 'Q63177820'],
 ['Q26513', 'Q63177820'],
 ['Q1532173', 'Q230047'],
 ['Q1501855', 'Q122961326'],
 ['Q2366588', 'Q4669603'],
 ['Q4669603', 'Q16856760'],
 ['Q2257937', 'Q381045'],
 ['Q24770198', 'Q24778143', 'Q13417346'],
 ['Q3009426', 'Q82671060'],
 ['Q3008312', 'Q82671060'],
 ['Q18554140', 'Q6152917'],
 ['Q7695332', 'Q116702093', 'Q36528'],
 ['Q118581330', 'Q118955299'],
 ['Q2514663', 'Q11572062'],
 ['Q112122809', 'Q29584854'],
 ['Q16675435', 'Q18033462'],
 ['Q6500773', 'Q55379489'],
 ['Q1075470', 'Q56003275', 'Q775593'],
 ['Q12140', 'Q28885102', 'Q12034612'],
 ['Q1476157', 'Q124363171'],
 ['Q1124833', 'Q21163245'],
 ['Q4880696', 'Q2883473'],
 ['Q212903', 'Q1394771'],
 ['Q828800', 'Q530012']]

In [16]:
# Raw enumeration: each element is a sequence of rustworkx node indices rather
# than class ids; tqdm only adds a progress readout.
# NOTE(review): this rebinds `cycles`. Its execution count (16) predates the
# `all_cycles` cells, so in a top-to-bottom run it would overwrite the id-based
# `cycles` and operate on the graph after the edge removal. Confirm whether
# this exploratory cell should be kept.
cycles = list(tqdm(rustworkx.simple_cycles(g._graph)))

38it [00:00, 52.91it/s]


In [17]:
# Number of cycles. In file order `cycles` is the raw index-based list built by
# the `simple_cycles` call directly above.
len(cycles)

38

In [27]:
# Translate two node indices back to class ids; the indices match the first
# raw cycle shown by `list(cycles[0])` in the following cell.
# NOTE(review): hard-coded indices, presumably valid only for this graph build.
g._graph.get_node_data(1657353).id, g._graph.get_node_data(632841).id

('Q57979447', 'Q57979691')

In [20]:
# First raw cycle, materialised as a list of node indices.
list(cycles[0])

[1657353, 632841]

In [37]:
from kgdata.wikidata.db import query_wikidata_entities, get_class_db

In [38]:
# Open the class database stored at `cfg.modification / "classes.db"`.
# The `del_edges` cell earlier in this file reads from `clsdb`, so this cell has
# to run before it.
# NOTE(review): opened with read_only=False although the visible cells only
# read from it. Confirm whether write access is actually needed.
clsdb = get_class_db(cfg.modification / "classes.db", read_only=False, proxy=True)

In [39]:
# Spot-check: parents recorded in `clsdb` for Q811430, the class in the last
# pair of the `del_edges` output.
clsdb["Q811430"].parents

['Q618123']

In [None]:
# NOTE(review): this cell has no execution count, so it has apparently never
# been run. `query_wikidata_entities` is called without arguments; confirm its
# signature before relying on this.
new_ents = query_wikidata_entities()