In [120]:
from pathlib import Path

In [121]:
from rich import print

In [122]:
from paranames.util import read

In [123]:
import pandas as pd

In [187]:
# def read_tsv(path: Path) -> pd.DataFrame:
#     return read(path, "tsv")

### Load everything (obsolete)

In [125]:
# per_with_path = Path("/home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/PER_instance_of_counts.tsv")
# per_without_path = Path("/home/jonne/datasets/paranames/analyze-instance-ofs-withoutsubclassing-3col/PER_instance_of_counts.tsv")

In [126]:
# per_with = read_tsv(per_with_path)
# per_without = read_tsv(per_without_path)

In [127]:
# loc_with_path = Path("/home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/LOC_instance_of_counts.tsv")
# loc_without_path = Path("/home/jonne/datasets/paranames/analyze-instance-ofs-withoutsubclassing-3col/LOC_instance_of_counts.tsv")

In [128]:
# loc_with = read_tsv(loc_with_path)
# loc_without = read_tsv(loc_without_path)

In [129]:
# org_with_path = Path("/home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/ORG_instance_of_counts.tsv")
# org_without_path = Path("/home/jonne/datasets/paranames/analyze-instance-ofs-withoutsubclassing-3col/ORG_instance_of_counts.tsv")

In [130]:
# org_with = read_tsv(org_with_path)
# org_without = read_tsv(org_without_path)

In [131]:
# with_dfs = [per_with, loc_with]
# with_dfs.append(org_with)

In [132]:
# without_dfs = [per_without, loc_without]
# without_dfs.append(org_without)

In [133]:
# conll_types = ["PER", "LOC"]
# conll_types.append("ORG")

### How many entities <u>more</u> do we detect by using subclassing?

In [188]:
%%bash
wc -l /home/jonne/datasets/paranames/analyze-instance-ofs-with*subclassing-3col/PER.jsonl

   8726412 /home/jonne/datasets/paranames/analyze-instance-ofs-withoutsubclassing-3col/PER.jsonl
   8730734 /home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/PER.jsonl
  17457146 total


In [189]:
%%bash
wc -l /home/jonne/datasets/paranames/analyze-instance-ofs-with*subclassing-3col/LOC.jsonl

     2923 /home/jonne/datasets/paranames/analyze-instance-ofs-withoutsubclassing-3col/LOC.jsonl
  3078459 /home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/LOC.jsonl
  3081382 total


In [190]:
%%bash
wc -l /home/jonne/datasets/paranames/analyze-instance-ofs-with*subclassing-3col/ORG.jsonl

    58753 /home/jonne/datasets/paranames/analyze-instance-ofs-withoutsubclassing-3col/ORG.jsonl
  2196303 /home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/ORG.jsonl
  2255056 total


In [191]:
print(f"[PER] Difference = {8730734 - 8726412}")
print(f"[LOC] Difference = {3078459 - 2923}")
print(f"[ORG] Difference = {2196303 - 58753}")

### PER

First let's create a set of all entities that are a subclass of `Q5`: 

In [141]:
from qwikidata.sparql import get_subclasses_of_item

In [142]:
subclass_of_human = set(get_subclasses_of_item('Q5', return_qids=True))

Then let's loop through *all* entities categorized as `PER` (but not an instance of `Q5`) and see what makes them `PER`. 

In [192]:
import orjson
from rich.progress import track
from collections import Counter

In [193]:
import pymongo
client = pymongo.MongoClient(port=27617)
db = client['paranames_db_022822']
coll = db['paranames']

In [194]:
per_entities_jsonl_path = Path("/home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/PER.jsonl")

In [195]:
%%bash
wc -l /home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/PER.jsonl

8730734 /home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/PER.jsonl


In [196]:
def grab_first_available(label_dict):
    if not label_dict:
        return ''
    first_lang = [l for l in label_dict].pop(0)
    return label_dict[first_lang]

In [197]:
why_per = {}
with open(per_entities_jsonl_path, 'r', encoding="utf-8") as fin:
    look_up_these = set()
    for ix, line in track(enumerate(fin), total=8730733, description="Checking what makes PER entities PER..."):
        row = orjson.loads(line)
        wid = row['wikidata_id']
        name = row['name']
        instance_of = set(row['instance_of'])
        if "Q5" in instance_of:
            continue
        common = instance_of.intersection(subclass_of_human)
        if len(common) == 0:
            print(f"[{wid}] Entity '{name}' is not an instance of Q5 or any of its subclasses.")
        else:
            look_up_these.update(common)
            why_per[wid] = common
            
# Make the instance-of names human readable
looked_up_names = {
    r['id']: r['name']
    if r['name'] 
    else grab_first_available(r['labels']) 
    for r in coll.find(
        {"id": {"$in": list(look_up_these)}}
    )
}

why_per_human_readable = {}
for wid, inst_of_set in track(why_per.items(), description="Making the IDs human readable..."):
    why_per_human_readable[wid] = {
        (qid, looked_up_names.get(qid))
        for qid in inst_of_set
    }

Output()

Output()

OK, every PER entity is an instance of something that is a subclass of Q5. 

Now we can tally why the PER entities that were only detected with subclasses were categorized as such:

In [200]:
why_per_counts = Counter()
for _, inst_of in why_per_human_readable.items():
    string_labels =  " & ".join([f"{name} ({qid})" or qid for qid, name in inst_of])
    why_per_counts[string_labels] += 1

In [201]:
why_per_counts.most_common()

[('federally recognized Native American tribe in the United States (Q7840353)',
  164),
 ('human fetus (Q26513)', 68),
 ('hypothetical person (Q75855169)', 51),
 ('Police and Crime Commissioner (Q58333)', 39),
 ('minister (Q83307)', 36),
 ('inhabitant (Q22947)', 32),
 ('Permanent Secretary (Q57901836)', 28),
 ('fictional human formerly considered to be historical (Q64520857)', 26),
 ('Director of Bureau (Q23931359)', 25),
 ('ambassador to the United States of America (Q19359052)', 24),
 ('Native Americans in the United States (Q49297)', 24),
 ('ombudsman (Q169180)', 23),
 ('civil servant (Q212238)', 23),
 ('indigenous peoples of the Americas (Q36747)', 22),
 ('passenger (Q319604)', 20),
 ("ambassador to the People's Republic of China (Q30064335)", 19),
 ('prisoner of a Nazi concentration camp (Q1719325) & prisoner (Q1862087)',
  16),
 ('board member (Q2824523)', 16),
 ('Native American tribe (Q12885585)', 13),
 ('ambassador of the United States of America (Q15726790)', 13),
 ('judge (Q

### Where do these weird inheritance hierarchies come from?

This will be infeasible to analyze for everything but here are a few examples:

For example, `Quechan people` is an instance of `indigenous peoples of the Americas ` (Q36747).

In [159]:
%%bash

rg "Q36747" /home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/PER.jsonl | head -n 1 | jq

{
  "wikidata_id": "Q1754503",
  "name": "Quechan people",
  "type": "PER",
  "instance_of": [
    "Q7840353",
    "Q36747",
    "Q133311"
  ]
}


We can check what it's an instance of with human-readable names:

In [161]:
why_per_human_readable["Q1754503"]

{('Q36747', 'indigenous peoples of the Americas'),
 ('Q7840353',
  'federally recognized Native American tribe in the United States')}

Looking at [`indigenous peoples of the Americas`](https://www.wikidata.org/wiki/Q36747), we can see that it is a subclass of [`Americans`](https://www.wikidata.org/wiki/Q2384959), which in turn is a subclass of [`inhabitant`](https://www.wikidata.org/wiki/Q22947), which itself is a subclass of [`human`](https://www.wikidata.org/wiki/Q5). 

This implies that anything that is an instance of `indigenous peoples of the Americas` is also an instance of `human`.

A second example: [`Louise Victoria, Princess of Grão-Pará`](https://www.wikidata.org/wiki/Q51883211):

In [205]:
%%bash

rg "Q26513" /home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/PER.jsonl | head -n 2 | tail -n1 | jq

{
  "wikidata_id": "Q51883211",
  "name": "Louise Victoria, Princess of Grão-Pará",
  "type": "PER",
  "instance_of": [
    "Q26513"
  ]
}


is an instance of [`human fetus`](https://www.wikidata.org/wiki/Q26513).

[`human fetus`](https://www.wikidata.org/wiki/Q26513) is an instance of [`prenate`](https://www.wikidata.org/wiki/Q63177820) which is a subclass of [`human`](https://www.wikidata.org/wiki/Q5).

Thus, `Louise Victoria, Princess of Grão-Pará` is PER.

Third example: [`Marcus Horatius`](https://www.wikidata.org/wiki/Q55463614)

In [208]:
%%bash

rg "Q75855169" /home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/PER.jsonl | head -n 1 | jq

{
  "wikidata_id": "Q55463614",
  "name": "Marcus Horatius",
  "type": "PER",
  "instance_of": [
    "Q75855169"
  ]
}


is an instance of [`hypothetical person (Q75855169)`](https://www.wikidata.org/wiki/Q75855169) which is a subclass of `human` and thus `Marcus Horatius` is also a PER.

### LOC

In [216]:
%%bash
wc -l /home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/LOC.jsonl

3078459 /home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/LOC.jsonl


In [213]:
loc_entities_jsonl_path = Path("/home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/LOC.jsonl")
subclass_of_geographic_region = get_subclasses_of_item("Q82794")

In [222]:
why_loc = {}
with open(loc_entities_jsonl_path, 'r', encoding="utf-8") as fin:
    look_up_these_loc = set()
    for ix, line in track(enumerate(fin), total=3078459, description="Checking what makes LOC entities LOC..."):
        row = orjson.loads(line)
        wid = row['wikidata_id']
        name = row['name']
        instance_of = set(row['instance_of'])
        if "Q82794" in instance_of:
            continue
        common = instance_of.intersection(subclass_of_geographic_region)
        if len(common) == 0:
            look_up_these_org.update(inst_of)
            why_loc[wid] = inst_of
        else:
            look_up_these_loc.update(common)
            why_loc[wid] = common
            a
# Make the instance-of names human readable
looked_up_loc = {
    r['id']: r['name']
    if r['name'] 
    else grab_first_available(r['labels']) 
    for r in coll.find(
        {"id": {"$in": list(look_up_these_loc)}}
    )
}

why_loc_human_readable = {}
for wid, inst_of_set in track(why_loc.items(), description="Making the IDs human readable..."):
    why_loc_human_readable[wid] = {
        (qid, looked_up_loc.get(qid))
        for qid in inst_of_set
    }

Output()

Output()

In [223]:
why_loc_counts = Counter()
for _, inst_of in why_loc_human_readable.items():
    string_labels =  " & ".join([f"{name} ({qid})" or qid for qid, name in inst_of])
    why_loc_counts[string_labels] += 1

In [224]:
why_loc_counts.most_common()

[('river (Q4022)', 273845),
 ("None (('Q99880596', None))", 217650),
 ('village (Q532)', 124843),
 ('stream (Q47521)', 103941),
 ('lake (Q23397)', 95057),
 ('island (Q23442)', 86427),
 ('watercourse (Q355304)', 79873),
 ('village in India (Q56436498)', 71708),
 ('park (Q22698)', 70824),
 ('valley (Q39816)', 70018),
 ('townland (Q2151232)', 57931),
 ('protected area (Q473972)', 53469),
 ('natural watercourse (Q55659167)', 53350),
 ('spring (Q124714)', 50063),
 ('village of Poland (Q3558970)', 49645),
 ('commune of France (Q484170)', 39855),
 ('canal (Q12284)', 37270),
 ('wadi (Q187971)', 35350),
 ('fourth-level administrative division in Indonesia (Q2225692) & village (Q532)',
  33927),
 ('cemetery (Q39614)', 26815),
 ('desa (Q26211545)', 25079),
 ('ward (Q1195098)', 24995),
 ('village of Ukraine (Q21672098)', 24982),
 ('college (Q189004)', 24016),
 ('None (Q104093746)', 23327),
 ('Ortsteil (Q253019)', 23046),
 ('archaeological site (Q839954)', 22993),
 ('bay (Q39594)', 22532),
 ('boden

Some examples:

TODO

### ORG

In [226]:
%%bash
wc -l /home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/ORG.jsonl

2196303 /home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/ORG.jsonl


In [225]:
org_entities_jsonl_path = Path("/home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/ORG.jsonl")
subclass_of_organization = get_subclasses_of_item("Q43229")

In [None]:
why_org = {}
with open(org_entities_jsonl_path, 'r', encoding="utf-8") as fin:
    look_up_these_org = set()
    for ix, line in track(enumerate(fin), total=2196302, description="Checking what makes ORG entities ORG..."):
        row = orjson.loads(line)
        wid = row['wikidata_id']
        name = row['name']
        instance_of = set(row['instance_of'])
        if "Q43229" in instance_of:
            continue
        common = instance_of.intersection(subclass_of_geographic_region)
        why_org_value = common if len(common) > 0 else inst_of
        look_up_these_org.update(why_org_value)
        why_org[wid] = why_org_value
# Make the instance-of names human readable
looked_up_org = {
    r['id']: r['name']
    if r['name'] 
    else grab_first_available(r['labels']) 
    for r in coll.find(
        {"id": {"$in": list(look_up_these_org)}}
    )
}

why_org_human_readable = {}
for wid, inst_of_set in track(why_org.items(), description="Making the IDs human readable..."):
    why_org_human_readable[wid] = {
        (qid, looked_up_org.get(qid))
        for qid in inst_of_set
    }

Output()

In [None]:
why_loc_counts = Counter()
for _, inst_of in why_loc_human_readable.items():
    string_labels =  " & ".join([f"{name} ({qid})" or qid for qid, name in inst_of])
    why_loc_counts[string_labels] += 1