In [1]:
from pathlib import Path
from rich import print
import pandas as pd

### How many entities <u>more</u> do we detect by using subclassing?

In [2]:
%%bash
wc -l /home/jonne/datasets/paranames/analyze-instance-ofs-with*subclassing-3col/PER.jsonl

   8726412 /home/jonne/datasets/paranames/analyze-instance-ofs-withoutsubclassing-3col/PER.jsonl
   8730734 /home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/PER.jsonl
  17457146 total


In [3]:
%%bash
wc -l /home/jonne/datasets/paranames/analyze-instance-ofs-with*subclassing-3col/LOC.jsonl

     2923 /home/jonne/datasets/paranames/analyze-instance-ofs-withoutsubclassing-3col/LOC.jsonl
  3078459 /home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/LOC.jsonl
  3081382 total


In [4]:
%%bash
wc -l /home/jonne/datasets/paranames/analyze-instance-ofs-with*subclassing-3col/ORG.jsonl

    58753 /home/jonne/datasets/paranames/analyze-instance-ofs-withoutsubclassing-3col/ORG.jsonl
  2196303 /home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/ORG.jsonl
  2255056 total


In [5]:
print(f"[PER] Difference = {8730734 - 8726412}")
print(f"[LOC] Difference = {3078459 - 2923}")
print(f"[ORG] Difference = {2196303 - 58753}")

### PER

First let's create a set of all entities that are a subclass of `Q5`: 

In [6]:
from qwikidata.sparql import get_subclasses_of_item

In [7]:
subclass_of_human = set(get_subclasses_of_item('Q5', return_qids=True))

Then let's loop through *all* entities categorized as `PER` (but not an instance of `Q5`) and see what makes them `PER`. 

In [8]:
import orjson
from rich.progress import track
from collections import Counter

In [9]:
import pymongo
client = pymongo.MongoClient(port=27617)
db = client['paranames_db_022822']
coll = db['paranames']

In [10]:
per_entities_jsonl_path = Path("/home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/PER.jsonl")

In [11]:
%%bash
wc -l /home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/PER.jsonl

8730734 /home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/PER.jsonl


In [12]:
def grab_first_available(label_dict):
    if not label_dict:
        return ''
    first_lang = [l for l in label_dict].pop(0)
    return label_dict[first_lang]

In [13]:
why_per = {}
with open(per_entities_jsonl_path, 'r', encoding="utf-8") as fin:
    look_up_these = set()
    for ix, line in track(enumerate(fin), total=8730733, description="Checking what makes PER entities PER..."):
        row = orjson.loads(line)
        wid = row['wikidata_id']
        name = row['name']
        instance_of = set(row['instance_of'])
        if "Q5" in instance_of:
            continue
        common = instance_of.intersection(subclass_of_human)
        if len(common) == 0:
            print(f"[{wid}] Entity '{name}' is not an instance of Q5 or any of its subclasses.")
        else:
            look_up_these.update(common)
            why_per[wid] = common
            
# Make the instance-of names human readable
looked_up_names = {
    r['id']: r['name']
    if r['name'] 
    else grab_first_available(r['labels']) 
    for r in coll.find(
        {"id": {"$in": list(look_up_these)}}
    )
}

why_per_human_readable = {}
for wid, inst_of_set in track(why_per.items(), description="Making the IDs human readable..."):
    why_per_human_readable[wid] = {
        (qid, looked_up_names.get(qid))
        for qid in inst_of_set
    }

Output()

Output()

OK, every PER entity is an instance of something that is a subclass of Q5. 

Now we can tally why the PER entities that were only detected with subclasses were categorized as such:

In [14]:
why_per_counts = Counter()
for _, inst_of in why_per_human_readable.items():
    string_labels =  " & ".join([f"{name} ({qid})" or qid for qid, name in inst_of])
    why_per_counts[string_labels] += 1

In [15]:
why_per_counts.most_common()

[('federally recognized Native American tribe in the United States (Q7840353)',
  164),
 ('human fetus (Q26513)', 68),
 ('hypothetical person (Q75855169)', 51),
 ('Police and Crime Commissioner (Q58333)', 39),
 ('minister (Q83307)', 36),
 ('inhabitant (Q22947)', 32),
 ('Permanent Secretary (Q57901836)', 28),
 ('fictional human formerly considered to be historical (Q64520857)', 26),
 ('Director of Bureau (Q23931359)', 25),
 ('ambassador to the United States of America (Q19359052)', 24),
 ('Native Americans in the United States (Q49297)', 24),
 ('ombudsman (Q169180)', 23),
 ('civil servant (Q212238)', 23),
 ('indigenous peoples of the Americas (Q36747)', 22),
 ('passenger (Q319604)', 20),
 ("ambassador to the People's Republic of China (Q30064335)", 19),
 ('prisoner (Q1862087) & prisoner of a Nazi concentration camp (Q1719325)',
  16),
 ('board member (Q2824523)', 16),
 ('Native American tribe (Q12885585)', 13),
 ('ambassador of the United States of America (Q15726790)', 13),
 ('judge (Q

### Where do these weird inheritance hierarchies come from?

This will be infeasible to analyze for everything but here are a few examples:

For example, `Quechan people` is an instance of `indigenous peoples of the Americas ` (Q36747).

In [16]:
%%bash

rg "Q36747" /home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/PER.jsonl | head -n 1 | jq

{
  "wikidata_id": "Q1754503",
  "name": "Quechan people",
  "type": "PER",
  "instance_of": [
    "Q7840353",
    "Q36747",
    "Q133311"
  ]
}


We can check what it's an instance of with human-readable names:

In [17]:
why_per_human_readable["Q1754503"]

{('Q36747', 'indigenous peoples of the Americas'),
 ('Q7840353',
  'federally recognized Native American tribe in the United States')}

Looking at [`indigenous peoples of the Americas`](https://www.wikidata.org/wiki/Q36747), we can see that it is a subclass of [`Americans`](https://www.wikidata.org/wiki/Q2384959), which in turn is a subclass of [`inhabitant`](https://www.wikidata.org/wiki/Q22947), which itself is a subclass of [`human`](https://www.wikidata.org/wiki/Q5). 

This implies that anything that is an instance of `indigenous peoples of the Americas` is also an instance of `human`.

A second example: [`Louise Victoria, Princess of Grão-Pará`](https://www.wikidata.org/wiki/Q51883211):

In [18]:
%%bash

rg "Q26513" /home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/PER.jsonl | head -n 2 | tail -n1 | jq

{
  "wikidata_id": "Q51883211",
  "name": "Louise Victoria, Princess of Grão-Pará",
  "type": "PER",
  "instance_of": [
    "Q26513"
  ]
}


is an instance of [`human fetus`](https://www.wikidata.org/wiki/Q26513).

[`human fetus`](https://www.wikidata.org/wiki/Q26513) is an instance of [`prenate`](https://www.wikidata.org/wiki/Q63177820) which is a subclass of [`human`](https://www.wikidata.org/wiki/Q5).

Thus, `Louise Victoria, Princess of Grão-Pará` is PER.

Third example: [`Marcus Horatius`](https://www.wikidata.org/wiki/Q55463614)

In [19]:
%%bash

rg "Q75855169" /home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/PER.jsonl | head -n 1 | jq

{
  "wikidata_id": "Q55463614",
  "name": "Marcus Horatius",
  "type": "PER",
  "instance_of": [
    "Q75855169"
  ]
}


is an instance of [`hypothetical person (Q75855169)`](https://www.wikidata.org/wiki/Q75855169) which is a subclass of `human` and thus `Marcus Horatius` is also a PER.

### LOC

In [20]:
%%bash
wc -l /home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/LOC.jsonl

3078459 /home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/LOC.jsonl


In [21]:
loc_entities_jsonl_path = Path("/home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/LOC.jsonl")
subclass_of_geographic_region = get_subclasses_of_item("Q82794")

In [22]:
why_loc = {}
with open(loc_entities_jsonl_path, 'r', encoding="utf-8") as fin:
    look_up_these_loc = set()
    for ix, line in track(enumerate(fin), total=3078459, description="Checking what makes LOC entities LOC..."):
        row = orjson.loads(line)
        wid = row['wikidata_id']
        name = row['name']
        instance_of = set(row['instance_of'])
        if "Q82794" in instance_of:
            continue
        common = instance_of.intersection(subclass_of_geographic_region)
        why_loc_value = common if len(common) > 0 else instance_of
        assert all(type(thing) == str for thing in why_loc_value), row
        look_up_these_loc.update(why_loc_value)
        why_loc[wid] = why_loc_value

Output()

In [23]:
# Make the instance-of names human readable
looked_up_loc = {
    r['id']: r['name']
    if r['name'] 
    else grab_first_available(r['labels']) 
    for r in coll.find(
        {"id": {"$in": list(look_up_these_loc)}}
    )
}

In [24]:
looked_up_loc

{'Q10007123': 'Griqua state',
 'Q1000858': 'region of the Faroe Islands',
 'Q100154387': 'national military park',
 'Q100205512': 'administrative territorial entity of ancient Rome',
 'Q100222740': 'National Battlefield Park',
 'Q100252623': 'pakihi',
 'Q100268926': 'iberian settlement',
 'Q1002812': 'metropolitan borough',
 'Q100319996': 'resource conservation district',
 'Q1003207': 'photovoltaic power station',
 'Q1004435': 'athletics track',
 'Q1004887': 'thermal bath',
 'Q100503226': 'town of the United Kingdom',
 'Q100590008': 'notary district',
 'Q1005901': 'federal level',
 'Q100606393': 'Union of States',
 'Q1006357': 'hanging valley',
 'Q100649': 'blackwater river',
 'Q1006733': 'grassland',
 'Q1006835': 'Main Street',
 'Q1006876': 'borough in the United Kingdom',
 'Q100689072': 'quarter of La Spezia',
 'Q100701580': 'quarter of Cagliari',
 'Q100705184': 'Quarter of Catania',
 'Q1007870': 'art gallery',
 'Q100900880': 'crater lake',
 'Q1009249': 'national waterways in Germany

In [25]:
why_loc_human_readable = {}
for wid, inst_of_set in track(why_loc.items(), description="Making the IDs human readable..."):
    why_loc_human_readable[wid] = {
        (qid, looked_up_loc.get(qid))
        for qid in inst_of_set
    }

Output()

In [26]:
why_loc_counts = Counter()
for _, inst_of in why_loc_human_readable.items():
    string_labels =  " & ".join([f"{name} ({qid})" if name else qid for qid, name in inst_of])
    why_loc_counts[string_labels] += 1

In [27]:
why_loc_counts.most_common()

[('river (Q4022)', 273845),
 ('village (Q532)', 124843),
 ('stream (Q47521)', 103941),
 ('lake (Q23397)', 95057),
 ('island (Q23442)', 86427),
 ('watercourse (Q355304)', 79873),
 ('village in India (Q56436498)', 71708),
 ('park (Q22698)', 70824),
 ('valley (Q39816)', 70018),
 ('townland (Q2151232)', 57931),
 ('protected area (Q473972)', 53469),
 ('natural watercourse (Q55659167)', 53350),
 ('spring (Q124714)', 50063),
 ('village of Poland (Q3558970)', 49645),
 ('commune of France (Q484170)', 39855),
 ('canal (Q12284)', 37270),
 ('wadi (Q187971)', 35350),
 ('point (Q24529780)', 34000),
 ('village (Q532) & fourth-level administrative division in Indonesia (Q2225692)',
  33927),
 ('movie theater (Q41253)', 29663),
 ('cemetery (Q39614)', 26815),
 ('desa (Q26211545)', 25079),
 ('ward (Q1195098)', 24995),
 ('village of Ukraine (Q21672098)', 24982),
 ('college (Q189004)', 24016),
 ('Q104093746', 23327),
 ('Ortsteil (Q253019)', 23046),
 ('archaeological site (Q839954)', 22993),
 ('bay (Q39594)

### Examples of inheritance hierarchy

#### River

In [36]:
%%bash
rg "Q4022" /home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/LOC.jsonl | head -n 1 | jq

{
  "wikidata_id": "Q584",
  "name": "Rhine",
  "type": "LOC",
  "instance_of": [
    "Q1009249",
    "Q573344",
    "Q4022",
    "Q1267889"
  ]
}


In [37]:
why_loc_human_readable["Q584"]

{('Q1009249', 'national waterways in Germany'),
 ('Q1267889', 'waterway'),
 ('Q4022', 'river'),
 ('Q573344', 'main stream')}

[`Rhine`](https://www.wikidata.org/wiki/Q584) is an instance of [`waterway`](https://www.wikidata.org/wiki/Q1267889) which is a subclass of [`body of water`](https://www.wikidata.org/wiki/Q15324) which is a subclass of [`geographic region`](https://www.wikidata.org/wiki/Q82794). Thus `Rhine` is LOC.

#### Anglican diocese

In [38]:
%%bash
rg "Q18917976" /home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/LOC.jsonl | head -n 1 | jq

{
  "wikidata_id": "Q375891",
  "name": "Diocese of Southwell and Nottingham",
  "type": "LOC",
  "instance_of": [
    "Q18917976"
  ]
}


In [39]:
why_loc_human_readable["Q375891"]

{('Q18917976', 'Anglican diocese')}

[`Diocese of Southwell and Nottingham`](https://www.wikidata.org/wiki/Q375891) is an instance of [`Anglican diocese`](https://www.wikidata.org/wiki/Q18917976) which is a subclass of [`diocese`](https://www.wikidata.org/wiki/Q665487). Diocese is a subclass of [`religious administrative territorial entity`](https://www.wikidata.org/wiki/Q20926517) which is a subclass of [`human-geographic territorial entity`](https://www.wikidata.org/wiki/Q15642541) which is a subclass of [`territory`](https://www.wikidata.org/wiki/Q4835091) which is a subclass of [`geographic region`](https://www.wikidata.org/wiki/Q82794).

### ORG

In [29]:
%%bash
wc -l /home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/ORG.jsonl

2196303 /home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/ORG.jsonl


In [30]:
org_entities_jsonl_path = Path("/home/jonne/datasets/paranames/analyze-instance-ofs-withsubclassing-3col/ORG.jsonl")
subclass_of_organization = get_subclasses_of_item("Q43229")

In [31]:
why_org = {}
with open(org_entities_jsonl_path, 'r', encoding="utf-8") as fin:
    look_up_these_org = set()
    for ix, line in track(enumerate(fin), total=2196302, description="Checking what makes ORG entities ORG..."):
        row = orjson.loads(line)
        wid = row['wikidata_id']
        name = row['name']
        instance_of = set(row['instance_of'])
        if "Q43229" in instance_of:
            continue
        common = instance_of.intersection(subclass_of_geographic_region)
        why_org_value = common if len(common) > 0 else instance_of
        assert all(type(thing) == str for thing in why_org_value), row
        look_up_these_org.update(why_org_value)
        why_org[wid] = why_org_value

Output()

In [32]:
# Make the instance-of names human readable
looked_up_org = {
    r['id']: r['name']
    if r['name'] 
    else grab_first_available(r['labels']) 
    for r in coll.find(
        {"id": {"$in": list(look_up_these_org)}}
    )
}



In [33]:
why_org_human_readable = {}
for wid, inst_of_set in track(why_org.items(), description="Making the IDs human readable..."):
    why_org_human_readable[wid] = {
        (qid, looked_up_org.get(qid))
        for qid in inst_of_set
    }

Output()

In [34]:
why_org_counts = Counter()
for _, inst_of in why_org_human_readable.items():
    string_labels =  " & ".join([f"{name} ({qid})" or qid for qid, name in inst_of])
    why_org_counts[string_labels] += 1

In [35]:
why_org_counts.most_common()

[('clinical trial (Q30612)', 341753),
 ('business (Q4830453)', 101372),
 ('musical group (Q215380)', 71787),
 ('rural school (Q19855165) & co-educational school (Q67383935) & primary school (Q9842)',
  53663),
 ('primary school (Q9842)', 49341),
 ('commune of France (Q484170)', 39855),
 ('business (Q4830453) & None (Q6881511)', 29996),
 ('newspaper (Q11032)', 29836),
 ('State Bank of India branch (Q65954115)', 25261),
 ("Government Boys' Primary School (Q67015940)", 25058),
 ('association football club (Q476028)', 24459),
 ('kindergarten (Q126807)', 24186),
 ('college (Q189004)', 23997),
 ('radio station (Q14350)', 22823),
 ('high school (Q9826)', 20351),
 ('school (Q3914)', 19785),
 ('None (Q33506)', 16895),
 ('nonprofit organization (Q163740)', 15434),
 ('hotel (Q27686)', 15317),
 ('political party (Q7278)', 14411),
 ('Gram panchayat (Q2732840)', 14019),
 ('academic department (Q2467461)', 13258),
 ('military unit (Q176799)', 12949),
 ('dissolved municipality of Japan (Q18663566)', 1