In [1]:
# Get wikidata taxonomy properties

import bioregistry
import requests
import pandas as pd
from bioregistry.utils import norm
from bs4 import BeautifulSoup

# Lift pandas' row-display limit so the dataframes shown below are not truncated.
pd.set_option("display.max_rows", None)

Extract a list of properties from the Wikidata [Taxonomy Properties](https://www.wikidata.org/wiki/Template:Taxonomy_properties) page.

In [2]:
URL = "https://www.wikidata.org/wiki/Template:Taxonomy_properties"

# Download the template page. The timeout stops the cell from hanging forever on
# a stalled connection, and raise_for_status() makes an HTTP error fail right
# here instead of letting an error page be parsed and break later cells.
response = requests.get(URL, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, "html.parser")

In [3]:
# Inverse mapping for the "wikidata" registry, used below for membership tests on
# Wikidata property IDs (presumably property ID -> Bioregistry prefix; TODO confirm).
wikidata_to_bioregistry = bioregistry.get_registry_invmap("wikidata")

In [4]:
# Positions of the rows to harvest within the first table on the page.
row_indexes = [
    9,  # databases
    11,  # references
]

# All <tr> elements of the first table on the page.
table_rows = soup.find("table").find_all("tr")

# Unique (property ID, label) pairs taken from the first link of every list
# item in the selected rows; identical pairs collapse because this is a set.
property_pairs = {
    (anchor.attrs["href"].removeprefix("/wiki/Property_talk:"), anchor.text)
    for row_index in row_indexes
    for bullet_list in table_rows[row_index].find_all("ul")
    for item in bullet_list.find_all("li")
    for anchor in (item.find("a"),)
}

# Order by the integer that follows the first character of the ID (e.g. P685 -> 685).
properties = sorted(property_pairs, key=lambda pair: int(pair[0][1:]))

Prepare a dataframe for further investigation.

In [5]:
def propose_prefix(label: str) -> str:
    """Derive a candidate prefix from a property label using simple string rules.

    The label is lowercased, a trailing " id" and then a trailing " code" are
    removed, and a trailing " author" or " taxon" is rewritten as a
    ".author" / ".taxon" subspace.

    :param label: The human-readable label of the property.
    :returns: The candidate prefix, or an empty string when the lowercased
        label contains a period or when the result still contains a space
        or a hyphen.
    """
    text = label.lower()
    if "." in text:
        return ""

    # Strip the generic trailing words, in this order.
    for tail in (" id", " code"):
        text = text.removesuffix(tail)

    # Turn a trailing qualifier word into a dotted subspace.
    for qualifier in ("author", "taxon"):
        spaced = f" {qualifier}"
        if text.endswith(spaced):
            text = f"{text.removesuffix(spaced)}.{qualifier}"

    # Anything still containing a space or hyphen is too complex for this heuristic.
    if any(separator in text for separator in (" ", "-")):
        return ""
    return text
    
    
def proposal_conflict(proposal: str) -> bool:
    """Report whether a proposed prefix is recognized by ``bioregistry.normalize_prefix``.

    :param proposal: The proposed prefix; may be an empty string.
    :returns: ``False`` for an empty proposal (without consulting the
        Bioregistry); otherwise ``True`` exactly when
        ``bioregistry.normalize_prefix`` returns a truthy value for it.
    """
    return bool(proposal) and bool(bioregistry.normalize_prefix(proposal))

# One row per Wikidata property, annotated with:
# - in_bioregistry: whether the property ID is a key of wikidata_to_bioregistry
# - proposed_prefix: the prefix suggested by propose_prefix for the label
# - proposed_prefix_conflict: the result of proposal_conflict for that prefix
df = pd.DataFrame(properties, columns=["property", "label"]).assign(
    in_bioregistry=lambda frame: frame["property"].map(
        lambda property_id: property_id in wikidata_to_bioregistry
    ),
    proposed_prefix=lambda frame: frame["label"].map(propose_prefix),
    proposed_prefix_conflict=lambda frame: frame["proposed_prefix"].map(proposal_conflict),
)

The following Wikidata properties are already mapped in the Bioregistry to a prefix.

In [6]:
# Rows whose in_bioregistry flag is set.
df.loc[df["in_bioregistry"], ["property", "label"]]

Unnamed: 0,property,label
2,P685,NCBI taxonomy ID
4,P815,ITIS TSN
5,P830,Encyclopedia of Life ID
8,P846,GBIF taxon ID
10,P938,FishBase species ID


The following Wikidata properties are not mapped to the Bioregistry, but can be lexically mapped based on the label.

In [7]:
# Rows not flagged in_bioregistry whose proposed prefix is flagged by proposal_conflict.
unmapped_with_conflict = ~df["in_bioregistry"] & df["proposed_prefix_conflict"]
df.loc[unmapped_with_conflict, ["property", "label", "proposed_prefix"]]

Unnamed: 0,property,label,proposed_prefix
26,P1832,GrassBase ID,grassbase
46,P2946,BacDive ID,bacdive
97,P5299,AntWeb ID,antweb


The following Wikidata properties are not mapped to the Bioregistry, and could be assigned a prefix by simple string operations on the label.

In [8]:
# Rows not flagged in_bioregistry, with a non-empty proposed prefix that
# proposal_conflict did not flag.
unmapped = ~df["in_bioregistry"]
no_conflict = ~df["proposed_prefix_conflict"]
has_proposal = df["proposed_prefix"] != ""
df.loc[unmapped & no_conflict & has_proposal, ["property", "label", "proposed_prefix"]]

Unnamed: 0,property,label,proposed_prefix
0,P586,IPNI author ID,ipni.author
1,P627,IUCN taxon ID,iucn.taxon
6,P838,BioLib taxon ID,biolib.taxon
11,P959,MSW ID,msw
12,P960,Tropicos ID,tropicos
21,P1745,VASCAN ID,vascan
28,P1939,Dyntaxa ID,dyntaxa
31,P1992,Plazi ID,plazi
32,P2006,ZooBank author ID,zoobank.author
35,P2026,Avibase ID,avibase


The remaining Wikidata properties cannot be mapped to the Bioregistry based on their property identifier, and they have complicated labels that would require more careful assignment of prefixes.

In [9]:
# Rows not flagged in_bioregistry for which propose_prefix returned an empty
# string and proposal_conflict is False.
unmapped = ~df["in_bioregistry"]
no_conflict = ~df["proposed_prefix_conflict"]
lacks_proposal = df["proposed_prefix"] == ""
df.loc[unmapped & no_conflict & lacks_proposal, ["property", "label"]]

Unnamed: 0,property,label
3,P687,BHL page ID
7,P842,Fossilworks ID for this taxon
9,P850,WoRMS-ID for taxa
13,P961,IPNI plant ID
14,P962,MycoBank taxon name ID
15,P1070,PlantList-ID
16,P1076,ICTV virus ID
17,P1348,AlgaeBase URL
18,P1391,Index Fungorum ID
19,P1421,GRIN URL
