In [1]:
# Get wikidata taxonomy properties

import bioregistry
import requests
import pandas as pd
from bioregistry.utils import norm
from bs4 import BeautifulSoup

# Lift the row limit on rendered frames: the cells below display filtered
# frames directly, and None means they are shown in full rather than truncated.
pd.set_option("display.max_rows", None)

Extract a list of properties from the Wikidata [Taxonomy Properties](https://www.wikidata.org/wiki/Template:Taxonomy_properties) page.

In [2]:
URL = "https://www.wikidata.org/wiki/Template:Taxonomy_properties"

# requests applies no timeout by default and does not raise on HTTP error
# statuses, so set a timeout and check the status explicitly. Otherwise a
# stalled connection hangs the notebook and an error page would be parsed
# as if it were the template.
response = requests.get(URL, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, "html.parser")

In [None]:
# Inverse mapping for the "wikidata" metaprefix. Below it is only used for
# membership tests against Wikidata property identifiers.
# NOTE(review): presumably keyed by Wikidata property ID with Bioregistry
# prefixes as values -- confirm against the bioregistry documentation.
wikidata_to_bioregistry = bioregistry.get_registry_invmap("wikidata")

In [None]:
# Positions of the rows of interest within the first table on the page.
# NOTE: these are positional, so they are tied to the template's current layout.
row_indexes = [
    9,  # databases
    11,  # references
]

# Collect the table rows once instead of re-searching the document for every
# row index.
table_rows = soup.find("table").find_all("tr")

# Build a de-duplicated, sorted list of (property identifier, label) pairs
# from the first link of every list item in the selected rows.
properties = sorted(
    {
        (
            anchor.attrs["href"].removeprefix("/wiki/Property_talk:"),
            anchor.text,
        )
        for row_index in row_indexes
        for ul in table_rows[row_index].find_all("ul")
        for li in ul.find_all("li")
        for anchor in [li.find("a")]  # look the link up once per list item
    },
    # Sort numerically on the identifier with its first character dropped
    # (presumably the leading "P"). The label is a tie-breaker so the order
    # never depends on set iteration order.
    key=lambda t: (int(t[0][1:]), t[1]),
)

Prepare a dataframe for further investigation.

In [None]:
def propose_prefix(label: str) -> str:
    """Derive a candidate prefix from a property label using string rules only.

    The label is lower-cased. A trailing " id" and then a trailing " code" are
    dropped, and a trailing " author" or " taxon" word is turned into a
    ".author" / ".taxon" suffix.

    :param label: The human-readable label of the property.
    :return: The candidate prefix, or an empty string when the label contains
        a period, or when the result still contains a space or a hyphen.
    """
    candidate = label.lower()
    if candidate.find(".") != -1:
        return ""

    # Order matters: " id" is removed before " code".
    for trailer in (" id", " code"):
        candidate = candidate.removesuffix(trailer)

    for qualifier in ("author", "taxon"):
        marker = f" {qualifier}"
        if candidate.endswith(marker):
            candidate = f"{candidate.removesuffix(marker)}.{qualifier}"

    # Anything that still has a space or hyphen is too complex to propose.
    has_separator = any(ch in candidate for ch in " -")
    return "" if has_separator else candidate
    
    
def proposal_conflict(proposal: str) -> bool:
    """Report whether a proposed prefix is already recognized by the Bioregistry.

    :param proposal: A candidate prefix; may be an empty string.
    :return: ``False`` for an empty proposal. Otherwise ``True`` exactly when
        ``bioregistry.normalize_prefix`` returns a truthy value for it.
    """
    # Short-circuits on an empty proposal, so the registry is not consulted.
    return bool(proposal and bioregistry.normalize_prefix(proposal))

# One row per scraped property, annotated with: whether the property is already
# a key of the Bioregistry inverse mapping, the prefix proposed from its label,
# and whether that proposal collides with a prefix the Bioregistry recognizes.
# Later keyword arguments of ``assign`` may refer to columns created by earlier ones.
df = pd.DataFrame(properties, columns=["property", "label"]).assign(
    in_bioregistry=lambda frame: frame["property"].map(
        wikidata_to_bioregistry.__contains__
    ),
    proposed_prefix=lambda frame: frame["label"].map(propose_prefix),
    proposed_prefix_conflict=lambda frame: frame["proposed_prefix"].map(
        proposal_conflict
    ),
)

The following Wikidata properties are already mapped in the Bioregistry to a prefix.

In [None]:
# Rows whose property is already present in the Bioregistry inverse mapping.
already_mapped = df["in_bioregistry"]
df.loc[already_mapped, ["property", "label"]]

The following Wikidata properties are not mapped to the Bioregistry, but can be lexically mapped to an existing prefix based on the label.

In [None]:
# Unmapped properties whose proposed prefix is one the Bioregistry recognizes.
not_mapped = ~df["in_bioregistry"]
matches_known_prefix = df["proposed_prefix_conflict"]
df.loc[
    not_mapped & matches_known_prefix,
    ["property", "label", "proposed_prefix"],
]

The following Wikidata properties are not mapped to the Bioregistry, and could be assigned a prefix by simple string operations on the label.

In [None]:
# Unmapped properties with a non-empty proposed prefix that the Bioregistry
# does not already recognize.
not_mapped_no_conflict = ~df["in_bioregistry"] & ~df["proposed_prefix_conflict"]
has_proposal = df["proposed_prefix"] != ""
df.loc[
    not_mapped_no_conflict & has_proposal,
    ["property", "label", "proposed_prefix"],
]

The remaining Wikidata properties cannot be mapped to the Bioregistry based on the property identifier, and have complicated names that would require more careful assignment of prefixes.

In [None]:
# Unmapped properties for which no prefix could be proposed from the label.
not_mapped_no_conflict = ~df["in_bioregistry"] & ~df["proposed_prefix_conflict"]
lacks_proposal = df["proposed_prefix"] == ""
df.loc[
    not_mapped_no_conflict & lacks_proposal,
    ["property", "label"],
]