In [1]:
# Get wikidata taxonomy properties

import bioregistry
import bioregistry.version
import requests
import pandas as pd
from bioregistry.utils import norm, query_wikidata
from bs4 import BeautifulSoup

pd.set_option("display.max_rows", None)

In [2]:
bioregistry.version.get_version()

'0.5.47-dev'

# Property and Metadata Acquisition

Extract a list of properties from the Wikidata [Taxonomy Properties](https://www.wikidata.org/wiki/Template:Taxonomy_properties) page.

In [3]:
URL = "https://www.wikidata.org/wiki/Template:Taxonomy_properties"

soup = BeautifulSoup(requests.get(URL).text, "html.parser")

In [4]:
wikidata_to_bioregistry = bioregistry.get_registry_invmap("wikidata")

In [5]:
row_indexes = [
    9,  # databases
    11,  # references
]


def property_key(t):
    return int(t[1:])


properties = sorted(
    {
        li.find("a").attrs["href"].removeprefix("/wiki/Property_talk:")
        for row_index in row_indexes
        for ul in soup.find("table").find_all("tr")[row_index].find_all("ul")
        for li in ul.find_all("li")
    },
    key=property_key,
)

In [6]:
values = " ".join(f"wd:{prop}" for prop in properties)

sparql = (
    """\
SELECT
    ?property
    ?propertyLabel 
    ?propertyDescription
    (SAMPLE(?homepage) as ?homepage_sample)
    (SAMPLE(?format) as ?format_sample) 
    ?pattern
    (SAMPLE(?database) as ?database_sample)
    (SAMPLE(?qvalue) as ?example_sample)
WHERE {
    VALUES ?property { %s }
    OPTIONAL { ?property wdt:P1896 ?homepage } .
    OPTIONAL { ?property wdt:P1793 ?pattern }  .
    OPTIONAL { ?property wdt:P1630 ?format } .
    OPTIONAL { ?property wdt:P1629 ?database } .
    OPTIONAL { 
      ?property p:P1855 ?statement .
      ?statement ps:P1855 ?example .
      OPTIONAL { 
        ?statement ?qprop ?qvalue . 
        FILTER(STRSTARTS(STR(?qprop), "http://www.wikidata.org/prop/qualifier/")) .
        FILTER(STRENDS(STR(?qprop), SUBSTR(STR(?property), 32))) .
      }
  }
    SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
GROUP BY ?property ?propertyLabel ?propertyDescription ?pattern
"""
    % values
)


def query_wikidata_wrapper(sparql):
    rv = []
    for result in query_wikidata(sparql):
        rv.append({key: value["value"] for key, value in result.items()})
    return rv


results = query_wikidata_wrapper(sparql)
df = pd.DataFrame(results)
df["property"] = df["property"].map(lambda s: s.removeprefix("http://www.wikidata.org/entity/"))
df = df.sort_values("property", key=lambda s: s.map(property_key))

Prepare a dataframe for further investigation.

In [7]:
def propose_prefix(label: str) -> str:
    label = label.lower()
    if "." in label:
        return ""
    label = label.removesuffix(" id")
    label = label.removesuffix(" code")
    for suffix in ["author", "taxon", "publication", "journal", "plant"]:
        if label.endswith(f" {suffix}"):
            label = label[: -len(suffix) - 1] + f".{suffix}"
    for suffix in [".net", ".com"]:
        label = label.removesuffix(suffix)
    if " " in label or "-" in label:
        return ""
    return label


def proposal_conflict(proposal: str):
    if not proposal:
        return None
    return bioregistry.normalize_prefix(proposal)


df["bioregistry_prefix"] = df["property"].map(wikidata_to_bioregistry)
df["proposed_prefix"] = df["propertyLabel"].map(propose_prefix)
df["proposed_prefix_conflict"] = df["proposed_prefix"].map(proposal_conflict)
df

Unnamed: 0,property,propertyLabel,propertyDescription,pattern,format_sample,database_sample,example_sample,homepage_sample,bioregistry_prefix,proposed_prefix,proposed_prefix_conflict
155,P586,IPNI author ID,numerical identifier for a person in the Inter...,"[1-9][0-9]{0,7}-[0-9]",https://www.ipni.org/a/$1,http://www.wikidata.org/entity/Q922063,12653-1,,,ipni.author,
65,P627,IUCN taxon ID,identifier for a taxon in the International Un...,"[1-9]\d{0,8}",https://apiv3.iucnredlist.org/api/v3/taxonredi...,http://www.wikidata.org/entity/Q48268,6736,,,iucn.taxon,
186,P685,NCBI taxonomy ID,identifer for a taxon in the Taxonomy Database...,"[1-9][0-9]{0,6}",https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/...,http://www.wikidata.org/entity/Q13711410,http://purl.uniprot.org/taxonomy/9986,http://www.ncbi.nlm.nih.gov/taxonomy,ncbitaxon,,
0,P687,BHL page ID,identifier in the Biodiversity Heritage Librar...,[1-9]\d*,https://biodiversitylibrary.org/page/$1,http://www.wikidata.org/entity/Q172266,53742531,,,,
187,P815,ITIS TSN,identifier for a taxon in the Integrated Taxon...,"[1-9]\d{1,6}",https://www.itis.gov/servlet/SingleRpt/SingleR...,http://www.wikidata.org/entity/Q82575,160330,https://www.itis.gov,itis,,
1,P830,Encyclopedia of Life ID,eol.org item reference number,"[1-9]\d{0,7}",https://eol.org/zh-TW/pages/$1,http://www.wikidata.org/entity/Q82486,110747,http://eol.org/api/docs/pages,eolife,,
15,P838,BioLib taxon ID,identifier for a taxon in the biological encyc...,"\d{1,7}",https://www.biolib.cz/en/taxon/id$1,http://www.wikidata.org/entity/Q12345690,138591,,,biolib.taxon,
66,P842,Fossilworks taxon ID,"identifier for an animal, plant, or microorgan...","([1-9]\d{0,5})",http://www.fossilworks.org/cgi-bin/bridge.pl?a...,http://www.wikidata.org/entity/Q796451,40565,http://www.fossilworks.org,fossilworks.taxon,fossilworks.taxon,fossilworks.taxon
67,P846,GBIF taxon ID,taxon identifier in GBIF,"([1-9]\d{0,8}|)",https://www.gbif.org/species/$1,http://www.wikidata.org/entity/Q1531570,3239598,,gbif,gbif.taxon,
156,P850,WoRMS-ID for taxa,identifier in the World Register of Marine Spe...,"[1-9]\d{0,6}",https://www.marinespecies.org/aphia.php?p=taxd...,http://www.wikidata.org/entity/Q604063,urn:lsid:marinespecies.org:taxname:145548,http://www.marinespecies.org/aphia.php?p=search,worms,,


# Triage for the Bioregistry

In [8]:
# focus on high-quality entries (e.g., have a homepage, pattern, and format string)
df_hq = (
    df[df.homepage_sample.notna() & df.pattern.notna() & df.format_sample.notna()]
    .copy()
    .reset_index()
)
del df_hq["index"]

## Already in Bioregistry

The following Wikidata properties are already mapped in the Bioregistry to a prefix.

In [9]:
df_hq.loc[df_hq.bioregistry_prefix.notna(), ["property", "propertyLabel", "bioregistry_prefix"]]

Unnamed: 0,property,propertyLabel,bioregistry_prefix
0,P685,NCBI taxonomy ID,ncbitaxon
1,P815,ITIS TSN,itis
2,P830,Encyclopedia of Life ID,eolife
3,P842,Fossilworks taxon ID,fossilworks.taxon
4,P850,WoRMS-ID for taxa,worms
14,P1832,GrassBase ID,grassbase
30,P3088,Catalogue of Life in Taiwan ID,col.taiwan
52,P5221,Tree of Life Web Project ID,tol
54,P5299,AntWeb ID,antweb
75,P6049,NOAA Fisheries Species Directory ID,noaa


## Need Manual Curation in Bioregistry

The following Wikidata properties are not mapped to the Bioregistry, but can lexically mapped based on the label.

In [10]:
df_hq.loc[
    df_hq.proposed_prefix_conflict.notna(),
    ["property", "propertyLabel", "proposed_prefix_conflict"],
]

Unnamed: 0,property,propertyLabel,proposed_prefix_conflict
3,P842,Fossilworks taxon ID,fossilworks.taxon
14,P1832,GrassBase ID,grassbase
54,P5299,AntWeb ID,antweb


## Ready For Automated Ingestion

The following Wikidata properties are not mapped to the Bioregistry, and could be assigned a prefix by simple string operations on the label.

In [11]:
df_hq[
    (df_hq.bioregistry_prefix.isna())
    & (df_hq.proposed_prefix_conflict.isna())
    & (df_hq.proposed_prefix != "")
][["property", "propertyLabel", "propertyDescription", "proposed_prefix"]]

Unnamed: 0,property,propertyLabel,propertyDescription,proposed_prefix
5,P959,MSW ID,identifier from Mammal Species of the World d...,msw
6,P961,IPNI plant ID,numerical identifier for a plant name in the I...,ipni.plant
9,P1745,VASCAN ID,identifier for a taxon in the Database of Vasc...,vascan
16,P1992,Plazi ID,identifier for a taxon treatment at Plazi.org,plazi
17,P2006,ZooBank author ID,identifier for an author at ZooBank,zoobank.author
18,P2007,ZooBank publication ID,identifier for a publication at ZooBank,zoobank.publication
19,P2008,IPNI publication ID,identifier for a publication in the Internatio...,ipni.publication
20,P2026,Avibase ID,"identifier for a species, subspecies, or genus...",avibase
27,P2833,ARKive ID,"identifier for a taxon, in the ARKive database",arkive
28,P3060,ButMoth ID,identifier for a butterfly or moth genus in th...,butmoth


## Need Manually Assigned Prefixes

The remaining Wikidata properties can not be mapped to the Bioregistry based on property and have complicated names that would require more careful assignment of prefixes.

In [12]:
df_hq[
    (df_hq.bioregistry_prefix.isna())
    & (df_hq.proposed_prefix_conflict.isna())
    & (df_hq.proposed_prefix == "")
][["property", "propertyLabel", "propertyDescription"]]

Unnamed: 0,property,propertyLabel,propertyDescription
7,P1070,PlantList-ID,identifier in 'The Plant List' database
8,P1727,Flora of North America taxon ID,identifier for a taxon in the Flora of North A...
10,P1746,ZooBank ID for name or act,identifier for a name or nomenclatural act at ...
11,P1747,Flora of China ID,identifier for a taxon in Flora of China (Engl...
12,P1761,Watson & Dallwitz family ID,familyID in Watson & Dallwitz: The families of...
13,P1772,USDA PLANTS ID,identifier in the United States Department of ...
15,P1895,Fauna Europaea ID,identifier for a taxon in Fauna Europaea
21,P2036,African Plant Database ID,"identifier for a plant taxon, in the Conservat..."
22,P2040,CITES Species+ ID,identifier for a taxon in the Species+ databas...
23,P2426,Xeno-canto species ID,identifier of a bird species in the Xeno-canto...
