In [1]:
# Get wikidata taxonomy properties

import bioregistry
import requests
import pandas as pd
from bioregistry.utils import norm, query_wikidata
from bs4 import BeautifulSoup

# Lift the row limit so the tables displayed below are shown in full rather than truncated
pd.set_option("display.max_rows", None)

Extract a list of properties from the Wikidata [Taxonomy Properties](https://www.wikidata.org/wiki/Template:Taxonomy_properties) page.

In [2]:
URL = "https://www.wikidata.org/wiki/Template:Taxonomy_properties"

# Fetch the template page. The timeout keeps the notebook from hanging forever
# on a stalled connection, and raise_for_status() stops here on an HTTP error
# so that an error page is never parsed as if it were the property table.
response = requests.get(URL, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, "html.parser")

In [3]:
# Lookup table applied to the Wikidata property IDs further down to find the
# Bioregistry prefix each property is already mapped to.
wikidata_to_bioregistry = bioregistry.get_registry_invmap("wikidata")

In [4]:
# Rows of the template table whose property links are harvested
row_indexes = [
    9,  # databases
    11,  # references
]

# Collect the property ID behind every list-item link in the selected rows;
# a set removes properties that are linked more than once.
table_rows = soup.find("table").find_all("tr")
property_ids = set()
for row_index in row_indexes:
    for ul in table_rows[row_index].find_all("ul"):
        for li in ul.find_all("li"):
            href = li.find("a").attrs["href"]
            property_ids.add(href.removeprefix("/wiki/Property_talk:"))

# Order by the number that follows the leading letter of each ID
properties = sorted(property_ids, key=lambda pid: int(pid[1:]))

In [5]:
# Space-separated, "wd:"-prefixed property IDs for the query's VALUES clause
values = " ".join(map("wd:{}".format, properties))

# SPARQL query text; `values` is substituted into the VALUES clause with
# %-formatting, restricting the query to the scraped properties.
#
# For each property it asks for the label and, where available (every extra
# field is OPTIONAL), the values bound to ?homepage, ?pattern, ?format and
# ?database, plus an example: the qualifier value on a P1855 statement whose
# qualifier property ends with the property's own ID. SUBSTR(..., 32) skips
# the 31-character "http://www.wikidata.org/entity/" prefix to get that ID.
#
# SAMPLE() keeps one value per group for the aggregated fields. ?pattern is
# part of GROUP BY instead, so a property with several patterns produces
# several rows.
sparql = """\
SELECT
    ?property
    ?propertyLabel 
    (SAMPLE(?homepage) as ?homepage_sample)
    (SAMPLE(?format) as ?format_sample) 
    ?pattern
    (SAMPLE(?database) as ?database_sample)
    (SAMPLE(?qvalue) as ?example_sample)
WHERE {
    VALUES ?property { %s }
    OPTIONAL { ?property wdt:P1896 ?homepage } .
    OPTIONAL { ?property wdt:P1793 ?pattern }  .
    OPTIONAL { ?property wdt:P1630 ?format } .
    OPTIONAL { ?property wdt:P1629 ?database } .
    OPTIONAL { 
      ?property p:P1855 ?statement .
      ?statement ps:P1855 ?example .
      OPTIONAL { 
        ?statement ?qprop ?qvalue . 
        FILTER(STRSTARTS(STR(?qprop), "http://www.wikidata.org/prop/qualifier/")) .
        FILTER(STRENDS(STR(?qprop), SUBSTR(STR(?property), 32))) .
      }
  }
    SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
GROUP BY ?property ?propertyLabel ?pattern
""" % values

def query_wikidata_wrapper(sparql):
    """Run ``sparql`` through ``query_wikidata`` and flatten each result row.

    Every binding in a row is replaced by its ``"value"`` entry, so each row
    becomes a plain ``{variable name: value}`` dictionary.

    :param sparql: Query text, passed unchanged to ``query_wikidata``.
    :return: A list with one flattened dictionary per result row.
    """
    return [
        {name: binding["value"] for name, binding in row.items()}
        for row in query_wikidata(sparql)
    ]

results = query_wikidata_wrapper(sparql)
df = pd.DataFrame(results)
# Keep only the bare identifier of each property by dropping the entity URI prefix
df["property"] = [
    uri.removeprefix("http://www.wikidata.org/entity/")
    for uri in df["property"]
]

Prepare a dataframe for further investigation.

In [6]:
def propose_prefix(label: str) -> str:
    """Derive a candidate prefix from a property label using simple string rules.

    The label is lower-cased, a trailing " id" and then " code" are dropped,
    and a trailing ".net" or ".com" is removed. A trailing entity word
    (author, taxon, publication, journal, plant) is turned into a dotted
    suffix, e.g. "ZooBank author ID" becomes "zoobank.author".

    :param label: The property label, e.g. "Plantarium ID".
    :return: The proposed prefix, or "" when no simple proposal can be made
        (the label is missing, still contains a dot after the domain
        extension is removed, or still contains a space or hyphen).
    """
    # Missing labels (None/NaN) cannot be turned into a proposal
    if not isinstance(label, str):
        return ""
    label = label.lower()
    label = label.removesuffix(" id")
    label = label.removesuffix(" code")
    # Remove a trailing domain extension *before* rejecting dotted labels.
    # Doing it after the dot check made this step unreachable.
    for suffix in [".net", ".com"]:
        label = label.removesuffix(suffix)
    # Any remaining dot means the label is too irregular for an automatic proposal
    if "." in label:
        return ""
    for suffix in ["author", "taxon", "publication", "journal", "plant"]:
        if label.endswith(f" {suffix}"):
            # Replace the separating space with a dot: "x author" -> "x.author"
            label = label[:-len(suffix) - 1] + f".{suffix}"
    # Multi-word or hyphenated names need a manually chosen prefix
    if " " in label or "-" in label:
        return ""
    return label
    
    
def proposal_conflict(proposal: str):
    """Look up a proposed prefix in the Bioregistry.

    :param proposal: A proposed prefix; may be empty when none could be proposed.
    :return: ``None`` for an empty proposal, otherwise whatever
        ``bioregistry.normalize_prefix`` returns for it. The rest of the
        notebook treats a non-missing result as a clash with an existing prefix.
    """
    return bioregistry.normalize_prefix(proposal) if proposal else None


# Prefix proposed from each label ("" when no simple proposal is possible)
suggested = df["propertyLabel"].map(propose_prefix)

# Bioregistry prefix already mapped to each property (missing when unmapped)
df["bioregistry_prefix"] = df["property"].map(wikidata_to_bioregistry)
df["proposed_prefix"] = suggested
# Result of checking each proposal against the Bioregistry
df["proposed_prefix_conflict"] = suggested.map(proposal_conflict)

In [7]:
# focus on high-quality entries: keep only the rows that have a homepage,
# a pattern, and a format string, then renumber the rows from zero
df_hq = df.dropna(
    subset=["homepage_sample", "pattern", "format_sample"]
).reset_index(drop=True)

## Already in Bioregistry

The following Wikidata properties are already mapped in the Bioregistry to a prefix.

In [8]:
# Rows that already have a Bioregistry prefix, reduced to the identifying columns
df_hq.dropna(subset=["bioregistry_prefix"])[
    ["property", "propertyLabel", "bioregistry_prefix"]
]

Unnamed: 0,property,propertyLabel,bioregistry_prefix
28,P830,Encyclopedia of Life ID,eolife
129,P685,NCBI taxonomy ID,ncbitaxon
130,P815,ITIS TSN,itis


## Need Manual Curation in Bioregistry

The following Wikidata properties are not mapped to the Bioregistry, but can be lexically mapped based on the label.

In [9]:
# Properties with no Bioregistry mapping whose proposed prefix matches an
# existing Bioregistry prefix. The isna() condition keeps properties that are
# already mapped out of this list, matching the filters of the later sections.
df_hq.loc[
    df_hq.bioregistry_prefix.isna()
    & df_hq.proposed_prefix_conflict.notna(),
    ["property", "propertyLabel", "proposed_prefix_conflict"],
]

Unnamed: 0,property,propertyLabel,proposed_prefix_conflict
44,P5299,AntWeb ID,antweb
108,P1832,GrassBase ID,grassbase


## Ready For Automated Ingestion

The following Wikidata properties are not mapped to the Bioregistry, and could be assigned a prefix by simple string operations on the label.

In [10]:
# Unmapped properties with a usable proposal that does not clash with an existing prefix
df_hq.loc[
    df_hq["bioregistry_prefix"].isna()
    & df_hq["proposed_prefix_conflict"].isna()
    & df_hq["proposed_prefix"].ne("")
]

Unnamed: 0,property,propertyLabel,pattern,homepage_sample,format_sample,database_sample,example_sample,bioregistry_prefix,proposed_prefix,proposed_prefix_conflict
0,P2006,ZooBank author ID,[A-Z0-9]{8}(-[A-Z0-9]{4}){3}-[A-Z0-9]{12},http://zoobank.org/,http://zoobank.org/Authors/$1,http://www.wikidata.org/entity/Q8074026,945480F8-C4E7-41F4-A637-7F43CCF84D40,,zoobank.author,
2,P3102,Plantarium ID,\d+,http://www.plantarium.ru/,http://www.plantarium.ru/page/view/item/$1.html,http://www.wikidata.org/entity/Q59786454,33541,,plantarium,
3,P4125,Titan ID,\d+,http://titan.gbif.fr/,http://titan.gbif.fr/sel_genann1.php?numero=$1,http://www.wikidata.org/entity/Q59801025,4552,,titan,
4,P4728,uBio ID,[1-9]\d*,http://www.ubio.org/index.php?pagename=namebank,http://www.ubio.org/browser/details.php?nameba...,http://www.wikidata.org/entity/Q3551271,4003479,,ubio,
5,P4758,MONA ID,"\d{2,5}(.\d)?",http://mothphotographersgroup.msstate.edu/Chec...,http://mothphotographersgroup.msstate.edu/spec...,http://www.wikidata.org/entity/Q59786087,5546,,mona,
12,P9889,NZTCS ID,"[1-9]\d{3,7}",https://nztcs.org.nz/,https://nztcs.org.nz/nztcs-species/$1,http://www.wikidata.org/entity/Q108424413,11974,,nztcs,
20,P6163,NAS ID,[1-9]\d*,https://nas.er.usgs.gov/queries/FactSheetList....,https://nas.er.usgs.gov/queries/FactSheet.aspx...,http://www.wikidata.org/entity/Q58786227,1269,,nas,
29,P961,IPNI plant ID,"[1-9]\d{0,7}-[123]",http://www.ipni.org/,https://www.ipni.org/n/$1,http://www.wikidata.org/entity/Q922063,http://ipni.org/urn:lsid:ipni.org:names:323572-2,,ipni.plant,
31,P3064,LepIndex ID,"[1-9]\d{0,6}",http://www.nhm.ac.uk/jdsml/research-curation/r...,https://www.nhm.ac.uk/our-science/data/lepinde...,http://www.wikidata.org/entity/Q7736786,4433,,lepindex,
32,P3101,FloraBase ID,\d+,http://florabase.dec.wa.gov.au,https://florabase.dpaw.wa.gov.au/browse/profil...,http://www.wikidata.org/entity/Q5460267,913,,florabase,


## Need Manually Assigned Prefixes

The remaining Wikidata properties cannot be mapped to the Bioregistry based on the property and have complicated names that would require more careful assignment of prefixes.

In [11]:
# Unmapped properties for which no prefix could be proposed from the label
df_hq.loc[
    df_hq["bioregistry_prefix"].isna()
    & df_hq["proposed_prefix_conflict"].isna()
    & df_hq["proposed_prefix"].eq("")
]

Unnamed: 0,property,propertyLabel,pattern,homepage_sample,format_sample,database_sample,example_sample,bioregistry_prefix,proposed_prefix,proposed_prefix_conflict
1,P2794,Index Hepaticarum ID,\d+,http://www.ville-ge.ch/musinfo/bd/cjb/hepatic/...,http://www.ville-ge.ch/musinfo/bd/cjb/hepatic/...,http://www.wikidata.org/entity/Q23937518,30665,,,
6,P5003,Amphibians of India ID,[0-9]+,http://www.indianamphibians.org/,http://www.indianamphibians.org/#!/sp/$1,http://www.wikidata.org/entity/Q59784234,402,,,
7,P6347,The White-files species ID,[1-9]\d*,https://www.hemiptera-databases.org/whiteflies...,https://www.hemiptera-databases.org/whiteflies...,http://www.wikidata.org/entity/Q60332065,1124,,,
8,P6376,Psyl'list species ID,[1-9]\d*,https://www.hemiptera-databases.org/psyllist/?...,https://www.hemiptera-databases.org/psyllist?d...,http://www.wikidata.org/entity/Q60535830,1930,,,
9,P6487,Illustrated catalog of Tessaratomidae species ID,[1-9]\d*,https://www.hemiptera-databases.org/cgi-bin/Te...,https://www.hemiptera-databases.org/cgi-bin/Te...,http://www.wikidata.org/entity/Q61409777,202,,,
10,P7715,World Flora Online ID,^wfo-\d{10}$,http://www.worldfloraonline.org/,http://www.worldfloraonline.org/taxon/$1,http://www.wikidata.org/entity/Q77076820,wfo-0000088131,,,
11,P9799,Palynodata taxa ID,[1-9]\d*,https://paleobotany.ru/palynodata/taxa,https://paleobotany.ru/palynodata/species/$1,http://www.wikidata.org/entity/Q108064445,27059,,,
13,P5354,Amphibian Species of the World ID,"[a-zA-Z]+(\/[a-zA-Z-]+){0,5}",http://research.amnh.org/vz/herpetology/amphib...,https://amphibiansoftheworld.amnh.org/Amphibia/$1,http://www.wikidata.org/entity/Q2844175,Anura/Leptodactylidae/Leiuperinae/Pseudopaludi...,,,
14,P6028,Hypericum MySpecies ID,[1-9]\d*,http://hypericum.myspecies.info/,http://hypericum.myspecies.info/taxonomy/term/$1,http://www.wikidata.org/entity/Q156935,557,,,
15,P6042,Echinoid Directory ID,[1-9]\d*,http://www.nhm.ac.uk/our-science/data/echinoid...,http://www.nhm.ac.uk/our-science/data/echinoid...,http://www.wikidata.org/entity/Q57687518,2670,,,
