<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#All-wikidata-properties" data-toc-modified-id="All-wikidata-properties-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>All wikidata properties</a></span></li><li><span><a href="#Filter-properties" data-toc-modified-id="Filter-properties-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Filter properties</a></span></li><li><span><a href="#Get-wikidata-ID" data-toc-modified-id="Get-wikidata-ID-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Get wikidata ID</a></span></li><li><span><a href="#Get-properties" data-toc-modified-id="Get-properties-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Get properties</a></span></li></ul></div>

# All wikidata properties

In [1]:
## wikidata property lookup table 

# pip install sparqlwrapper
# https://rdflib.github.io/sparqlwrapper/

import sys
from SPARQLWrapper import SPARQLWrapper, JSON

endpoint_url = "https://query.wikidata.org/sparql"

query = """#All properties with descriptions and aliases and types
SELECT ?property ?propertyTypeLabel ?propertyLabel ?propertyDescription ?propertyAltLabel WHERE {
  ?property wikibase:propertyType ?propertyType.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  
}
ORDER BY (xsd:integer(STRAFTER(STR(?property), "P")))"""


def get_results(endpoint_url, query):
    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    # TODO adjust user agent; see https://w.wiki/CX6
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()

In [2]:
from pandas.io.json import json_normalize

results = get_results(endpoint_url, query)
all_wikidata_properties = json_normalize(results['results']['bindings'])

# get results 
all_wikidata_properties = all_wikidata_properties[['property.value',
                                                   'propertyTypeLabel.value',
                                                   'propertyLabel.value',
                                                   'propertyDescription.value',
                                                   'propertyAltLabel.value']]

all_wikidata_properties.columns = ['property_ID', 'DataType', 'property_name','description','examples']

# change formats
all_wikidata_properties['property_ID'] = all_wikidata_properties['property_ID'].str[31:]
all_wikidata_properties['DataType'] = all_wikidata_properties['DataType'][0][26:]

all_wikidata_properties.head()

Unnamed: 0,property_ID,DataType,property_name,description,examples
0,P6,WikibaseItem,head of government,"head of the executive power of this town, city...","president, chancellor, mayor, prime minister, ..."
1,P10,WikibaseItem,video,"relevant video. For images, use the property P...","animation, media, gif, trailer (Commons)"
2,P14,WikibaseItem,traffic sign,"graphic symbol describing the item, used at th...","highway shield, shield, highway marker, motorw..."
3,P15,WikibaseItem,route map,image of route map at Wikimedia Commons,"schema, highway map, map of route, metro map, ..."
4,P16,WikibaseItem,highway system,system (or specific country specific road type...,"transport network, network of routes, part of ..."


In [3]:
len(all_wikidata_properties)

8799

So as we can see, wikidata has totally 8798 types of properties/relations.

# Filter properties

There are too many perperties, but we are going to only keep those whose datatype is WikibaseItem, and also the properties that are useful for us to find the hyponyms, hypernyms, synonyms etc.

https://www.wikidata.org/wiki/Wikidata:Database_reports/List_of_properties/Top100

In [24]:
top100 = ['cites work',
        'series ordinal',
        'author name string',
        'instance of',
        'stated in',
        'retrieved',
        'PubMed ID',
        'reference URL',
        'publication date',
        'title',
        'published in',
        'page(s)',
        'volume',
        'apparent magnitude',
        'astronomical filter',
        'issue',
        'catalog code',
        'DOI',
        'catalog',
        'author',
        'language of work or name',
        'main subject',
        'country',
        'PMCID',
        'of',
        'located in the administrative territorial entity',
        'proper motion',
        'point in time',
        'determination method',
        'stated as',
        'coordinate location',
        'occupation',
        'SIMBAD ID',
        'right ascension',
        'declination',
        'epoch',
        'found in taxon',
        'sex or gender',
        'constellation',
        'start time',
        'Google Knowledge Graph ID',
        'VIAF ID',
        'given name',
        'parallax',
        'date of birth',
        'ortholog',
        'Wikimedia import URL',
        'radial velocity',
        'ResearchGate publication ID',
        'Freebase ID',
        'imported from Wikimedia project',
        'country of citizenship',
        'named as',
        'part of',
        'image',
        'GeoNames ID',
        'end time',
        'distance from Earth',
        'based on heuristic',
        'chromosome',
        'exact match',
        'family name',
        'Commons category',
        'subclass of',
        'Entrez Gene ID',
        'parent taxon',
        'taxon rank',
        'taxon name',
        'GNS Unique Feature ID',
        'place of birth',
        'Elo rating',
        'collection',
        'described by source',
        'date of death',
        'UniProt protein ID',
        'inception',
        'ORCID iD',
        'GBIF taxon ID',
        'location',
        'educated at',
        'category combines topics',
        'applies to jurisdiction',
        'languages spoken, written or signed',
        'heritage designation',
        'located in time zone',
        'GND ID',
        'postal code',
        'WorldCat Identities ID',
        'sport',
        'follows',
        'followed by',
        'has part',
        'population',
        'The Peerage person ID',
        'curator',
        'Dimensions Publication ID',
        'member of sports team',
        'employer',
        'genomic assembly',
        'official name']

In [25]:
len(top100)

100

In [34]:
import pandas as pd 
pd.set_option('max_row',1000)


useful_properties = all_wikidata_properties[all_wikidata_properties.property_name.isin(top100)]
useful_properties = useful_properties[useful_properties.DataType == 'WikibaseItem']

useful_properties

Unnamed: 0,property_ID,DataType,property_name,description,examples
5,P17,WikibaseItem,country,sovereign state of this item (not to be used f...,"state, land, sovereign state, host country"
6,P18,WikibaseItem,image,image of relevant illustration of the subject;...,"graph, illustration, screen capture, screensho..."
7,P19,WikibaseItem,place of birth,most specific known (e.g. city instead of coun...,"birth location, birthplace, location of birth,..."
9,P21,WikibaseItem,sex or gender,sex or gender identity of human or animal. For...,"intersex, male, man, woman, sex, gender, femal..."
13,P27,WikibaseItem,country of citizenship,the object is a country that recognizes the su...,"citizenship, (legal) nationality, citizen of, ..."
15,P31,WikibaseItem,instance of,that class of which this subject is a particul...,"member of, type, is a, is a type of, type of, ..."
24,P50,WikibaseItem,author,main creator(s) of a written work (use on work...,"writer, authors, creator, written by, by, writ..."
27,P54,WikibaseItem,member of sports team,sports teams or clubs that the subject current...,"team, sport team, player of, club played for, ..."
30,P59,WikibaseItem,constellation,the area of the celestial sphere of which the ...,part of constellation
34,P69,WikibaseItem,educated at,educational institution attended by subject,"faculty, education, alma mater, alumna of, alu..."


In [35]:
to_remove = ['country', 
             'image',
             'place of birth',
             'sex or gender',
             'country of citizenship',
             'author',
             'member of sports team',
             'constellation',
             'educated at',
             'taxon rank',
             'occupation',
             'employer',
             'located in the administrative territorial entity',
             'imported from Wikimedia project',
             'collection',
             'VIAF ID',
             'GND ID',
             'stated in',
             'location',
             'postal code',
             'page(s)',
             'Entrez Gene ID',
             'UniProt protein ID',
             'DOI',
             'language of work or name',
             'located in time zone',
             'issue',
             'determination method',
             
             
            
            ]

100

# Get wikidata ID

In [81]:
TERM_LIST = '../01_make_matching_list/matching_list.csv'
WIKIDATA_DICT = './wikidata_id.csv'

In [82]:
matching_list = pd.read_csv(TERM_LIST,dtype={'wiki_title': str})

pagetitle_qid_dict = pd.read_csv(WIKIDATA_DICT, delimiter='\t', header=None, index_col=0).to_dict()[1]
matching_list['QID'] = matching_list['term'].map(pagetitle_qid_dict)

In [87]:
matching_list.QID.notna()

0        False
1        False
2        False
3        False
4        False
         ...  
40131    False
40132    False
40133    False
40134    False
40135    False
Name: QID, Length: 40136, dtype: bool

In [84]:
terms = matching_list[matching_list.wiki_title.notna()].wiki_title.values
terms

array(['Water', 'Primary alcohol', 'Alcohol', ..., 'Hip', 'Cell membrane',
       'PGS'], dtype=object)

In [64]:
import requests 
import urllib
import re 
from bs4 import BeautifulSoup

def get_wikidata_id(term):
    
    encoded_term = urllib.parse.quote(term)

    url = f"https://en.wikipedia.org/w/api.php?action=query&format=json&prop=pageprops&ppprop=wikibase_item&redirects=1&titles={encoded_term}"
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser').get_text()
    
    wikidata_id = re.findall('wikibase_item\"\:\"(.*)?\".*', soup)
    
    if wikidata_id != []:
        return wikidata_id[0]

In [65]:
# find wikidata ID for all terms
from tqdm import tqdm

wikidata_ids = []
not_found = []
for term in tqdm(terms):
    wikidata_id = get_wikidata_id(term)
    if wikidata_id != None: 
        wikidata_ids.append(wikidata_id)
    else:
        not_found.append(term)
        
        
        
# add wikititle for each term
for term in tqdm(matching_list[matching_list.wiki_title.isna()].term.values):
    wiki_title = find_wiki_title(term)
    if wiki_title:
        wiki_title_dict.update({term:wiki_title})

100%|██████████| 8573/8573 [36:41<00:00,  3.89it/s]  


In [60]:
not_found

['Ω−ωf']

# Get properties 

In [42]:
def retrieve_value_P(P):
    for i in range(len(all_wikidata_properties)):
        if P == all_wikidata_properties['property_ID'][i]:
            return all_wikidata_properties['property_name'][i]


def retrieve_value_Q(Q):    
    url = f"https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&props=labels&languages=en&ids={Q}"
    json_response = requests.get(url).json()
    entities = json_response.get('entities')
    
    entity = entities.get(Q)
    if entity:
        labels = entity.get('labels')
        if labels:
            en = labels.get('en')
            if en:
                value = en.get('value')
                return value 


def get_all_ItemProperties(wikidata_item, wikidata_id):
    url = "https://www.wikidata.org/w/api.php?action=wbgetclaims&format=json&entity="+wikidata_id
    json_response = requests.get(url).json()

    properties = [*json_response.get('claims').values()]
    
    res = np.empty(shape=[0, 3])
    
    for p in properties:   
        for d in p: 
            dict_ = d['mainsnak']
        
            # ignore if not a wikibase item
            if dict_['datatype'] != 'wikibase-item' or dict_['snaktype'] != 'value':
                continue 
                
            # replace all the wikidataItem ID by wikidataItem name          
            property_value = retrieve_value_Q(dict_['datavalue']['value']['id'])
            
            if property_value is None:
                continue
            
            # find property value in the previous lookup table
            property_ = retrieve_value_P(dict_['property'])
            
            row_to_append = [wikidata_item, property_, property_value]
  
            res = np.append(res, [row_to_append], 0)  
  
    return res 

In [66]:
# remove the entities without wikidata ID
pagetitle = [term for term in terms if term not in not_found]

# conbine the entities and their corresponding ID as dictionary
dict_pagetitle_qid = dict(zip(pagetitle, wikidata_ids))

In [67]:
dict_pagetitle_qid

{'Water': 'Q283',
 'Primary alcohol': 'Q2832210',
 'Alcohol': 'Q156',
 'Molecule': 'Q11369',
 'Absorption': 'Q224058',
 'Statistical mechanics': 'Q188715',
 'Phase transition': 'Q185357',
 'Partition function': 'Q1182682',
 'Spin': 'Q229949',
 'Aluminium': 'Q663',
 'Copper': 'Q753',
 'Shot (pellet)': 'Q278938',
 'Inorganic waste': 'Q98079057',
 'Masonry': 'Q272999',
 'Concrete': 'Q22657',
 'Particle': 'Q1621273',
 'LES': 'Q297648',
 'Personal digital assistant': 'Q162768',
 'Fluid': 'Q102205',
 'DEM (disambiguation)': 'Q351071',
 'Gas': 'Q11432',
 'D2': 'Q232274',
 'INS': 'Q195126',
 'H2': 'Q249260',
 'Neutron spectroscopy': 'Q7003117',
 'MOF': 'Q406030',
 'Weak interaction': 'Q11418',
 'Liquid': 'Q11435',
 'Grain': 'Q2995529',
 'Oxygen': 'Q629',
 'Ion': 'Q36496',
 'Grain growth': 'Q1492573',
 'Variational principle': 'Q745215',
 'Approach': 'Q4781711',
 'Action (physics)': 'Q846785',
 'Lagrangian': 'Q6472692',
 'Hamiltonian mechanics': 'Q477921',
 'Pendulum': 'Q20702',
 'Planet': 'Q63

In [68]:
# check disambiguation page
def is_disambiguation_page(wikidata_id):
    
    url = "https://www.wikidata.org/wiki/" + wikidata_id
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    
    div = soup.find("div", {"class": "wikibase-entitytermsview-heading-description"}).text
    
    return div == 'Wikimedia disambiguation page'

In [71]:
# check disambiguation page from list entities 
to_remove = []

for term, wikidata_id in tqdm(dict_pagetitle_qid.items()):

    if is_disambiguation_page(wikidata_id):
        to_remove.append(term)

100%|██████████| 6306/6306 [1:17:15<00:00,  1.36it/s]


In [72]:
to_remove

['Absorption',
 'Partition function',
 'Spin',
 'LES',
 'DEM (disambiguation)',
 'D2',
 'INS',
 'H2',
 'MOF',
 'Approach',
 'Lagrangian',
 'Framework',
 'N2',
 'CO',
 'NH 3',
 'SO2',
 'Doping',
 'Reactor',
 'Pipe',
 'RCS',
 'Forcing',
 'Waves',
 'Distribution',
 'Channel',
 'Fragmentation',
 'CDS',
 'Gan',
 'Cell',
 'Benchmark',
 'Hamiltonian',
 'CI',
 'TI',
 'Matrix',
 'Vessel',
 'Cooling system',
 'Saft',
 'Segment',
 'Radical',
 'RM',
 'Flow',
 'P-box',
 'Carbon unit',
 'GFR',
 'XC',
 'DFT',
 'Cladding',
 'DHC',
 'Cis',
 'Query',
 'Model',
 'Pore',
 'Cellular',
 'Crack',
 'QED',
 'Comparative analysis',
 'FM',
 'Fe',
 'Node',
 'DG',
 'Susy',
 'TIS',
 'TM',
 'Ingress',
 'Hooks',
 'Churn',
 'Wasi',
 'Sea bass',
 'BB',
 'Rationalization',
 'Supply',
 'Feature',
 'Method',
 'Ejection',
 'CNTS',
 'Bulk',
 'Weakening',
 'Trapped',
 'Ferrite',
 'Evac',
 'He',
 'SM',
 'Higgs',
 'Reaction',
 'Transience',
 'Mo',
 'Replacement',
 'Compound',
 'CA',
 'SR',
 'BA',
 'GA',
 'GP',
 'Object',
 'Lat

In [73]:
len(to_remove)/len(terms)

0.1231774174734632

In [74]:
dict_pagetitle_qid['German']

'Q348514'

In [78]:
# remove disambiguation page from list entities
for key in to_remove:
    dict_pagetitle_qid.pop(key)

In [80]:
pd.DataFrame([(k,v) for k,v in dict_pagetitle_qid.items()]).to_csv(WIKIDATA_DICT, header=None, index=False, sep='\t')