<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#All-wikidata-properties" data-toc-modified-id="All-wikidata-properties-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>All wikidata properties</a></span></li><li><span><a href="#Get-wikidata-ID" data-toc-modified-id="Get-wikidata-ID-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Get wikidata ID</a></span></li><li><span><a href="#Get-properties" data-toc-modified-id="Get-properties-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Get properties</a></span></li><li><span><a href="#Filter-properties" data-toc-modified-id="Filter-properties-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Filter properties</a></span></li></ul></div>

# All wikidata properties

In [1]:
## wikidata property lookup table 

# pip install sparqlwrapper
# https://rdflib.github.io/sparqlwrapper/

import sys
from SPARQLWrapper import SPARQLWrapper, JSON

endpoint_url = "https://query.wikidata.org/sparql"

query = """#All properties with descriptions and aliases and types
SELECT ?property ?propertyTypeLabel ?propertyLabel ?propertyDescription ?propertyAltLabel WHERE {
  ?property wikibase:propertyType ?propertyType.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  
}
ORDER BY (xsd:integer(STRAFTER(STR(?property), "P")))"""


def get_results(endpoint_url, query):
    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    # TODO adjust user agent; see https://w.wiki/CX6
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()

In [2]:
from pandas.io.json import json_normalize

results = get_results(endpoint_url, query)
all_wikidata_properties = json_normalize(results['results']['bindings'])

# get results 
all_wikidata_properties = all_wikidata_properties[['property.value',
                                                   'propertyTypeLabel.value',
                                                   'propertyLabel.value',
                                                   'propertyDescription.value',
                                                   'propertyAltLabel.value']]

all_wikidata_properties.columns = ['property_ID', 'DataType', 'property_name','description','examples']

# change formats
all_wikidata_properties['property_ID'] = all_wikidata_properties['property_ID'].str[31:]
all_wikidata_properties['DataType'] = all_wikidata_properties['DataType'][0][26:]

all_wikidata_properties.head()

Unnamed: 0,property_ID,DataType,property_name,description,examples
0,P6,WikibaseItem,head of government,"head of the executive power of this town, city...","president, chancellor, mayor, prime minister, ..."
1,P10,WikibaseItem,video,"relevant video. For images, use the property P...","animation, media, gif, trailer (Commons)"
2,P14,WikibaseItem,traffic sign,"graphic symbol describing the item, used at th...","highway shield, shield, highway marker, motorw..."
3,P15,WikibaseItem,route map,image of route map at Wikimedia Commons,"schema, highway map, map of route, metro map, ..."
4,P16,WikibaseItem,highway system,system (or specific country specific road type...,"transport network, network of routes, part of ..."


In [3]:
len(all_wikidata_properties)

8813

So as we can see, wikidata has totally 8798 types of properties/relations.

# Get wikidata ID

In [103]:
TERM_LIST = '../01_make_matching_list/matching_list.csv'
WIKIDATA_DICT = './wikidata_id.csv'
WIKIDATA_PROPERTIES = './wikidata_properties.txt'

In [6]:
import pandas as pd 

matching_list = pd.read_csv(TERM_LIST,dtype={'wiki_title': str})

pagetitle_qid_dict = pd.read_csv(WIKIDATA_DICT, delimiter='\t', header=None, index_col=0).to_dict()[1]
matching_list['QID'] = matching_list['wiki_title'].map(pagetitle_qid_dict)

In [7]:
matching_list

Unnamed: 0,term,annotation,df,wiki_title,QID
0,aperture z-scan experiments,Process,scienceie,,
1,1560nm femtosecond laser pulses,Material,scienceie,,
2,optical-chopper,Material,scienceie,,
3,vibrational combination states,Process,scienceie,,
4,non-radiative processes,Process,scienceie,,
...,...,...,...,...,...
40131,cd8+,cell_line,jnlpba,,
40132,cd29+ t cells,cell_line,jnlpba,,
40133,synoviocytes,cell_type,jnlpba,,
40134,antigen-processing and antigen-presenting cells,cell_type,jnlpba,,


In [8]:
# target titles that we are going to find their QID
titles = matching_list[matching_list.wiki_title.notna() & matching_list.QID.isna()].wiki_title.values
titles 

array(['Absorption', 'Partition function', 'Spin', ..., 'All', 'Ie',
       'PGS'], dtype=object)

In [9]:
import requests 
import urllib
import re 
from bs4 import BeautifulSoup

def get_wikidata_id(term):
    
    encoded_term = urllib.parse.quote(term)

    url = f"https://en.wikipedia.org/w/api.php?action=query&format=json&prop=pageprops&ppprop=wikibase_item&redirects=1&titles={encoded_term}"
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser').get_text()
    
    wikidata_id = re.findall('wikibase_item\"\:\"(.*)?\".*', soup)
    
    if wikidata_id != []:
        return wikidata_id[0]

In [11]:
# find wikidata ID for all terms
from tqdm import tqdm


not_found = []
for title in tqdm(titles):
    wikidata_id = get_wikidata_id(title)
    if wikidata_id:
        pagetitle_qid_dict.update({title: wikidata_id})
    else:
        not_found.append(title)

100%|██████████| 1314/1314 [06:15<00:00,  3.50it/s]


In [12]:
not_found

['Ω−ωf', 'Value of']

In [18]:
# remove the entities without wikidata ID
titles = [title for title in titles if title not in not_found]

In [19]:
# check disambiguation page
def is_disambiguation_page(wikidata_id):
    
    url = "https://www.wikidata.org/wiki/" + wikidata_id
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    
    div = soup.find("div", {"class": "wikibase-entitytermsview-heading-description"}).text
    
    return div == 'Wikimedia disambiguation page'

In [28]:
QIDs = [pagetitle_qid_dict[title] for title in titles]

In [30]:
# check disambiguation page for list terms
to_remove = []

for title, wikidata_id in tqdm(zip(titles, QIDs)):
    if is_disambiguation_page(wikidata_id):
        to_remove.append(title)

1312it [02:01, 10.83it/s]


In [31]:
to_remove

['Absorption',
 'Partition function',
 'Spin',
 'LES',
 'DEM (disambiguation)',
 'D2',
 'INS',
 'H2',
 'MOF',
 'Approach',
 'Lagrangian',
 'Framework',
 'N2',
 'CO',
 'NH 3',
 'SO2',
 'Doping',
 'Reactor',
 'Pipe',
 'RCS',
 'Pipe',
 'Forcing',
 'Waves',
 'Distribution',
 'Channel',
 'Fragmentation',
 'Framework',
 'CDS',
 'Gan',
 'Cell',
 'Benchmark',
 'Hamiltonian',
 'CI',
 'TI',
 'Matrix',
 'Vessel',
 'Cooling system',
 'Saft',
 'Segment',
 'Radical',
 'RM',
 'Flow',
 'P-box',
 'Flow',
 'Carbon unit',
 'GFR',
 'XC',
 'DFT',
 'Cladding',
 'DHC',
 'Cladding',
 'CI',
 'Cis',
 'Query',
 'Model',
 'Pore',
 'Cellular',
 'Crack',
 'QED',
 'Comparative analysis',
 'FM',
 'Fe',
 'Vessel',
 'Node',
 'DG',
 'Susy',
 'TIS',
 'TM',
 'Ingress',
 'Hooks',
 'Churn',
 'Wasi',
 'Sea bass',
 'BB',
 'Rationalization',
 'Supply',
 'Feature',
 'Method',
 'Ejection',
 'Pore',
 'CNTS',
 'Bulk',
 'Weakening',
 'Trapped',
 'Ferrite',
 'Evac',
 'He',
 'SM',
 'SM',
 'Higgs',
 'Reaction',
 'Transience',
 'Mo',
 

In [33]:
len(to_remove)/len(titles)

1.0

In [None]:
# # remove disambiguation page from list entities
# for key in to_remove:
#     dict_pagetitle_qid.pop(key)

In [35]:
pd.DataFrame([(k,v) for k,v in pagetitle_qid_dict.items()]).to_csv(WIKIDATA_DICT, header=None, index=False, sep='\t')

# Get properties 

In [41]:
import numpy as np

def retrieve_value_P(P):
    for i in range(len(all_wikidata_properties)):
        if P == all_wikidata_properties['property_ID'][i]:
            return all_wikidata_properties['property_name'][i]


def retrieve_value_Q(Q):    
    url = f"https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&props=labels&languages=en&ids={Q}"
    json_response = requests.get(url).json()
    entities = json_response.get('entities')
    
    entity = entities.get(Q)
    if entity:
        labels = entity.get('labels')
        if labels:
            en = labels.get('en')
            if en:
                value = en.get('value')
                return value 


def get_all_ItemProperties(wikidata_item, wikidata_id):
    url = "https://www.wikidata.org/w/api.php?action=wbgetclaims&format=json&entity="+wikidata_id
    json_response = requests.get(url).json()

    properties = [*json_response.get('claims').values()]
    
    res = np.empty(shape=[0, 3])
    
    for p in properties:   
        for d in p: 
            dict_ = d['mainsnak']
        
            # ignore if not a wikibase item
            if dict_['datatype'] != 'wikibase-item' or dict_['snaktype'] != 'value':
                continue 
                
            # replace all the wikidataItem ID by wikidataItem name          
            property_value = retrieve_value_Q(dict_['datavalue']['value']['id'])
            
            if property_value is None:
                continue
            
            # find property value in the previous lookup table
            property_ = retrieve_value_P(dict_['property'])
            
            row_to_append = [wikidata_item, property_, property_value]
  
            res = np.append(res, [row_to_append], 0)  
  
    return res 

In [42]:
get_all_ItemProperties(wikidata_item='algorithm', wikidata_id='Q8366') 

array([['algorithm', 'subclass of', 'procedure'],
       ['algorithm', 'subclass of', 'work'],
       ['algorithm', 'named after', 'Al-Khwarizmi'],
       ['algorithm', 'part of', 'computer science'],
       ['algorithm', 'part of', 'algorithmics'],
       ['algorithm', 'part of', 'mathematics'],
       ['algorithm', "topic's main category", 'Category:Algorithms'],
       ['algorithm', "topic's main template",
        'Template:Infobox algorithm'],
       ['algorithm', 'described by source', 'Ottův slovník naučný'],
       ['algorithm', 'described by source',
        'The Art of Computer Programming, Volume 1: Fundamental Algorithms, 3rd edition'],
       ['algorithm', 'described by source', 'Gujin Tushu Jicheng'],
       ['algorithm', 'on focus list of Wikimedia project',
        'Wikipedia:Vital articles'],
       ['algorithm', 'maintained by WikiProject',
        'WikiProject Computer science'],
       ['algorithm', 'has quality', 'computational complexity'],
       ['algorithm', 'h

In [None]:
# find related pairs for all items in our dictionary
if WIKIDATA_PROPERTIES:
    related_pairs = np.loadtxt('./wikidata_properties.txt', delimiter='\t', dtype='str')
else:
    related_pairs = np.empty(shape=[0, 3])

In [43]:
for item, Qid in tqdm(pagetitle_qid_dict.items()):
    list_to_append = get_all_ItemProperties(item, Qid)
    related_pairs = np.vstack((related_pairs,list_to_append))

100%|██████████| 6306/6306 [6:30:42<00:00,  3.72s/it]     


In [69]:
# check how many of them do have properties

len(np.unique(related_pairs[:,0]))/len(pagetitle_qid_dict.keys())

0.9628924833491912

# Filter properties

In [53]:
# the most frequent wikidata properties for our terms

unique_properties, counts_properties = np.unique(related_pairs[:,1], return_counts=True)

In [64]:
pd.set_option('max_row',1000)

property_df = pd.DataFrame(list(zip(unique_properties, counts_properties)),
                           columns =['property', 'count'])

property_df = property_df.sort_values(by=['count'], ascending=False)
property_df.head(100)

Unnamed: 0,property,count
242,found in taxon,8307
303,instance of,7095
523,subclass of,5538
195,drug used for treatment,3114
412,part of,3062
167,described by source,2500
269,has part,2453
353,medical condition treated,2288
248,genetic association,1925
70,biological process,1875


In [68]:
# filter only the properties that are useful to us 
property_toremove = ['described by source',
                     'country',
                     'winner',
                     'owner of',
                     'contains administrative territorial entity',
                     'discoverer or inventor',
                     'language used',
                     'language of work or name',
                     'participant',
                     'headquarters location',
                     'indexed in bibliographic review',
                     'cast member',
                     'award received',
                     'founded by',
                     'chairperson',
                     'diplomatic relation',
                     'country of origin',
                     'located in the administrative territorial entity',
                     'LiverTox likelihood score',
                     'publisher',
                     'safety classification and labelling',
                     'has works in the collection',
                     'practiced by',
                     'medical examinations',
                     'operating system',
                     'affiliation',
                     'owned by',
                     'programming language',
                     'developer',
                     'developer',
                     'platform',
                     'shares border with',
                     'has edition or translation',
                     'history of topic',
                     'site of astronomical discovery',
                     'copyright license',
                     'afflicts',
                     'child',
                     'nominated for',
                     'official language'
                    ]

In [90]:
related_pairs_filtered = np.array([pair for pair in related_pairs 
                                   if pair[1] not in property_toremove 
                                   and pair[1] in property_df.property.values[:100].tolist()])

In [91]:
len(np.unique(related_pairs_filtered[:,0]))/len(pagetitle_qid_dict.keys())

0.9606723755153822

In [95]:
np.savetxt('./wikidata_properties.txt', related_pairs, delimiter='\t', fmt='%s')