In [1]:
from qwikidata.sparql  import return_sparql_query_results
import pandas as pd
import numpy as np
import requests

## Getting entities of desired objects

A function making a query to wikidata using SPARQL for getting a list of entities using a string as a name the desired item.

In [2]:
def search_entities(item, limit=10):
    query_string = f"""
    SELECT * WHERE {{
      ?item wdt:P31 ?instance
      SERVICE wikibase:mwapi {{
        bd:serviceParam wikibase:api "EntitySearch" .
        bd:serviceParam wikibase:endpoint "www.wikidata.org" .
        bd:serviceParam mwapi:search "{item}" .
        bd:serviceParam mwapi:language "en" .
        bd:serviceParam mwapi:uselang "en" .
        bd:serviceParam mwapi:limit {limit} .
        ?item wikibase:apiOutputItem mwapi:item .

        ?num wikibase:apiOrdinal true.
      }}
    }} ORDER BY ASC (?num)
    """
#         ?label wikibase:apiOutput "@label" .
#         ?matchType wikibase:apiOutput "match/@type" .
#         ?matchLang wikibase:apiOutput "match/@language" .
#         ?matchText wikibase:apiOutput "match/@text"  .
#         ?description wikibase:apiOutput "@description" .
    res = return_sparql_query_results(query_string)
    return res

In [3]:
res_a = search_entities("Python")
res_b = search_entities("Java")

This function transforms the response from the entity searching query to the dataframe which contains pairs of the following format: Entity - One of the classes of which this entity is an instance. One entity can be an instance of several classes, so some pairs have same entity.

In [5]:
def res2df(res):
    df = pd.DataFrame(columns=['entity', 'instance', 'num'])
    for row in res["results"]["bindings"]:
        entity = row["item"]["value"].split('/')[-1]
        instance = row["instance"]["value"].split('/')[-1]
        num = row["num"]["value"]
        df = df.append({'entity': entity, 'instance': instance, 'num': num}, ignore_index=True)
    return df

In [6]:
df_a = res2df(res_a)
df_b = res2df(res_b)

A function for extracting information about entities listed using a query to MediaWiki API. SPARQL does not give the full list of enitities for some reason. You cn check this by comparing the response received from it with web search page results at wikidata.org. Nevertheless, it can be helpful for getting a list of entities for a requested string.

In [10]:
def get_entities(entities):
    ids = ""
    for entity in entities:
        ids += entity + "|"
    ids = ids[:-1]
    url = f"https://www.wikidata.org/w/api.php?action=wbgetentities&ids={ids}&languages=en&format=json"
    response = requests.get(url).json()
    return response

Extracting the list of entities from json responce received by the previous function.

In [11]:
def get_insts_from_json(json):
    insts = []
    for key, val in json['entities'].items():
        insts_json = val['claims']['P31']
        inst = []
        for inst_json in insts_json:
            inst.append(inst_json['mainsnak']['datavalue']['value']['id'])
        insts.append(inst)
    return insts

A function for getting the supposedly compared entities by the principle of the majotity of coinciding "instances" (the classes of which this entity is an instance).

In [12]:
def get_best_pair(df_a, df_b):
    ent_a = df_a['entity'].unique()
    ent_b = df_b['entity'].unique()
    json_a = get_entities(ent_a)
    json_b = get_entities(ent_b)
    insts_a = get_insts_from_json(json_a)
    insts_b = get_insts_from_json(json_b)
    
    conc_table = np.zeros((ent_a.shape[0], ent_b.shape[0]))
    for ind, val in np.ndenumerate(conc_table):
#         inst_a = df_a[df_a['entity'] == ent_a[ind[0]]]['instance']
#         inst_b = df_b[df_b['entity'] == ent_b[ind[1]]]['instance']
        inst_a = insts_a[ind[0]]
        inst_b = insts_b[ind[1]]
        common_inst = list(set(inst_a) & set(inst_b))
        conc_table[ind] = len(common_inst)
    pair = list(np.unravel_index(np.argmax(conc_table), conc_table.shape))
    pair[0] = ent_a[pair[0]]
    pair[1] = ent_b[pair[1]]
    return pair

In [13]:
get_best_pair(df_a, df_b)

['Q28865', 'Q251']

Uniting the process of searching for entities, corresponding to the compared objects, using string names as input,  into one function. String names -> entity ids.

In [14]:
def strings2ids(obj_a, obj_b):
    a = search_entities(obj_a)
    b = search_entities(obj_b)
    a = res2df(a)
    b = res2df(b)
    ids = get_best_pair(a, b)
    return ids

In [15]:
strings2ids("python", "java")

['Q28865', 'Q251']

Let us try another approach based on coinciding pairs of property (entity relation) names.

In [16]:
def get_props_from_json(json):
    props = []
    for key, val in json['entities'].items():
        props_json = val['claims']
        prop = []
        for k in props_json.keys():
            prop.append(k)
        props.append(prop)
    return props

In [17]:
ent_a = df_a['entity'].unique()
json_a = get_entities(ent_a)
props_a = get_props_from_json(json_a)

In [45]:
def get_best_pair(df_a, df_b, criteria='property'):
    ent_a = df_a['entity'].unique()
    ent_b = df_b['entity'].unique()
    json_a = get_entities(ent_a)
    json_b = get_entities(ent_b)
    if criteria == 'instance':
        cr_a = get_insts_from_json(json_a)
        cr_b = get_insts_from_json(json_b)
    elif criteria == 'property':
        cr_a = get_props_from_json(json_a)
        cr_b = get_props_from_json(json_b)
    
    conc_table = np.zeros((ent_a.shape[0], ent_b.shape[0]))
    for ind, val in np.ndenumerate(conc_table):
#         inst_a = df_a[df_a['entity'] == ent_a[ind[0]]]['instance']
#         inst_b = df_b[df_b['entity'] == ent_b[ind[1]]]['instance']
        cur_cr_a = cr_a[ind[0]]
        cur_cr_b = cr_b[ind[1]]
        common_cr = list(set(cur_cr_a) & set(cur_cr_b))
        conc_table[ind] = len(common_cr)
    pair = list(np.unravel_index(np.argmax(conc_table), conc_table.shape))
    ent_info_a = json_a['entities'][ent_a[pair[0]]]
    ent_info_b = json_b['entities'][ent_b[pair[1]]]
    return ent_a[pair[0]], ent_b[pair[1]], ent_info_a, ent_info_b

If we look at the concurrernce tables (conc_table variable), we can see that in case of comparing instance the numbers are lower than in case of comparing properties, so outliers are more likely to mess up the choice of entity pair.  

Conc_table for instances:  
[[2. 0. 0. 0. 0. 0. 0. 0. 0.]  
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]  
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]  
 [0. 0. 1. 0. 0. 0. 0. 0. 0.]  
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]  
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]  
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]  
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]  
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]  
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]]
   
Conc_table for properties:  
[[28.  9.  4. 15.  2.  3.  2.  2.  3.]  
 [ 4.  6.  2.  4.  2.  3. 12.  2.  4.]  
 [ 8. 13.  3.  5.  2.  4.  3.  2.  4.]  
 [ 3.  3.  4.  2.  1.  2.  2.  1.  2.]  
 [ 9.  3.  2. 11.  2.  2.  2.  2.  3.]  
 [ 4.  5.  2.  5.  2.  3.  3.  2.  4.]  
 [ 1.  2.  1.  1.  2.  1.  1.  1.  1.]  
 [ 2.  1.  1.  2.  1.  1.  1.  1.  1.]  
 [ 3.  3.  3.  3.  1.  3.  2.  1.  2.]  
 [ 2.  1.  1.  1.  1.  1.  1.  1.  1.]]  
  


In [46]:
ent_id_a, ent_id_b, ent_info_a, ent_info_b = get_best_pair(df_a, df_b, criteria='property')
ent_id_a, ent_id_b

('Q28865', 'Q251')

In [47]:
def strings2ids(obj_a, obj_b, criteria='property'):
    a = search_entities(obj_a)
    b = search_entities(obj_b)
    a = res2df(a)
    b = res2df(b)
    ent_id_a, ent_id_b, ent_info_a, ent_info_b = get_best_pair(a, b, criteria=criteria)
    return ent_id_a, ent_id_b, ent_info_a, ent_info_b

In [48]:
ent_id_a, ent_id_b, ent_info_a, ent_info_b = strings2ids("python", "java")
ent_id_a, ent_id_b

('Q28865', 'Q251')

## Retreiving aspects???

Based on coinciding properties.

In [60]:
def get_prop_names(ids, json):
    prop_names = []
    for prop_id in ids:
        prop_names.append(json['entities'][prop_id]['labels']['en']['value'])
    return prop_names

In [65]:
def compare_props(ent_info_a, ent_info_b):
    props_a = []
    for k in ent_info_a['claims'].keys():
        props_a.append(k)
        
    props_b = []
    for k in ent_info_b['claims'].keys():
        props_b.append(k)
    
    common_props = list(set(props_a) & set(props_b))
    uncommon_props = list(set(props_a) ^ set(props_b))
    
    a_json = get_entities(common_props[:50])
    b_json = get_entities(uncommon_props[:50])
    cp_names = get_prop_names(common_props, a_json)
    up_names = get_prop_names(uncommon_props, b_json)
    return cp_names, up_names

In [69]:
cp

['inception',
 'PSH ID',
 'Bibliothèque nationale de France ID',
 'instance of',
 'Wolfram Language entity code',
 'Stack Exchange tag',
 'file extension',
 "topic's main category",
 'official website',
 'different from',
 'Freebase ID',
 'Quora topic ID',
 'named after',
 'YSA ID',
 'influenced by',
 'Microsoft Academic ID',
 'programming paradigm',
 'GND ID',
 'Commons category',
 'designed by',
 'French Vikidia ID',
 'typing discipline',
 'developer',
 'Library of Congress authority ID',
 'copyright license',
 'subreddit',
 'media type',
 'software version identifier']

In [68]:
up

['FAST ID',
 'Zhihu topic ID',
 'part of',
 'Biblioteca Nacional de España ID',
 'hashtag',
 'has quality',
 'country of origin',
 'Treccani ID',
 'bug tracking system',
 'Twitter username',
 'mascot',
 'NE.se ID',
 'ESCO skill ID',
 'subclass of',
 'IdRef ID',
 'movement',
 'logo image',
 'Techopedia ID',
 'IRC channel',
 'history of topic',
 'Open Hub ID',
 'pronunciation audio',
 'Commons gallery',
 'Brockhaus Enzyklopädie online ID',
 'Dewey Decimal Classification',
 'Store norske leksikon ID',
 'use',
 'source code repository',
 'operating system',
 "topic's main template",
 'Encyclopædia Britannica Online ID',
 'BabelNet ID',
 'GitHub username',
 'has part',
 'described by source',
 'programming language']