In [1]:
import pandas as pd
import numpy as np
import requests
from requests.auth import HTTPBasicAuth
import pickle

## Getting entities of desired objects

In [2]:
def search_entities(item, limit=10, n_retries=10):
    """
    Makes a query to wikidata using SPARQL for
    getting a list of entities.
    ---

    Parameters
    ---

    item - name for which entities are required.
    It is lower-cased before it is searched.

    limit - maximum number of entities to request.

    n_retries - maximum number of repeating the
    request after the first attempt failed or
    produced zero entities.


    Returns
    ---

    res - data retrieved from json response (entities).
    If no attempt produced a parsable response, the
    empty structure {'results': {'bindings': []}} is
    returned, so res["results"]["bindings"] is always
    available to the caller.
    """

    item = item.lower()
    # A backslash or a double quote inside the name would terminate or
    # corrupt the SPARQL string literal it is embedded in.
    escaped = item.replace("\\", "\\\\").replace('"', '\\"')
    query_string = f"""
    SELECT * WHERE {{
      ?item wdt:P31 ?instance
      SERVICE wikibase:mwapi {{
        bd:serviceParam wikibase:api "EntitySearch" .
        bd:serviceParam wikibase:endpoint "www.wikidata.org" .
        bd:serviceParam mwapi:search "{escaped}" .
        bd:serviceParam mwapi:language "en" .
        bd:serviceParam mwapi:uselang "en" .
        bd:serviceParam mwapi:limit {limit} .
        ?item wikibase:apiOutputItem mwapi:item .

        ?num wikibase:apiOrdinal true.
      }}
    }} ORDER BY ASC (?num)
    """
    # Fallback returned when every attempt fails; it has the same shape
    # as a successful answer without any bindings.
    result = {'results': {'bindings': []}}
    for retry in range(n_retries + 1):
        if retry > 0:
            print(f"retrying: attempt {retry}")
        try:
            response = requests.get(
                "https://query.wikidata.org/sparql",
                params={"query": query_string, "format": "json"},
            )
        except Exception as e:
            print(f"{type(e).__name__} for {item}")
            continue
        if response.status_code != 200:
            print(f"Response {response.status_code} for {item}.")
            continue
        try:
            payload = response.json()
        except ValueError:
            # A 200 answer with a non-JSON body counts as a failed attempt.
            print(f"Invalid JSON for {item}.")
            continue
        # Only a parsed answer ever replaces the fallback, so the caller
        # never receives a raw Response object or an unbound name.
        result = payload
        if not payload["results"]["bindings"]:
            print(f"Zero entities for {item}.")
            continue
        if retry > 0:
            print("ok")
        break
    return result

def res2df(res):
    """
    Transforms the response from the entity searching
    query to the dataframe.
    ---

    Parameters
    ---

    res - response from the function search_entities.


    Returns
    ---

    df - dataframe with the columns 'entity', 'instance'
    and 'num'. Each row is a pair of the following
    format: entity - one of the classes of which this
    entity is an instance. One entity can be an instance
    of several classes, so some pairs have same entity.
    The dataframe is empty (but keeps its columns) when
    the response has no bindings.
    """

    # Collect all rows first and build the frame once: DataFrame.append
    # copied the frame on every call and no longer exists in pandas 2.x.
    records = [
        {
            # The values are entity URIs; the id is the last path segment.
            'entity': row["item"]["value"].split('/')[-1],
            'instance': row["instance"]["value"].split('/')[-1],
            'num': row["num"]["value"],
        }
        for row in res["results"]["bindings"]
    ]
    return pd.DataFrame(records, columns=['entity', 'instance', 'num'])

def get_entities(entities):
    """
    Extracts information about entities listed using a
    query to MediaWiki API (action wbgetentities).
    ---

    Parameters
    ---

    entities - a list (or other iterable) of entity ids.


    Returns
    ---

    response - data retrieved from json response
    (information about entities). An empty dict is
    returned without contacting the API when no ids
    are given, so the key 'entities' is absent in
    that case.
    """

    ids = "|".join(entities)
    if not ids:
        # Nothing to look up: skip the round trip that would only yield
        # an API error object without the 'entities' key.
        return {}
    # Passing the query as params lets requests do the URL encoding.
    response = requests.get(
        "https://www.wikidata.org/w/api.php",
        params={
            "action": "wbgetentities",
            "ids": ids,
            "languages": "en",
            "format": "json",
        },
    )
    return response.json()

def get_insts_from_json(json):
    """
    Extracts the "instance of" (P31) class ids of every
    entity in the json response received by the function
    get_entities.
    ---

    Parameters
    ---

    json - parsed json received from get_entities.


    Returns
    ---

    insts - a list with one item per entity, in the
    order of json['entities']. Each item is the list of
    entity ids of the classes of which that entity is
    an instance. Entities without claims or without P31
    contribute an empty list, so positions stay aligned
    with the entities.
    """

    insts = []
    for entity in json['entities'].values():
        class_ids = []
        for claim in entity.get('claims', {}).get('P31', []):
            # Snaks of type "somevalue"/"novalue" carry no datavalue.
            datavalue = claim['mainsnak'].get('datavalue')
            if datavalue is not None:
                class_ids.append(datavalue['value']['id'])
        insts.append(class_ids)
    return insts

def get_props_from_json(json):
    """
    Collects the property ids used in the claims of
    every entity in the json response received by the
    function get_entities.
    ---

    Parameters
    ---

    json - parsed json received from get_entities.


    Returns
    ---

    props - a list with one item per entity, in the
    order of json['entities']. Each item is the list
    of property ids found in the claims of that entity.
    """

    # Iterating a dict yields its keys, i.e. the property ids.
    return [list(entity['claims']) for entity in json['entities'].values()]

def get_best_pair(df_a, df_b, criteria='property'):
    """
    Retrieves the supposedly compared entities by
    the principle of the majority of coinciding
    "instances" (the classes of which this entity
    is an instance) or properties.
    ---

    Parameters
    ---

    df_a - DataFrame from res2df function containing
    entity ids of possible entities of object a.

    df_b - DataFrame from res2df function containing
    entity ids of possible entities of object b.

    criteria - the criteria according to which the
    entity pair will be selected. The possible ones
    are 'property' and 'instance'.


    Returns
    ---

    entity id of object a.

    entity id of object b.

    data about object a collected from wikidata.

    data about object b collected from wikidata.

    When nothing was retrieved for an object, its id
    is '' and its data is {'claims': {}}; the other
    object then falls back to its first candidate.


    Raises
    ---

    ValueError - if criteria is neither 'property'
    nor 'instance'.
    """

    # Reject an unknown criteria before any network request is made.
    if criteria == 'instance':
        extract = get_insts_from_json
    elif criteria == 'property':
        extract = get_props_from_json
    else:
        raise ValueError(
            f"criteria must be 'property' or 'instance', got {criteria!r}"
        )

    ent_a = df_a['entity'].unique()
    ent_b = df_b['entity'].unique()
    json_a = get_entities(ent_a)
    json_b = get_entities(ent_b)
    if 'entities' not in json_a:
        if 'entities' not in json_b:
            print('No entities for objects a and b')
            return '', '', {'claims': {}}, {'claims': {}}
        else:
            print('No entities for object a')
            return '', ent_b[0], {'claims': {}}, json_b['entities'][ent_b[0]]
    elif 'entities' not in json_b:
        print('No entities for object b')
        return ent_a[0], '', json_a['entities'][ent_a[0]], {'claims': {}}

    # NOTE(review): the extracted lists are matched to ent_a / ent_b by
    # position, which assumes the response lists the entities in the
    # order they were requested - confirm.
    sets_a = [set(ids) for ids in extract(json_a)]
    sets_b = [set(ids) for ids in extract(json_b)]

    # conc_table[i, j] = number of ids shared by candidate i of object a
    # and candidate j of object b.
    conc_table = np.zeros((ent_a.shape[0], ent_b.shape[0]))
    for i in range(ent_a.shape[0]):
        for j in range(ent_b.shape[0]):
            conc_table[i, j] = len(sets_a[i] & sets_b[j])

    # argmax works on the flattened table and returns the first maximum.
    best_a, best_b = np.unravel_index(np.argmax(conc_table), conc_table.shape)
    ent_info_a = json_a['entities'][ent_a[best_a]]
    ent_info_b = json_b['entities'][ent_b[best_b]]
    return ent_a[best_a], ent_b[best_b], ent_info_a, ent_info_b

def strings2ids(obj_a, obj_b, criteria='property', n_retries=100):
    """
    Resolves two object names to wikidata entity ids:
    searches candidates for both names, turns the
    results into dataframes and picks the best pair.
    ---

    Parameters
    ---

    obj_a - name of object a.

    obj_b - name of object b.

    criteria - the criteria according to which the
    entity pair will be selected. The possible ones
    are 'property' and 'instance'.

    n_retries - maximum number of repeating each
    search request in case of failure.


    Returns
    ---

    ent_id_a - entity id of object a.

    ent_id_b - entity id of object b.

    ent_info_a - data about object a collected
    from wikidata.

    ent_info_b - data about object b collected
    from wikidata.
    """

    # Both searches are issued before either result is converted.
    found = [
        search_entities(name, n_retries=n_retries)
        for name in (obj_a, obj_b)
    ]
    candidates_a, candidates_b = (res2df(result) for result in found)
    return get_best_pair(candidates_a, candidates_b, criteria=criteria)

def get_prop_names(ids, json):
    """
    Looks up the English labels of the given ids in
    the data about them received from wikidata.
    ---

    Parameters
    ---

    ids - entity ids the names for which are
    needed.

    json - parsed json received from get_entities.
    Ids for which names are needed must be among
    the requested entities.


    Returns
    ---

    prop_names - list of lower-cased English labels
    of the entities with requested ids, in the order
    of ids.
    """

    # json['entities'] is looked up per id, so it is never touched when
    # ids is empty.
    return [
        json['entities'][prop_id]['labels']['en']['value'].lower()
        for prop_id in ids
    ]

def prop_filter(prop):
    """
    Filter function for properties: decides by the
    datatype of the first claim whether a property
    is kept.
    ---

    Parameters
    ---

    prop - list of claims of one property gathered
    from wikidata entity.


    Returns
    ---

    True if the property is ok or False if the
    property is not to go further.
    """
    rejected_datatypes = (
        "external-id",
        "commonsMedia",
        "url",
        "globe-coordinate",
        "wikibase-sense",
        "wikibase-property",
        "wikibase-lexeme",
        "string",
    )
    first_claim = prop[0]
    return first_claim['mainsnak']['datatype'] not in rejected_datatypes

def get_values(ent_info, prop_name):
    """
    Retrieves values of a specific property
    from the entity data gathered from wikidata.
    ---

    Parameters
    ---

    ent_info - data about an object collected
    from wikidata.

    prop_name - required property id.


    Returns
    ---

    vals - a list of entity ids that are values of
    the requested property. Only claims of datatype
    'wikibase-item' that carry a value are included;
    other datatypes (e.g. time) are ignored.
    """

    vals = []
    for claim in ent_info['claims'][prop_name]:
        snak = claim['mainsnak']
        if snak.get('datatype') != 'wikibase-item':
            continue
        # "unknown value" / "no value" snaks have no datavalue to read.
        datavalue = snak.get('datavalue')
        if datavalue is None:
            continue
        vals.append(datavalue['value']['id'])
    return vals

def _prop_names_for(prop_ids, max_ids=50):
    """
    Resolves at most max_ids property ids to their
    names; makes no request for an empty list.
    """
    batch = prop_ids[:max_ids]
    if not batch:
        return []
    return get_prop_names(batch, get_entities(batch))


def compare_props(ent_info_a, ent_info_b, prop_filter=lambda p: True):
    """
    Compares two entities by their properties and
    returns the lists with the names of properties
    according to the rules.
    ---

    Parameters
    ---

    ent_info_a - data about object a collected
    from wikidata.

    ent_info_b - data about object b collected
    from wikidata.

    prop_filter - function for filtering
    properties. It receives the list of claims of
    one property and returns True to keep it.
    No filtering by default.


    Returns
    ---

    mv_names - a list of names of common properties
    with matching values.

    nmv_names - a list of names of common properties
    with non-matching values.

    up_names_a - names of properties, unique for
    object a.

    up_names_b - names of properties, unique for
    object b.

    Every list follows the order in which the
    properties appear in the claims of the entity
    (object a for the common ones) and is limited to
    the first 50 properties.
    """

    claims_a = ent_info_a['claims']
    claims_b = ent_info_b['claims']
    props_a = [k for k in claims_a if prop_filter(claims_a[k])]
    props_b = [k for k in claims_b if prop_filter(claims_b[k])]

    # Sets are used only for membership tests; the lists keep the claim
    # order so the result (and the truncation to 50 ids) is the same on
    # every run.
    in_a = set(props_a)
    in_b = set(props_b)

    m_vals = []
    nm_vals = []
    for common_prop in (p for p in props_a if p in in_b):
        vals_a = get_values(ent_info_a, common_prop)
        vals_b = get_values(ent_info_b, common_prop)
        # One shared value is enough to count the property as matching.
        if set(vals_a) & set(vals_b):
            m_vals.append(common_prop)
        else:
            nm_vals.append(common_prop)
    uncommon_props_a = [p for p in props_a if p not in in_b]
    uncommon_props_b = [p for p in props_b if p not in in_a]

    mv_names = _prop_names_for(m_vals)
    nmv_names = _prop_names_for(nm_vals)
    up_names_a = _prop_names_for(uncommon_props_a)
    up_names_b = _prop_names_for(uncommon_props_b)
    return mv_names, nmv_names, up_names_a, up_names_b

def print_findings(obj_a, obj_b, mv, nmv, up_a, up_b):
    """
    Prints the result of comparing two objects: a
    header with both names followed by four groups
    of property names, each as a comma-separated
    line.
    ---

    Parameters
    ---

    obj_a - name of object a.

    obj_b - name of object b.

    mv - a list of names of common properties
    with matching values.

    nmv - a list of names of common properties
    with non-matching values.

    up_a - names of properties, unique for
    object a.

    up_b - names of properties, unique for
    object b.
    """

    ruler = '-' * 50
    print(f"{ruler}\n{obj_a} vs {obj_b}\n{ruler}")

    sections = (
        ("Common properties with matching values:", mv),
        ("Common properties with non-matching values:", nmv),
        (f"Original properties for {obj_a}:", up_a),
        (f"Original properties for {obj_b}:", up_b),
    )
    for heading, names in sections:
        print(f"{heading}\n{', '.join(names)}.\n")
    
def wd_compare(obj_a, obj_b):
    """
    One-liner for the whole comparison: searches
    wikidata for entities corresponding to names of
    objects a and b, selects the best pair, compares
    their filtered properties and prints the names
    of the properties found.
    ---

    Parameters
    ---

    obj_a - name of object a.

    obj_b - name of object b.
    """

    # The entity ids themselves are not needed for printing.
    _, _, info_a, info_b = strings2ids(obj_a, obj_b)
    findings = compare_props(info_a, info_b, prop_filter=prop_filter)
    print_findings(obj_a, obj_b, *findings)

In [3]:
# Example run: compare the wikidata entities found for the names "Python" and "Java".
wd_compare("Python", "Java")

--------------------------------------------------
Python vs Java
--------------------------------------------------
Common properties with matching values:
instance of, programming paradigm, influenced by.

Common properties with non-matching values:
inception, topic's main category, designed by, named after, different from, developer, typing discipline, copyright license.

Original properties for Python:
history of topic, topic's main template, operating system, movement, programming language, use, has part.

Original properties for Java:
has quality, part of, described by source, mascot, subclass of, country of origin.



# Ranking of Wikidata aspects

In [None]:
import json

# 'mined_bow_str.json' is read as JSON Lines: one JSON document per line.
# JSON text is UTF-8, so the encoding is fixed instead of taken from the locale.
with open('mined_bow_str.json', 'r', encoding='utf-8') as f:
    contents = f.readlines()

# Blank lines (e.g. a trailing newline at the end of the file) are skipped,
# because json.loads raises on an empty document.
data = [json.loads(line) for line in contents if line.strip()]

## Using Elasticsearch

In [None]:
def request_elasticsearch(objects, user='reader', password='reader'):
    """
    Asks the depcc Elasticsearch index how many
    documents contain all the given phrases.
    ---

    Parameters
    ---

    objects - a list of phrases; each one is quoted
    and the phrases are joined with AND.

    user - user name for HTTP basic authentication.

    password - password for HTTP basic authentication.


    Returns
    ---

    response - the response object of the request.
    No documents are requested (size is 0), so only
    the metadata of the search is delivered.
    """
    url = 'http://ltdemos.informatik.uni-hamburg.de/depcc-index/_search'

    # Escape backslashes and quotes so a phrase cannot close its own
    # quotation inside the query string.
    phrases = [
        phrase.replace('\\', '\\\\').replace('"', '\\"')
        for phrase in objects
    ]
    # NOTE(review): only the first phrase is prefixed with the field
    # name "text:", exactly as in the hand-built URL - confirm that the
    # other phrases are meant to hit the default field.
    query = 'text:"' + '" AND "'.join(phrases) + '"'

    size = 0

    # params leaves the URL encoding to requests, so characters such as
    # '&' or '#' in a phrase no longer break the request.
    response = requests.get(
        url,
        params={'q': query, 'from': 0, 'size': size},
        auth=HTTPBasicAuth(user, password),
    )
    return response

### Aspect NAMES for a pair of objects

In [None]:
import os

# freq_list collects one output row per processed pair. It is created only
# when it does not exist yet, so an interrupted run can be resumed in the
# same kernel while a fresh kernel no longer fails with a NameError.
if 'freq_list' not in globals():
    freq_list = []

# Index of the first pair to process.
# NOTE(review): 169 looks like the resume point of an interrupted run - set
# it to 0 to process all pairs from the beginning.
start = 169

# The checkpoint files below are written into this directory.
os.makedirs('checkpoints', exist_ok=True)

for ind, pair in enumerate(data[start:]):
    obj_a, obj_b = pair['object1']['name'], pair['object2']['name']
    print(f"{start + ind}. {obj_a} vs {obj_b}")
    ent_id_a, ent_id_b, ent_info_a, ent_info_b = strings2ids(obj_a, obj_b)
    mv, nmv, up_a, up_b = compare_props(ent_info_a, ent_info_b, prop_filter=prop_filter)

    # Only properties that both entities have are candidate aspects.
    aspect_candidates = mv + nmv
    freqs = {}

    for ac in aspect_candidates:
        response = request_elasticsearch([obj_a, obj_b, ac]).json()
        total = response['hits']['total']
        # Elasticsearch 7+ reports the total as an object with a 'value' field.
        if isinstance(total, dict):
            total = total['value']
        freqs[ac] = total

    # Most frequent aspect first; each cell is "<name>_<hit count>".
    sorted_freqs = sorted(freqs.items(), key=lambda item: item[1], reverse=True)
    row = [obj_a, obj_b] + [(name + "_" + str(freq)) for name, freq in sorted_freqs]

    freq_list.append(row)
    # Checkpoint all rows collected so far after every 10th row.
    if len(freq_list) % 10 == 0:
        print("SAVING")
        with open('checkpoints/ES_pair_names_' + str(len(freq_list)) + '.csv', 'w') as f:
            for saved_row in freq_list:
                f.write(','.join(saved_row) + '\n')

In [None]:
# Write every collected row to the final file, one comma-separated line per pair.
with open('ES_pair_names.csv', 'w') as out_file:
    out_file.writelines(','.join(cells) + '\n' for cells in freq_list)