In [1]:
import csv
import os
import pickle
import urllib
import urllib.parse

import numpy as np
import pandas as pd
import requests
import tqdm
from requests.auth import HTTPBasicAuth

## Getting entities of desired objects

In [2]:
def search_entities(item, limit=10, n_retries=10):
    """
    Makes a query to wikidata using SPARQL for
    getting a list of entities.
    ---

    Parameters
    ---

    item - name for which entities are required.
    It is lower-cased before being searched.

    limit  - maximum number of entities to request.

    n_retries - maximum number of repeating the
    request in case of failure.


    Returns
    ---

    res - data retrieved from json response (entities).
    When no attempt produces at least one entity, the
    stub {'results': {'bindings': []}} is returned, so
    the result can always be passed to res2df.
    """

    item = item.lower()
    # The name is embedded in a double-quoted SPARQL string literal, so the
    # characters that could terminate or break that literal are escaped.
    search_term = (
        item.replace('\\', '\\\\')
        .replace('"', '\\"')
        .replace('\n', '\\n')
        .replace('\r', '\\r')
    )
    query_string = f"""
    SELECT * WHERE {{
      ?item wdt:P31 ?instance
      SERVICE wikibase:mwapi {{
        bd:serviceParam wikibase:api "EntitySearch" .
        bd:serviceParam wikibase:endpoint "www.wikidata.org" .
        bd:serviceParam mwapi:search "{search_term}" .
        bd:serviceParam mwapi:language "en" .
        bd:serviceParam mwapi:uselang "en" .
        bd:serviceParam mwapi:limit {limit} .
        ?item wikibase:apiOutputItem mwapi:item .

        ?num wikibase:apiOrdinal true.
      }}
    }} ORDER BY ASC (?num)
    """
    for retry in range(n_retries + 1):
        if retry > 0:
            print(f"retrying: attempt {retry}")
        try:
            response = requests.get(
                "https://query.wikidata.org/sparql",
                params={"query": query_string, "format": "json"},
            )
        except Exception as e:
            # Deliberately broad: any transport problem is logged and retried.
            print(f"{type(e).__name__} for {item}")
            continue
        if response.status_code != 200:
            print(f"Response {response.status_code} for {item}.")
            continue
        try:
            payload = response.json()
        except ValueError as e:
            # A 200 answer whose body is not JSON is treated like any other
            # failed attempt.
            print(f"{type(e).__name__} for {item}")
            continue
        if not payload.get("results", {}).get("bindings"):
            print(f"Zero entities for {item}.")
            continue
        if retry > 0:
            print("ok")
        return payload
    # Every attempt failed or came back empty: hand back a well-formed empty
    # result instead of an unbound name or a raw Response object.
    return {'results': {'bindings': []}}

def res2df(res):
    """
    Transforms the response from the entity searching
    query to the dataframe.
    ---

    Parameters
    ---

    res - response from the function search_entities.


    Returns
    ---

    df - dataframe with the columns 'entity', 'instance'
    and 'num', holding pairs of the following format:
    entity - one of the classes of which this entity is
    an instance. One entity can be an instance of several
    classes, so some pairs have same entity. 'num' is the
    "num" value of the response row, kept as received.
    """

    # Collect all rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = [
        {
            # Keep only the last path segment of each value.
            'entity': row["item"]["value"].split('/')[-1],
            'instance': row["instance"]["value"].split('/')[-1],
            'num': row["num"]["value"],
        }
        for row in res["results"]["bindings"]
    ]
    # Passing the columns explicitly keeps them present for an empty result.
    return pd.DataFrame(records, columns=['entity', 'instance', 'num'])

def get_entities(entities):
    """
    Extracts information about entities listed using a
    query to MediaWiki API.
    ---

    Parameters
    ---

    entities - a list (or any other sequence, e.g. a
    numpy array) of entity ids.


    Returns
    ---

    response - {'entities': {...}} with the 'entities'
    parts of all json responses merged into one dict.
    Batches whose response carries no 'entities' key are
    reported with print and skipped. No request is made
    for an empty input.
    """

    # Ids are sent in groups of this size, one request per group.
    batch_size = 50
    entities = list(entities)
    response_accumulator = {'entities': {}}
    # A single stepped range covers the full batches and the shorter tail.
    for offset in range(0, len(entities), batch_size):
        batch = entities[offset:offset + batch_size]
        response = requests.get(
            "https://www.wikidata.org/w/api.php",
            # requests builds and encodes the query string from this dict.
            params={
                "action": "wbgetentities",
                "ids": "|".join(batch),
                "languages": "en",
                "format": "json",
            },
        ).json()
        if 'entities' not in response:
            print(f"No entities in response for ids {offset}-{offset + len(batch) - 1}.")
            continue
        response_accumulator['entities'].update(response['entities'])
    return response_accumulator

def get_insts_from_json(json):
    """
    Extracts the list of entities from json response
    received by the function get_entities.
    ---

    Parameters
    ---

    json - parsed json received from get_entities.


    Returns
    ---

    insts - one list per entity in json['entities'] (same
    order), holding the ids found in the entity's P31
    statements. The list is empty when the entity has no
    'claims' or no 'P31'; statements without a
    'datavalue' are skipped.
    """

    insts = []
    for val in json['entities'].values():
        # Every entity still contributes one (possibly empty) list, so
        # positions keep matching the order of json['entities'].
        statements = val.get('claims', {}).get('P31', [])
        insts.append([
            statement['mainsnak']['datavalue']['value']['id']
            for statement in statements
            if 'datavalue' in statement['mainsnak']
        ])
    return insts

def get_props_from_json(json):
    """
    Lists the property ids of every entity contained
    in a get_entities response.
    ---

    Parameters
    ---

    json - parsed json received from get_entities.


    Returns
    ---

    props - one list per entity in json['entities'] (same
    order), holding the keys of that entity's 'claims'.
    """

    return [
        list(entity_info['claims'].keys())
        for entity_info in json['entities'].values()
    ]

def _load_candidates(df):
    """Fetch data for the distinct entity ids of df, keyed by id in df order."""
    ent_ids = list(df['entity'].unique())
    fetched = get_entities(ent_ids).get('entities', {})
    # Each id is looked up by key, so the result does not depend on the order
    # in which the response lists the entities. Ids that were not returned,
    # or were returned without 'claims', are dropped.
    return {
        ent_id: fetched[ent_id]
        for ent_id in ent_ids
        if 'claims' in fetched.get(ent_id, {})
    }


def get_best_pair(df_a, df_b, criteria='property'):
    """
    Retrieves the supposedly compared entities by
    the principle of the majority of coinciding
    "instances" (the classes of which this entity
    is an instance) or properties.
    ---

    Parameters
    ---

    df_a - DataFrame from res2df function containing
    entity ids of possible entities of object a.

    df_b  - DataFrame from res2df function containing
    entity ids of possible entities of object b.

    criteria - the criteria according to which the
    entity pair will be selected. The possible ones
    are 'property' and 'instance'.


    Returns
    ---

    entity id of object a.

    entity id of object b.

    data about object a collected from wikidata.

    data about object b collected from wikidata.

    When one object has no usable candidate, its id is ''
    and its data is {'claims': {}}; the other object then
    gets its first candidate.


    Raises
    ---

    ValueError - if criteria is neither 'property' nor
    'instance'.
    """

    if criteria == 'instance':
        extract = get_insts_from_json
    elif criteria == 'property':
        extract = get_props_from_json
    else:
        raise ValueError(
            f"Unknown criteria {criteria!r}: expected 'property' or 'instance'"
        )

    cand_a = _load_candidates(df_a)
    cand_b = _load_candidates(df_b)

    # With no candidates on either side there is nothing to score, and
    # np.argmax would fail on an empty table.
    if not cand_a and not cand_b:
        print('No entities for objects a and b')
        return '', '', {'claims': {}}, {'claims': {}}
    if not cand_a:
        print('No entities for object a')
        first_b = next(iter(cand_b))
        return '', first_b, {'claims': {}}, cand_b[first_b]
    if not cand_b:
        print('No entities for object b')
        first_a = next(iter(cand_a))
        return first_a, '', cand_a[first_a], {'claims': {}}

    ids_a = list(cand_a)
    ids_b = list(cand_b)
    # The extractors return one list per entity in dict order, which is the
    # order of ids_a / ids_b here.
    cr_a = [set(found) for found in extract({'entities': cand_a})]
    cr_b = [set(found) for found in extract({'entities': cand_b})]

    # conc_table[i, j] = number of criteria shared by candidate i of a and
    # candidate j of b.
    conc_table = np.zeros((len(ids_a), len(ids_b)))
    for (row, col), _ in np.ndenumerate(conc_table):
        conc_table[row, col] = len(cr_a[row] & cr_b[col])
    # argmax returns the first maximum, so ties go to the earliest candidates.
    best_a, best_b = np.unravel_index(np.argmax(conc_table), conc_table.shape)
    ent_a = ids_a[best_a]
    ent_b = ids_b[best_b]
    return ent_a, ent_b, cand_a[ent_a], cand_b[ent_b]

def strings2ids(obj_a, obj_b, criteria='property', n_retries=100):
    """
    Resolves two object names to wikidata entity ids:
    both names are searched, the results are turned
    into dataframes and the best pair is selected.
    ---

    Parameters
    ---

    obj_a - name of object a.

    obj_b - name of object b.

    criteria - passed on to get_best_pair; the possible
    values are 'property' and 'instance'.

    n_retries - passed on to search_entities as the
    maximum number of repeated requests.


    Returns
    ---

    The four values returned by get_best_pair: entity id
    of object a, entity id of object b, data about
    object a, data about object b.
    """

    found_a = search_entities(obj_a, n_retries=n_retries)
    found_b = search_entities(obj_b, n_retries=n_retries)
    frame_a, frame_b = res2df(found_a), res2df(found_b)
    id_a, id_b, info_a, info_b = get_best_pair(frame_a, frame_b, criteria=criteria)
    return id_a, id_b, info_a, info_b

def get_prop_names(ids, json):
    """
    Get names of properties having information
    from wikidata about them.
    ---

    Parameters
    ---

    ids - entity ids the names for which are
    needed.

    json - parsed json received from get_entities.


    Returns
    ---

    prop_names - list with one lower-cased English
    label per requested id, in the order of ids.
    "no_label" is used when the id is absent from json
    or the entity has no English label.
    """

    entities = json.get('entities', {})
    prop_names = []
    for prop_id in ids:
        # An id that was not returned, or came back without 'labels', falls
        # through to the same placeholder as a missing English label.
        label = entities.get(prop_id, {}).get('labels', {}).get('en')
        prop_names.append(label['value'].lower() if label else "no_label")
    return prop_names

def prop_filter(prop):
    """
    Decides whether a property is kept, based on the
    datatype of its first statement.
    ---

    Parameters
    ---

    prop - list of statements of one property, as
    found under 'claims' of a wikidata entity.


    Returns
    ---

    False if the datatype of the first statement's
    mainsnak is one of the excluded datatypes,
    True otherwise.
    """
    excluded_datatypes = (
        "external-id",
        "commonsMedia",
        "url",
        "globe-coordinate",
        "wikibase-sense",
        "wikibase-property",
        "wikibase-lexeme",
        "string",
    )
    # Only the first statement is inspected.
    return prop[0]['mainsnak']['datatype'] not in excluded_datatypes

def get_values(ent_info, prop_name):
    """
    Collects the item values of one property from the
    entity data gathered from wikidata.
    ---

    Parameters
    ---

    ent_info - data about an object collected
    from wikidata.

    prop_name - required property id.


    Returns
    ---

    vals - ids of the values of all statements of the
    property whose datatype is 'wikibase-item' and which
    carry a 'datavalue'. Statements of any other
    datatype (e.g. time) are ignored.
    """

    mainsnaks = (statement['mainsnak'] for statement in ent_info['claims'][prop_name])
    return [
        snak['datavalue']['value']['id']
        for snak in mainsnaks
        if snak['datatype'] == 'wikibase-item' and 'datavalue' in snak
    ]

def compare_props(ent_info_a, ent_info_b, prop_filter=lambda p: True):
    """
    Compares two entities by their properties and
    returns the lists with the names of properties
    according to the rules.
    ---

    Parameters
    ---

    ent_info_a - data about object a collected
    from wikidata.

    ent_info_b - data about object b collected
    from wikidata.

    prop_filter - function for filtering
    properties; it receives the list of statements
    of one property. No filtering by default.


    Returns
    ---

    mv_names - a list of names of common properties
    with matching values.

    nmv_names - a list of names of common properties
    with non-matching values. Common properties for
    which get_values finds no values on either side
    also end up here.

    up_names_a - names of properties, unique for
    object a.

    up_names_b - names of properties, unique for
    object b.

    Every list follows the order in which the properties
    appear in the 'claims' of the entity (object a for
    the common ones), so repeated runs on the same data
    give the same order.
    """

    props_a = [k for k, statements in ent_info_a['claims'].items() if prop_filter(statements)]
    props_b = [k for k, statements in ent_info_b['claims'].items() if prop_filter(statements)]

    # Sets are used for membership tests only; the lists keep claim order so
    # that slices such as names[:3] are reproducible.
    in_b = set(props_b)
    common_props = [prop for prop in props_a if prop in in_b]
    in_common = set(common_props)

    m_vals = []
    nm_vals = []
    for common_prop in common_props:
        vals_a = get_values(ent_info_a, common_prop)
        vals_b = get_values(ent_info_b, common_prop)
        # One shared value is enough to count the property as matching.
        if set(vals_a) & set(vals_b):
            m_vals.append(common_prop)
        else:
            nm_vals.append(common_prop)
    uncommon_props_a = [prop for prop in props_a if prop not in in_common]
    uncommon_props_b = [prop for prop in props_b if prop not in in_common]

    # The four groups are disjoint, so one lookup serves all of them.
    labels_json = get_entities(m_vals + nm_vals + uncommon_props_a + uncommon_props_b)
    mv_names = get_prop_names(m_vals, labels_json)
    nmv_names = get_prop_names(nm_vals, labels_json)
    up_names_a = get_prop_names(uncommon_props_a, labels_json)
    up_names_b = get_prop_names(uncommon_props_b, labels_json)
    return mv_names, nmv_names, up_names_a, up_names_b

def print_findings(obj_a, obj_b, mv, nmv, up_a, up_b):
    """
    Prints a report about two compared objects: a
    header with both names followed by four sections
    of comma-separated property names.
    ---

    Parameters
    ---

    obj_a - name of object a.

    obj_b - name of object b.

    mv - a list of names of common properties
    with matching values.

    nmv - a list of names of common properties
    with non-matching values.

    up_a - names of properties, unique for
    object a.

    up_b - names of properties, unique for
    object b.
    """

    rule = '-' * 50
    print(f"{rule}\n{obj_a} vs {obj_b}\n{rule}")
    sections = (
        ("Common properties with matching values", mv),
        ("Common properties with non-matching values", nmv),
        (f"Original properties for {obj_a}", up_a),
        (f"Original properties for {obj_b}", up_b),
    )
    for heading, names in sections:
        print(f"{heading}:\n{', '.join(names)}.\n")
    
def wd_compare(obj_a, obj_b):
    """
    Runs the whole comparison for two object names:
    resolves them to wikidata entities with
    strings2ids, compares the entities' properties
    with compare_props (using prop_filter) and
    prints the outcome with print_findings.
    ---

    Parameters
    ---

    obj_a - name of object a.

    obj_b - name of object b.
    """

    # The entity ids are not needed for the report, only the entity data.
    _, _, info_a, info_b = strings2ids(obj_a, obj_b)
    findings = compare_props(info_a, info_b, prop_filter=prop_filter)
    matching, non_matching, only_a, only_b = findings
    print_findings(obj_a, obj_b, matching, non_matching, only_a, only_b)

In [70]:
# Demo run: search Wikidata for "Python" and "Java", pick the best-matching
# entity pair and print the property comparison.
wd_compare("Python", "Java")

--------------------------------------------------
Python vs Java
--------------------------------------------------
Common properties with matching values:
influenced by, instance of, programming paradigm.

Common properties with non-matching values:
typing discipline, named after, developer, designed by, copyright license, topic's main category, inception, different from.

Original properties for Python:
movement, operating system, has part, topic's main template, programming language, history of topic, use.

Original properties for Java:
mascot, part of, described by source, subclass of, country of origin, has quality.



# Ranking of Wikidata aspects

In [4]:
# Read the first two columns of the tab-separated file (it has no header row).
# The frequency-gathering loop further down uses them as the two object names
# of each comparison.
data = pd.read_csv("CAM_reference_answers.tsv", sep='\t', header=None, usecols=[0, 1])

## Using Elasticsearch

In [5]:
def request_elasticsearch(objects, user='reader', password='reader', n_retries=10, size=0):
    """
    Sends a search request to the DepCC Elasticsearch
    index: every object is searched as a quoted phrase
    in the field 'text' and the phrases are combined
    with AND.
    ---

    Parameters
    ---

    objects - list of strings that all have to occur.

    user - user name for HTTP basic authentication.

    password - password for HTTP basic authentication.

    n_retries - maximum number of repeating the
    request in case of failure.

    size - value of the 'size' parameter of the request.
    With the default 0 only the hit count is of interest.


    Returns
    ---

    parsed json of the response, or the stub
    {'hits': {'total': 0}} when no attempt succeeded.
    """
    # NOTE(review): the credentials are hard-coded defaults; consider passing
    # them explicitly (e.g. from the environment) if they are not public.

    # Backslashes and double quotes are escaped so that a phrase cannot end
    # the quoted term it is placed in.
    phrases = [obj.replace('\\', '\\\\').replace('"', '\\"') for obj in objects]
    query = '\"' + '\" AND \"'.join(phrases) + '\"'
    url = (
        'http://ltdemos.informatik.uni-hamburg.de/depcc-index/_search?q=text:'
        + urllib.parse.quote(query)
        + '&from=0&size={}'.format(size)
    )

    for retry in range(n_retries + 1):
        if retry > 0:
            print(f"retrying: attempt {retry}")
        try:
            res = requests.get(url, auth=HTTPBasicAuth(user, password))
        except Exception as e:
            # Deliberately broad: any transport problem is logged and retried.
            print(f"{type(e).__name__} for ES request")
            continue
        if res.status_code != 200:
            print(f"Response {res.status_code} for ES request.")
            continue
        try:
            payload = res.json()
        except ValueError as e:
            # A 200 answer with an unparsable body counts as a failed attempt
            # instead of escaping the retry loop as an exception.
            print(f"{type(e).__name__} for ES request")
            continue
        if retry > 0:
            print("ok")
        return payload
    print(f"No response for {objects}")
    return {'hits': {'total': 0}}

### Gathering frequencies from DepCC

In [6]:
def save_var(file_name, var):
    """Pickle var into the file file_name, replacing any existing content."""
    with open(file_name, "wb") as sink:
        pickle.Pickler(sink).dump(var)
        
def read_var(file_name):
    """Return the object unpickled from the file file_name."""
    # Unpickling runs code chosen by the file's author: only use this on
    # files written by save_var from a trusted run.
    with open(file_name, "rb") as source:
        return pickle.Unpickler(source).load()
        
def save_csv(file_name, list_to_save):
    """
    Writes the rows of list_to_save to the file file_name, one comma-separated
    line per row. Rows may have different lengths.

    Fields containing commas, quotes or line breaks are quoted by the csv
    module, so they can be read back unambiguously; rows made of plain strings
    are written exactly as a manual ','.join would write them.
    """
    # newline='' lets the csv writer control line endings; lineterminator
    # keeps the plain '\n' endings. UTF-8 avoids depending on the locale's
    # default encoding.
    with open(file_name, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f, lineterminator='\n').writerows(list_to_save)

In [7]:
def count_freqs(obj, ids_list):
    """
    Counts how often an object is mentioned together with each
    aspect candidate.
    ---

    Parameters
    ---

    obj - name of the object.

    ids_list - wikidata ids whose names (from get_prop_names)
    are used as aspect candidates.


    Returns
    ---

    row - list starting with obj, followed by one string
    "<name>_<frequency>" per distinct candidate name, ordered
    by descending frequency (ties keep first-seen order).
    """
    ent_json = get_entities(ids_list)
    aspect_candidates = get_prop_names(ids_list, ent_json)

    # Several ids can resolve to the same name; each name is queried once.
    # dict.fromkeys keeps the first-seen order.
    unique_candidates = list(dict.fromkeys(aspect_candidates))

    freqs = {}

    for ac in tqdm.tqdm(unique_candidates):
        response = request_elasticsearch([obj, ac])
        total = response['hits']['total']
        # NOTE(review): newer Elasticsearch versions are documented to report
        # the total as an object with a 'value' field; confirm against the
        # server in use. Unwrapping keeps the counts sortable either way.
        if isinstance(total, dict):
            total = total['value']
        freqs[ac] = total

    sorted_freqs = sorted(freqs.items(), key=lambda item: item[1], reverse=True)
    row = [obj] + [(name + "_" + str(freq)) for name, freq in sorted_freqs]

    return row

In [None]:
# Index of the first row of `data` to process. Set it to the row to resume
# from after an interrupted run; the result lists are only reset for 0.
start = 0

if start == 0:
    prop_freq_list = []
    val_freq_list = []

# Checkpoints are written into this directory every tenth row. Creating it up
# front keeps the first save from failing after a long run of requests.
os.makedirs('checkpoints', exist_ok=True)


def _filtered_props_and_values(ent_info):
    """Return (property ids accepted by prop_filter, their item values) for one entity."""
    props, vals = [], []
    for k, statements in ent_info['claims'].items():
        if prop_filter(statements):
            props.append(k)
            vals += get_values(ent_info, k)
    return props, vals


for ind, pair in data[start:].iterrows():
    obj_a, obj_b = pair[0], pair[1]
    versus = f"{obj_a} vs {obj_b}"
    print(f"{ind}. {obj_a} vs {obj_b}")
    ent_id_a, ent_id_b, ent_info_a, ent_info_b = strings2ids(obj_a, obj_b)

    props_a, vals_a = _filtered_props_and_values(ent_info_a)
    props_b, vals_b = _filtered_props_and_values(ent_info_b)

    # Frequencies of each object together with its property names and with
    # the names of its property values.
    prop_row_a = count_freqs(obj_a, props_a)
    prop_row_b = count_freqs(obj_b, props_b)
    val_row_a = count_freqs(obj_a, vals_a)
    val_row_b = count_freqs(obj_b, vals_b)

    # Two rows per comparison (one per object), each prefixed with the pair.
    prop_freq_list.append([versus] + prop_row_a)
    prop_freq_list.append([versus] + prop_row_b)
    val_freq_list.append([versus] + val_row_a)
    val_freq_list.append([versus] + val_row_b)

    # Checkpoint on every tenth row index (except index 0).
    if ind != 0 and (ind % 10) == 0:
        print("SAVING")
        save_var('checkpoints/lists_' + str(len(prop_freq_list)) + '.pickle',
                 [prop_freq_list, val_freq_list])
        save_csv('checkpoints/ES_prop_freqs.csv', prop_freq_list)
        save_csv('checkpoints/ES_val_freqs.csv', val_freq_list)

In [9]:
# Persist the final frequency lists: one CSV per list, plus a single pickle
# holding both lists.
save_csv('ES_prop_freqs.csv', prop_freq_list)
save_csv('ES_val_freqs.csv', val_freq_list)
save_var('lists.pickle', [prop_freq_list, val_freq_list])

### Transforming to TF-IDF

In [61]:
def get_candidate_lists(freq_list):
    """
    Groups the aspect names of a frequency list by row key.

    Each row is [pair, object, "<name>_<freq>", ...]. The key is the
    concatenation of the first two fields; the value is the list of names
    (each entry with its last "_"-separated part removed) from all rows that
    share the key.
    """
    candidates = {}
    for row in freq_list:
        names = candidates.setdefault(row[0] + row[1], [])
        # rpartition cuts at the last underscore, so names that contain
        # underscores themselves stay intact.
        names.extend(entry.rpartition("_")[0] for entry in row[2:])
    return candidates

In [62]:
def count_tfidfs(ind, freq_list, candidates):
    """
    Converts the frequencies of one row of a frequency list to TF-IDF scores.
    ---

    Parameters
    ---

    ind - index of the row in freq_list.

    freq_list - list of rows [pair, object, "<name>_<freq>", ...].

    candidates - dict from get_candidate_lists; every value is
    the list of names of one pair/object.


    Returns
    ---

    row - the first two fields of the row followed by one string
    "<name>_<tfidf>" per name, ordered by descending score, where
    tf = log(freq + 1) and idf = log(N / df), with N the number of
    candidate lists and df the number of lists containing the name.
    """

    aspect_candidates = freq_list[ind][2:]
    n_lists = len(candidates)

    tfidfs = {}

    for ac in aspect_candidates:
        # The frequency is the part after the last underscore.
        name, _, freq = ac.rpartition("_")

        tf = np.log(int(freq) + 1)
        doc_freq = sum(1 for names in candidates.values() if name in names)
        if doc_freq == 0:
            # The name is reported, as before, and then counted as occurring
            # in one list (the row being scored) instead of dividing by zero.
            print(name)
            doc_freq = 1
        # max() keeps the ratio at 1 or above even when `candidates` holds
        # fewer lists than doc_freq (e.g. it is empty).
        idf = np.log(max(n_lists, doc_freq) / doc_freq)
        tfidfs[name] = tf * idf

    sorted_tfidfs = sorted(tfidfs.items(), key=lambda item: item[1], reverse=True)
    row = freq_list[ind][:2] + [(name + "_" + str(tfidf)) for name, tfidf in sorted_tfidfs]

    return row

In [None]:
start = 0

# The result lists are (re)created only when starting from the beginning.
if start == 0:
    prop_tfidf_list, val_tfidf_list = [], []

# Names per pair/object; count_tfidfs derives its document frequencies from these.
prop_candidates = get_candidate_lists(prop_freq_list)
val_candidates = get_candidate_lists(val_freq_list)

# Row i of both frequency lists is scored in the same iteration.
for i, _ in enumerate(prop_freq_list):
    print(f"{i}. {prop_freq_list[i][1]}")

    prop_tfidfs = count_tfidfs(i, prop_freq_list, prop_candidates)
    val_tfidfs = count_tfidfs(i, val_freq_list, val_candidates)

    prop_tfidf_list.append(prop_tfidfs)
    val_tfidf_list.append(val_tfidfs)

In [67]:
# Write the TF-IDF rows for properties and for values to their own CSV files.
save_csv('ES_prop_tfidfs.csv', prop_tfidf_list)
save_csv('ES_val_tfidfs.csv', val_tfidf_list)

### Templates

In [23]:
def get_template(obj_a, obj_b, mv, up_a, up_b, n_props=3):
    """
    Return basic template for comparing pair
    ---

    Parameters
    ---

    obj_a - name of object a.

    obj_b - name of object b.

    mv - a list of names of common properties
    with matching values.

    up_a - names of properties, unique for
    object a.

    up_b - names of properties, unique for
    object b.

    n_props - maximum number of names taken from the
    start of each list (3 by default; None uses all).

    Returns
    ---

    template_text - str of template text
    """
    shared = ', '.join(mv[:n_props])
    only_a = ', '.join(up_a[:n_props])
    only_b = ', '.join(up_b[:n_props])
    templ_beginning = "{} and {} are both {}.".format(obj_a, obj_b, shared)
    templ_ending = " However, {} is {}, when {} is {}.".format(obj_a, only_a, obj_b, only_b)
    template_text = templ_beginning + templ_ending
    return template_text

In [24]:
# Demo: resolve both names to Wikidata entities, compare their filtered
# properties and print the resulting template sentence.
obj_a = "Python"
obj_b = "Java"
ent_id_a, ent_id_b, ent_info_a, ent_info_b = strings2ids(obj_a, obj_b)
mv, nmv, up_a, up_b = compare_props(ent_info_a, ent_info_b, prop_filter=prop_filter)
print(get_template(obj_a, obj_b, mv, up_a, up_b))

Python and Java are both instance of, programming paradigm, influenced by. However, Python is programming language, operating system, movement, when Java is mascot, subclass of, described by source.
