In [1]:
from qwikidata.sparql import return_sparql_query_results
import pandas as pd
import numpy as np
import requests
from requests.auth import HTTPBasicAuth
import pickle

## Getting entities of desired objects

A function that queries Wikidata via SPARQL to get a list of entities whose name matches a given string.

In [2]:
def search_entities(item, limit=10):
    """Search Wikidata for entities matching a name.

    The query calls the MediaWiki ``EntitySearch`` API through the
    ``wikibase:mwapi`` SPARQL service and joins every hit with its ``wdt:P31``
    values, so an entity appears once per ``P31`` value.

    Parameters
    ----------
    item : str
        Name to search for.
    limit : int
        Value passed as ``mwapi:limit``.

    Returns
    -------
    The value returned by ``return_sparql_query_results`` for the query.
    """
    # Escape backslashes and double quotes so the name cannot terminate the
    # SPARQL string literal early and break (or alter) the query.
    safe_item = item.replace("\\", "\\\\").replace('"', '\\"')
    query_string = f"""
    SELECT * WHERE {{
      ?item wdt:P31 ?instance
      SERVICE wikibase:mwapi {{
        bd:serviceParam wikibase:api "EntitySearch" .
        bd:serviceParam wikibase:endpoint "www.wikidata.org" .
        bd:serviceParam mwapi:search "{safe_item}" .
        bd:serviceParam mwapi:language "en" .
        bd:serviceParam mwapi:uselang "en" .
        bd:serviceParam mwapi:limit {limit} .
        ?item wikibase:apiOutputItem mwapi:item .

        ?num wikibase:apiOrdinal true.
      }}
    }} ORDER BY ASC (?num)
    """
    return return_sparql_query_results(query_string)

In [3]:
# Run the entity search for the two example objects; the raw responses are
# converted to data frames in a later cell.
res_a = search_entities("Python")
res_b = search_entities("Java")

This function transforms the response from the entity searching query to the dataframe which contains pairs of the following format: Entity - One of the classes of which this entity is an instance. One entity can be an instance of several classes, so some pairs have same entity.

In [4]:
def res2df(res):
    """Flatten a search response into an (entity, instance, num) data frame.

    Parameters
    ----------
    res : dict
        Response whose ``res["results"]["bindings"]`` is a list of rows with
        ``"item"``, ``"instance"`` and ``"num"`` entries, each holding a
        ``"value"``.

    Returns
    -------
    pandas.DataFrame
        One row per binding with columns ``entity`` and ``instance`` (the last
        path segment of the respective URI) and ``num`` (taken verbatim).
        An empty response yields an empty frame with the same columns.
    """
    # Collect all rows first and build the frame once: DataFrame.append copied
    # the whole frame on every call and no longer exists in pandas 2.x.
    records = [
        {
            'entity': row["item"]["value"].split('/')[-1],
            'instance': row["instance"]["value"].split('/')[-1],
            'num': row["num"]["value"],
        }
        for row in res["results"]["bindings"]
    ]
    return pd.DataFrame(records, columns=['entity', 'instance', 'num'])

In [5]:
# Convert both raw search responses into (entity, instance, num) frames.
df_a = res2df(res_a)
df_b = res2df(res_b)

A function for extracting information about the listed entities using a query to the MediaWiki API. SPARQL does not give the full list of entities for some reason. You can check this by comparing the response received from it with the web search page results at wikidata.org. Nevertheless, it can be helpful for getting a list of entities for a requested string.

In [6]:
def get_entities(entities):
    """Fetch entity records from the Wikidata ``wbgetentities`` API.

    Parameters
    ----------
    entities : iterable of str
        Ids to request; they are joined with ``|`` into the ``ids`` parameter.

    Returns
    -------
    The decoded JSON body of the response.
    """
    ids = "|".join(entities)
    url = f"https://www.wikidata.org/w/api.php?action=wbgetentities&ids={ids}&languages=en&format=json"
    return requests.get(url).json()

Extracting the lists of instance classes (P31 values) of the entities from the JSON response received by the previous function.

In [7]:
def get_insts_from_json(json):
    """Collect the ``P31`` value ids of every entity in a ``wbgetentities`` response.

    Parameters
    ----------
    json : dict
        Response with an ``'entities'`` mapping of id -> entity record.

    Returns
    -------
    list of list of str
        One list per entity, in the iteration order of ``json['entities']``.
        An entity without ``claims`` or without ``P31`` contributes an empty
        list, so positions stay aligned with the entities.
    """
    insts = []
    for val in json['entities'].values():
        statements = val.get('claims', {}).get('P31', [])
        inst = []
        for statement in statements:
            # Statements without a concrete value carry no 'datavalue';
            # skip them instead of failing with KeyError.
            datavalue = statement['mainsnak'].get('datavalue')
            if datavalue is not None:
                inst.append(datavalue['value']['id'])
        insts.append(inst)
    return insts

A function for choosing the entities that are supposedly being compared, based on the largest number of coinciding "instances" (the classes of which an entity is an instance).

In [8]:
def get_best_pair(df_a, df_b):
    """Return the pair of candidate ids that share the most ``P31`` values.

    Both frames need an ``'entity'`` column. Every candidate of ``df_a`` is
    scored against every candidate of ``df_b`` by the number of instance ids
    they have in common; the first highest-scoring pair is returned as
    ``[id_a, id_b]``.
    """
    ent_a = df_a['entity'].unique()
    ent_b = df_b['entity'].unique()
    json_a = get_entities(ent_a)
    json_b = get_entities(ent_b)
    # NOTE(review): positions in these lists are assumed to line up with
    # ent_a / ent_b, i.e. the API returns entities in request order - confirm.
    insts_a = get_insts_from_json(json_a)
    insts_b = get_insts_from_json(json_b)

    n_a, n_b = ent_a.shape[0], ent_b.shape[0]
    conc_table = np.zeros((n_a, n_b))
    for i in range(n_a):
        for j in range(n_b):
            conc_table[i, j] = len(set(insts_a[i]) & set(insts_b[j]))

    best_a, best_b = np.unravel_index(np.argmax(conc_table), conc_table.shape)
    return [ent_a[best_a], ent_b[best_b]]

In [9]:
get_best_pair(df_a, df_b)

['Q28865', 'Q251']

Uniting the process of searching for entities, corresponding to the compared objects, using string names as input,  into one function. String names -> entity ids.

In [10]:
def strings2ids(obj_a, obj_b):
    """Map two object names to the best-matching pair of entity ids.

    Searches for both names, converts the responses to frames and returns
    the result of ``get_best_pair`` for them.
    """
    found_a = search_entities(obj_a)
    found_b = search_entities(obj_b)
    frame_a = res2df(found_a)
    frame_b = res2df(found_b)
    return get_best_pair(frame_a, frame_b)

In [11]:
strings2ids("python", "java")

['Q28865', 'Q251']

Let us try another approach based on coinciding pairs of property (entity relation) names.

In [12]:
def get_props_from_json(json):
    """List the claim property ids of every entity in a ``wbgetentities`` response.

    Returns one list of property ids per entity, in the iteration order of
    ``json['entities']``.
    """
    return [list(entity['claims'].keys()) for entity in json['entities'].values()]

In [13]:
# Collect the property ids of every candidate entity found for the first object.
ent_a = df_a['entity'].unique()
json_a = get_entities(ent_a)
props_a = get_props_from_json(json_a)

In [14]:
def get_best_pair(df_a, df_b, criteria='property'):
    """Pick the pair of candidate entities that share the most criteria values.

    Parameters
    ----------
    df_a, df_b : pandas.DataFrame
        Frames with an ``'entity'`` column of candidate ids.
    criteria : {'property', 'instance'}
        ``'property'`` compares the sets of claim property ids,
        ``'instance'`` compares the sets of ``P31`` value ids.

    Returns
    -------
    tuple
        ``(ent_id_a, ent_id_b, ent_info_a, ent_info_b)``. If the
        ``wbgetentities`` response for one side has no ``'entities'`` key,
        that side is returned as ``''`` with ``{'claims': {}}`` and the other
        side falls back to its first candidate.

    Raises
    ------
    ValueError
        If ``criteria`` is not a supported value (previously this surfaced
        later as an unbound-variable error).
    """
    ent_a = df_a['entity'].unique()
    ent_b = df_b['entity'].unique()
    json_a = get_entities(ent_a)
    json_b = get_entities(ent_b)
    if 'entities' not in json_a:
        if 'entities' not in json_b:
            print('No entities for objects a and b')
            return '', '', {'claims': {}}, {'claims': {}}
        else:
            print('No entities for object a')
            return '', ent_b[0], {'claims': {}}, json_b['entities'][ent_b[0]]
    elif 'entities' not in json_b:
        print('No entities for object b')
        return ent_a[0], '', json_a['entities'][ent_a[0]], {'claims': {}}
    if criteria == 'instance':
        cr_a = get_insts_from_json(json_a)
        cr_b = get_insts_from_json(json_b)
    elif criteria == 'property':
        cr_a = get_props_from_json(json_a)
        cr_b = get_props_from_json(json_b)
    else:
        raise ValueError(
            f"criteria must be 'instance' or 'property', got {criteria!r}"
        )

    # conc_table[i, j] = number of criteria values shared by candidate i of A
    # and candidate j of B.
    # NOTE(review): cr_a / cr_b are assumed to be ordered like ent_a / ent_b,
    # i.e. the API returns entities in request order - confirm.
    conc_table = np.zeros((ent_a.shape[0], ent_b.shape[0]))
    for ind, val in np.ndenumerate(conc_table):
        cur_cr_a = cr_a[ind[0]]
        cur_cr_b = cr_b[ind[1]]
        common_cr = list(set(cur_cr_a) & set(cur_cr_b))
        conc_table[ind] = len(common_cr)
    # argmax returns the first maximum, so ties go to the earlier candidates.
    pair = list(np.unravel_index(np.argmax(conc_table), conc_table.shape))
    ent_info_a = json_a['entities'][ent_a[pair[0]]]
    ent_info_b = json_b['entities'][ent_b[pair[1]]]
    return ent_a[pair[0]], ent_b[pair[1]], ent_info_a, ent_info_b

If we look at the concurrence tables (conc_table variable), we can see that when comparing instances the numbers are lower than when comparing properties, so outliers are more likely to mess up the choice of the entity pair.  

Conc_table for instances:  
[[2. 0. 0. 0. 0. 0. 0. 0. 0.]  
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]  
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]  
 [0. 0. 1. 0. 0. 0. 0. 0. 0.]  
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]  
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]  
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]  
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]  
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]  
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]]
   
Conc_table for properties:  
[[28.  9.  4. 15.  2.  3.  2.  2.  3.]  
 [ 4.  6.  2.  4.  2.  3. 12.  2.  4.]  
 [ 8. 13.  3.  5.  2.  4.  3.  2.  4.]  
 [ 3.  3.  4.  2.  1.  2.  2.  1.  2.]  
 [ 9.  3.  2. 11.  2.  2.  2.  2.  3.]  
 [ 4.  5.  2.  5.  2.  3.  3.  2.  4.]  
 [ 1.  2.  1.  1.  2.  1.  1.  1.  1.]  
 [ 2.  1.  1.  2.  1.  1.  1.  1.  1.]  
 [ 3.  3.  3.  3.  1.  3.  2.  1.  2.]  
 [ 2.  1.  1.  1.  1.  1.  1.  1.  1.]]  
  


In [15]:
ent_id_a, ent_id_b, ent_info_a, ent_info_b = get_best_pair(df_a, df_b, criteria='property')
ent_id_a, ent_id_b

('Q28865', 'Q251')

In [16]:
def strings2ids(obj_a, obj_b, criteria='property'):
    """Map two object names to the best-matching entity pair.

    Returns ``(ent_id_a, ent_id_b, ent_info_a, ent_info_b)`` as produced by
    ``get_best_pair`` with the given ``criteria``.
    """
    found_a = search_entities(obj_a)
    found_b = search_entities(obj_b)
    frame_a = res2df(found_a)
    frame_b = res2df(found_b)
    id_a, id_b, info_a, info_b = get_best_pair(frame_a, frame_b, criteria=criteria)
    return id_a, id_b, info_a, info_b

In [17]:
ent_id_a, ent_id_b, ent_info_a, ent_info_b = strings2ids("python", "java")
ent_id_a, ent_id_b

('Q28865', 'Q251')

## Retrieving aspects???

Based on coinciding properties.

In [43]:
def get_prop_names(ids, json):
    """Return the lower-cased English labels of the given property ids.

    Labels are read from ``json['entities'][id]['labels']['en']['value']``.
    Any label containing the substring ``" id"`` is dropped.
    """
    labels = (
        json['entities'][prop_id]['labels']['en']['value'].lower()
        for prop_id in ids
    )
    return [label for label in labels if " id" not in label]

In [44]:
def compare_props(ent_info_a, ent_info_b):
    """Name the properties two entities share and the ones only one of them has.

    Parameters
    ----------
    ent_info_a, ent_info_b : dict
        Entity records with a ``'claims'`` mapping keyed by property id.

    Returns
    -------
    tuple of (list of str, list of str)
        Names of the common properties and names of the properties present
        in exactly one of the two entities.
    """
    # Only the first 50 ids of each group are looked up.
    # NOTE(review): presumably the per-request id limit of the API - confirm.
    max_ids = 50

    props_a = list(ent_info_a['claims'].keys())
    props_b = list(ent_info_b['claims'].keys())

    shared = list(set(props_a) & set(props_b))[:max_ids]
    differing = list(set(props_a) ^ set(props_b))[:max_ids]

    shared_json = get_entities(shared)
    differing_json = get_entities(differing)
    cp_names = get_prop_names(shared, shared_json)
    up_names = get_prop_names(differing, differing_json)
    return cp_names, up_names

In [45]:
cp, up = compare_props(ent_info_a, ent_info_b)

In [46]:
cp

['signature',
 'languages spoken, written or signed',
 'owner of',
 'employer',
 'position held',
 'child',
 'commons gallery',
 'place of birth',
 'name in native language',
 'spouse',
 'mother',
 'image',
 'sex or gender',
 'country of citizenship',
 'religion',
 'present in work',
 "topic's main category",
 'occupation',
 'described by source',
 'date of birth',
 'libris-uri',
 'residence',
 'member of']

In [47]:
up

['lifestyle',
 'cause of death',
 'net worth',
 'medical condition',
 'work period (start)',
 'place of death',
 'height',
 'instagram username',
 'relative',
 'manner of death',
 'sport',
 'name in kana',
 'giphy username',
 'sibling',
 'official website',
 'twitter username',
 'last words',
 'erdős number']

In [48]:
ent_id_a, ent_id_b, ent_info_a, ent_info_b = strings2ids("Bill Gates", "Steve Jobs")
cp, up = compare_props(ent_info_a, ent_info_b)

In [49]:
cp

['signature',
 'languages spoken, written or signed',
 'owner of',
 'employer',
 'position held',
 'child',
 'commons gallery',
 'place of birth',
 'name in native language',
 'spouse',
 'mother',
 'image',
 'sex or gender',
 'country of citizenship',
 'religion',
 'present in work',
 "topic's main category",
 'occupation',
 'described by source',
 'date of birth',
 'libris-uri',
 'residence',
 'member of']

In [50]:
up

['lifestyle',
 'cause of death',
 'net worth',
 'medical condition',
 'work period (start)',
 'place of death',
 'height',
 'instagram username',
 'relative',
 'manner of death',
 'sport',
 'name in kana',
 'giphy username',
 'sibling',
 'official website',
 'twitter username',
 'last words',
 'erdős number']

### Check retrieving aspects on ACQuA dataset

In [52]:
# Load the ACQuA evaluation triples (tab-separated); the 'index' column becomes the row index.
acqua = pd.read_csv("ACQuA - CompArgs - Final Evaluation Triples CAM - v1_triples.tsv", delimiter='\t', index_col='index')
acqua.head()

Unnamed: 0_level_0,objectA,objectB,aspects,label,triple results,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,London,Paris,Population,BETTER,309,The city with the higher population is assumed...
2,Watermelon,Apple,sugar,BETTER,113,The fruite with less sugar is better.
3,A380,777-300ER,seats,BETTER,22,The plane with more seats is better.
4,coca-cola,orange juice,sugar,WORSE,30,The beverage with less sugar is better.
5,ruby,php,performance,WORSE,113,The language providing better performance on e...


In [53]:
acqua.shape

(40, 6)

In [54]:
# get_entities(res2df(search_entities("777-300ER"))['entity'].unique())

In [55]:
# For every ACQuA triple, count how many of its aspects appear among the
# names of the common / uncommon properties of the two matched entities.
found_in_common = []
found_in_uncommon = []
found_total = []

for index, row in acqua.iterrows():
    obj_a = row['objectA']
    obj_b = row['objectB']
    aspects= row['aspects'].split(", ")
    print(f"\n{obj_a}, {obj_b}, {aspects}")
    ent_id_a, ent_id_b, ent_info_a, ent_info_b = strings2ids(obj_a, obj_b)
    cp, up = compare_props(ent_info_a, ent_info_b)
    found_in_common.append(0)
    found_in_uncommon.append(0)
    found_total.append(0)
    for aspect in aspects:
        # Property names are lower-cased by get_prop_names, so normalise the
        # aspect the same way; otherwise e.g. 'Population' can never match.
        aspect = aspect.strip().lower()
        if aspect in cp:
            found_in_common[-1] += 1
        if aspect in up:
            found_in_uncommon[-1] += 1
    found_total[-1] = found_in_common[-1] + found_in_uncommon[-1]


London, Paris, ['Population']

Watermelon, Apple, ['sugar']

A380, 777-300ER, ['seats']
No entities for object b

coca-cola, orange juice, ['sugar']

ruby, php, ['performance']

erlang, java, ['performance']

induction, gas, ['boil']

android, ios, ['app quality']

yale, harvard, ['endowment']

hdmi, dvi, ['image quality']

pakistan, india, ['poverty']

twitter, facebook, ['virality']

vhs, betamax, ['picture quality']

japan, china, ['air pollution']

nickel, copper, ['melting point']

steel, titanium, ['melting point']

stone, wood, ['insulation']

wav, mp3, ['sound quality']

copper, bronze, ['harder']

cast iron, steel, ['conductivity']

running, cycling, ['calories']

earth, uranus, ['mass']

granite, marble, ['durable']

cat, dog, ['cost']

lead, silver, ['density']

rfid, nfc, ['range']

mexico, argentina, ['size']

mp3, wma, ['compression']

ccd, cmos, ['power']

fat32, ntfs, ['security']

chicken, turkey, ['protein']

raven, crow, ['size']

ntsc, pal, ['bandwidth']

dsl, cabl

In [56]:
# Attach the per-triple hit counts to the object / aspect columns.
results = acqua[['objectA', 'objectB', 'aspects']].assign(
    found_in_common=found_in_common,
    found_in_uncommon=found_in_uncommon,
    found_total=found_total,
)
results

Unnamed: 0_level_0,objectA,objectB,aspects,found_in_common,found_in_uncommon,found_total
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,London,Paris,Population,0,0,0
2,Watermelon,Apple,sugar,0,0,0
3,A380,777-300ER,seats,0,0,0
4,coca-cola,orange juice,sugar,0,0,0
5,ruby,php,performance,0,0,0
6,erlang,java,performance,0,0,0
7,induction,gas,boil,0,0,0
8,android,ios,app quality,0,0,0
9,yale,harvard,endowment,0,1,1
10,hdmi,dvi,image quality,0,0,0


### Check retrieving aspects on Elasticsearch

In [115]:
def request_elasticsearch(obj_a, obj_b=None, user='reader', password='reader'):
    """Query the depcc Elasticsearch index for sentences mentioning the objects.

    Parameters
    ----------
    obj_a : str
        Phrase searched in the ``text`` field.
    obj_b : str, optional
        Second phrase; when given, the query is ``text:"a" AND "b"``.
    user, password : str
        HTTP basic-auth credentials.

    Returns
    -------
    The ``requests`` response object (up to 10000 hits are requested).
    """
    size = 10000
    base = 'http://ltdemos.informatik.uni-hamburg.de/depcc-index/_search?q='
    if obj_b is None:
        query = f'text:"{obj_a}"'
    else:
        query = f'text:"{obj_a}"%20AND%20"{obj_b}"'
    url = base + query + f'&from=0&size={size}'
    return requests.get(url, auth=HTTPBasicAuth(user, password))

In [121]:
json_compl = request_elasticsearch('python')

In [122]:
def extract_sentences(es_json, aggregate_duplicates=False):
    """Extract lower-cased sentence texts from an Elasticsearch response.

    Parameters
    ----------
    es_json : object with a ``json()`` method
        Response whose decoded body holds the hits under ``['hits']['hits']``,
        each with a ``['_source']['text']`` string.
    aggregate_duplicates : bool
        If True, every hit is kept, repeats included. If False (default),
        only the first occurrence of each lower-cased text is kept.

    Returns
    -------
    list of str
        Lower-cased texts in hit order; ``[]`` if the body has no hits entry.
    """
    try:
        hits = es_json.json()['hits']['hits']
    except KeyError:
        return []
    sentences = []
    # A set gives constant-time duplicate checks; the list keeps hit order.
    seen = set()
    for hit in hits:
        text = hit['_source']['text'].lower()
        if aggregate_duplicates:
            sentences.append(text)
        elif text not in seen:
            seen.add(text)
            sentences.append(text)

    return sentences

In [123]:
all_sentences = extract_sentences(json_compl)

In [124]:
len(all_sentences)

6125

In [97]:
# Load the compared pairs (tab-separated) and keep only the two object-name columns.
pairs = pd.DataFrame(pd.read_csv("Compared_pairs.tsv", delimiter='\t')[['object_a', 'object_b']])
pairs.head()

Unnamed: 0,object_a,object_b
0,JavaScript,Perl
1,Windows 8,Windows 7
2,Scala,Java
3,Swift,Objective-C
4,PHP,Java


In [102]:
# For every compared pair: fetch the sentences mentioning both objects and
# count how often each common / uncommon property name occurs in them.
commons = []
uncommons = []
sent_nums = []

for index, row in pairs.iterrows():
    obj_a = row['object_a']
    obj_b = row['object_b']
    print(f"\n{index}: {obj_a}, {obj_b}")
    # The original call passed `name` and `password`, but `name` is not
    # defined anywhere in the visible notebook (NameError on a fresh kernel).
    # Rely on request_elasticsearch's default credentials instead.
    json_compl = request_elasticsearch(obj_a, obj_b)
    all_sentences = extract_sentences(json_compl)
    sent_nums.append(len(all_sentences))
    text = " ".join(all_sentences)
    ent_id_a, ent_id_b, ent_info_a, ent_info_b = strings2ids(obj_a, obj_b)
    cp, up = compare_props(ent_info_a, ent_info_b)
    common = {}
    uncommon = {}
    # NOTE(review): str.count matches substrings, so a short name such as
    # 'use' is also counted inside longer words - confirm this is intended.
    for asp in cp:
        common[asp] = text.count(asp)
    for asp in up:
        uncommon[asp] = text.count(asp)
    commons.append(common)
    uncommons.append(uncommon)


0: JavaScript, Perl

1: Windows 8, Windows 7

2: Scala, Java

3: Swift, Objective-C

4: PHP, Java

5: Apple TV, iOS

6: Ruby, Python

7: Java, Perl

8: Python, Java

9: Windows XP, Windows 98

10: JavaScript, PHP

11: Ethernet, USB

12: PostgreSQL, Oracle

13: MySQL, SQLite

14: Smalltalk, Java

15: Haskell, Java

16: Groovy, Java

17: Java, Lisp

18: PHP, Python

19: Python, Bash

20: Python, Perl

21: Eclipse, NetBeans

22: CUDA, OpenCL

23: MySQL, Oracle

24: Python, Lisp

25: Java, Scala

26: Ruby, PHP

27: JavaScript, Python

28: Ruby, Java

29: Sed, AWK

30: Windows Vista, Windows XP

31: Integer, Floating-point

32: OpenGL, Direct3D

33: Chrysler, Toyota

34: Toyota, Honda

35: Leica, Nikon

36: Nissan, Honda

37: Amazon, Microsoft

38: Apple, Google

39: Honda, Toyota

40: Nvidia, MSI

41: IBM, Hewlett-Packard

42: Apple, Microsoft

43: Apple, IBM

44: Nissan, Renault

45: Nokia, Motorola

46: Nikon, Sigma

47: Microsoft, IBM

48: Apple, Sony

49: Buick, Cadillac

50: Advil, M

In [103]:
# Persist the pair-level results. The context managers close each file even
# if pickle.dump raises.
with open('commons.pickle', "wb") as pickle_out:
    pickle.dump(commons, pickle_out)

with open('uncommons.pickle', "wb") as pickle_out:
    pickle.dump(uncommons, pickle_out)

with open('sent_nums.pickle', "wb") as pickle_out:
    pickle.dump(sent_nums, pickle_out)

In [None]:
# Reload the pair-level results written by the dump cell.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('commons.pickle', "rb") as pickle_in:
    commons = pickle.load(pickle_in)

with open('uncommons.pickle', "rb") as pickle_in:
    uncommons = pickle.load(pickle_in)

with open('sent_nums.pickle', "rb") as pickle_in:
    sent_nums = pickle.load(pickle_in)

In [104]:
# Print, for every compared pair, the occurrence counts of the common and
# uncommon property names.
for index, row in pairs.iterrows():
    obj_a = row['object_a']
    obj_b = row['object_b']
    print(f"\n{index}: {obj_a}, {obj_b}")
    # `index` is the frame's row label reused as a list position; this assumes
    # the default 0..n-1 index - TODO confirm.
    print(commons[index])
    print(uncommons[index])
    print('-' * 30)


0: JavaScript, Perl
{'designed by': 0, 'influenced by': 2, 'commons category': 0, 'omegawiki defined meaning': 0, 'media type': 0, 'instance of': 5, 'developer': 87, 'stack exchange tag': 0, "topic's main category": 0, 'inception': 0, 'subreddit': 0}
{'logo image': 0, 'official website': 0, 'free software directory entry': 0, 'creator': 0, 'wolfram language entity code': 0, 'ubuntu package': 0, 'use': 488, 'named after': 0, 'aur package': 0, 'arch linux package': 0, 'wikimedia outline': 0, 'typing discipline': 0, 'file extension': 0, 'copyright license': 0, 'opensuse package': 0, 'mascot': 0, 'different from': 1, "topic's main template": 0, 'gentoo package': 0, 'owned by': 0, 'based on': 16, 'programming paradigm': 0, 'native label': 0, 'spoken text audio': 0, 'freebsd port': 0, 'programming language': 174, 'debian stable package': 0, 'fedora package': 0, 'source code repository': 0, 'repology project name': 0, 'icon': 0, 'operating system': 5}
------------------------------

1: Windo

In [120]:
def prop_names(ent_info_a, ent_info_b):
    """Return the property names of each of the two entities separately.

    Only the first 50 property ids of each entity's ``'claims'`` are looked
    up. Returns ``(a_names, b_names)``.
    """
    ids_a = list(ent_info_a['claims'].keys())[:50]
    ids_b = list(ent_info_b['claims'].keys())[:50]

    labels_a = get_entities(ids_a)
    labels_b = get_entities(ids_b)
    a_names = get_prop_names(ids_a, labels_a)
    b_names = get_prop_names(ids_b, labels_b)
    return a_names, b_names

In [129]:
# For every compared pair: fetch the sentences mentioning each object on its
# own and count how often that object's property names occur in them.
counts_a = []
counts_b = []
sent_nums_a = []
sent_nums_b = []

for index, row in pairs.iterrows():
    obj_a = row['object_a']
    obj_b = row['object_b']
    print(f"\n{index}: {obj_a}, {obj_b}")
    json_compl_a = request_elasticsearch(obj_a)
    json_compl_b = request_elasticsearch(obj_b)
    all_sentences_a = extract_sentences(json_compl_a)
    all_sentences_b = extract_sentences(json_compl_b)
    sent_nums_a.append(len(all_sentences_a))
    sent_nums_b.append(len(all_sentences_b))
    # One long string per object so each property name can be counted in a single pass.
    text_a = " ".join(all_sentences_a)
    text_b = " ".join(all_sentences_b)
    ent_id_a, ent_id_b, ent_info_a, ent_info_b = strings2ids(obj_a, obj_b)
    a_props, b_props = prop_names(ent_info_a, ent_info_b)
    count_a = {}
    count_b = {}
    # NOTE(review): str.count matches substrings, so a short name such as
    # 'use' is also counted inside longer words - confirm this is intended.
    for asp in a_props:
        count_a[asp] = text_a.count(asp)
    for asp in b_props:
        count_b[asp] = text_b.count(asp)
    counts_a.append(count_a)
    counts_b.append(count_b)


0: JavaScript, Perl

1: Windows 8, Windows 7

2: Scala, Java

3: Swift, Objective-C

4: PHP, Java

5: Apple TV, iOS

6: Ruby, Python

7: Java, Perl

8: Python, Java

9: Windows XP, Windows 98

10: JavaScript, PHP

11: Ethernet, USB

12: PostgreSQL, Oracle

13: MySQL, SQLite

14: Smalltalk, Java

15: Haskell, Java

16: Groovy, Java

17: Java, Lisp

18: PHP, Python

19: Python, Bash

20: Python, Perl

21: Eclipse, NetBeans

22: CUDA, OpenCL

23: MySQL, Oracle

24: Python, Lisp

25: Java, Scala

26: Ruby, PHP

27: JavaScript, Python

28: Ruby, Java

29: Sed, AWK

30: Windows Vista, Windows XP

31: Integer, Floating-point

32: OpenGL, Direct3D

33: Chrysler, Toyota

34: Toyota, Honda

35: Leica, Nikon

36: Nissan, Honda

37: Amazon, Microsoft

38: Apple, Google

39: Honda, Toyota

40: Nvidia, MSI

41: IBM, Hewlett-Packard

42: Apple, Microsoft

43: Apple, IBM

44: Nissan, Renault

45: Nokia, Motorola

46: Nikon, Sigma

47: Microsoft, IBM

48: Apple, Sony

49: Buick, Cadillac

50: Advil, M

In [130]:
# Persist the per-object results. The context managers close each file even
# if pickle.dump raises.
with open('counts_a.pickle', "wb") as pickle_out:
    pickle.dump(counts_a, pickle_out)

with open('counts_b.pickle', "wb") as pickle_out:
    pickle.dump(counts_b, pickle_out)

with open('sent_nums_a.pickle', "wb") as pickle_out:
    pickle.dump(sent_nums_a, pickle_out)

with open('sent_nums_b.pickle', "wb") as pickle_out:
    pickle.dump(sent_nums_b, pickle_out)

In [131]:
# Print, for every compared pair, the per-object occurrence counts of the
# property names.
for index, row in pairs.iterrows():
    obj_a = row['object_a']
    obj_b = row['object_b']
    print(f"\n{index}: {obj_a}, {obj_b}")
    print(f"{obj_a}:")
    # `index` is the frame's row label reused as a list position; this assumes
    # the default 0..n-1 index - TODO confirm.
    print(counts_a[index])
    print(f"{obj_b}:")
    print(counts_b[index])
    print('-' * 30)


0: JavaScript, Perl
JavaScript:
{'stack exchange tag': 0, 'instance of': 0, 'influenced by': 0, 'designed by': 0, 'commons category': 0, "topic's main category": 0, 'media type': 2, 'owned by': 0, 'inception': 0, 'based on': 4, "topic's main template": 0, 'developer': 67, 'subreddit': 0, 'use': 271, 'programming paradigm': 0, 'different from': 1, 'icon': 10, 'omegawiki defined meaning': 0, 'named after': 0, 'wolfram language entity code': 0, 'typing discipline': 0, 'spoken text audio': 0, 'logo image': 0}
Perl:
{'file extension': 1, 'instance of': 0, 'commons category': 0, "topic's main category": 0, 'official website': 0, 'developer': 36, 'stack exchange tag': 0, 'mascot': 0, 'influenced by': 0, 'programming language': 9, 'copyright license': 0, 'inception': 0, 'creator': 3, 'source code repository': 0, 'free software directory entry': 0, 'designed by': 0, 'debian stable package': 0, 'ubuntu package': 0, 'fedora package': 0, 'arch linux package': 0, 'gentoo package': 0, 'wikimedia ou

## Dividing uncommon aspects into uncommon aspects for object A and object B separately

Recap of the previous functions in one cell

In [56]:
def search_entities(item, limit=10, n_retries=10):
    """Search Wikidata for entities matching a name, retrying on failure.

    The lower-cased name is sent to the Wikidata SPARQL endpoint in a query
    that calls the MediaWiki ``EntitySearch`` API and joins every hit with
    its ``wdt:P31`` values.

    Parameters
    ----------
    item : str
        Name to search for.
    limit : int
        Value passed as ``mwapi:limit``.
    n_retries : int
        Extra attempts made after a non-200 response or an empty result.

    Returns
    -------
    dict
        The decoded response of the last attempt that returned status 200,
        or ``{'results': {'bindings': []}}`` if the final attempt did not
        return status 200 (or no attempt was made).
    """
    item = item.lower()
    # Escape backslashes and double quotes so the name cannot terminate the
    # SPARQL string literal early and break (or alter) the query.
    safe_item = item.replace("\\", "\\\\").replace('"', '\\"')
    query_string = f"""
    SELECT * WHERE {{
      ?item wdt:P31 ?instance
      SERVICE wikibase:mwapi {{
        bd:serviceParam wikibase:api "EntitySearch" .
        bd:serviceParam wikibase:endpoint "www.wikidata.org" .
        bd:serviceParam mwapi:search "{safe_item}" .
        bd:serviceParam mwapi:language "en" .
        bd:serviceParam mwapi:uselang "en" .
        bd:serviceParam mwapi:limit {limit} .
        ?item wikibase:apiOutputItem mwapi:item .

        ?num wikibase:apiOrdinal true.
      }}
    }} ORDER BY ASC (?num)
    """
    no_results = {'results': {'bindings': []}}
    # Pre-set so the function still returns a well-formed value when the loop
    # body never runs (n_retries < 0).
    res = no_results
    for retry in range(n_retries + 1):
        if retry > 0:
            print(f"retrying: attempt {retry}")
        response = requests.get("https://query.wikidata.org/sparql", params={"query": query_string, "format": "json"})
        if response.status_code != 200:
            print(f"Response {response.status_code} for {item}.")
            # If this was the last attempt, the empty result is what gets returned.
            res = no_results
            continue
        res = response.json()
        if not res["results"]["bindings"]:
            print(f"Zero entities for {item}.")
            continue
        if retry > 0:
            print("ok")
        break
    return res

def res2df(res):
    """Flatten a search response into an (entity, instance, num) data frame.

    ``res["results"]["bindings"]`` must be a list of rows with ``"item"``,
    ``"instance"`` and ``"num"`` entries, each holding a ``"value"``. The
    frame has one row per binding: ``entity`` and ``instance`` are the last
    path segment of the respective URI, ``num`` is taken verbatim. An empty
    response yields an empty frame with the same columns.
    """
    # Collect all rows first and build the frame once: DataFrame.append copied
    # the whole frame on every call and no longer exists in pandas 2.x.
    records = [
        {
            'entity': row["item"]["value"].split('/')[-1],
            'instance': row["instance"]["value"].split('/')[-1],
            'num': row["num"]["value"],
        }
        for row in res["results"]["bindings"]
    ]
    return pd.DataFrame(records, columns=['entity', 'instance', 'num'])

def get_entities(entities):
    """Fetch entity records from the Wikidata ``wbgetentities`` API.

    ``entities`` is an iterable of id strings; they are joined with ``|``
    into the ``ids`` parameter. Returns the decoded JSON body.
    """
    ids = "|".join(entities)
    url = f"https://www.wikidata.org/w/api.php?action=wbgetentities&ids={ids}&languages=en&format=json"
    return requests.get(url).json()

def get_insts_from_json(json):
    """Collect the ``P31`` value ids of every entity in a ``wbgetentities`` response.

    Returns one list per entity, in the iteration order of
    ``json['entities']``. An entity without ``claims`` or without ``P31``
    contributes an empty list, so positions stay aligned with the entities.
    """
    insts = []
    for val in json['entities'].values():
        statements = val.get('claims', {}).get('P31', [])
        inst = []
        for statement in statements:
            # Statements without a concrete value carry no 'datavalue';
            # skip them instead of failing with KeyError.
            datavalue = statement['mainsnak'].get('datavalue')
            if datavalue is not None:
                inst.append(datavalue['value']['id'])
        insts.append(inst)
    return insts

def get_props_from_json(json):
    """List the claim property ids of every entity in a ``wbgetentities`` response.

    Returns one list of property ids per entity, in the iteration order of
    ``json['entities']``.
    """
    return [list(entity['claims'].keys()) for entity in json['entities'].values()]

def get_best_pair(df_a, df_b, criteria='property'):
    """Pick the pair of candidate entities that share the most criteria values.

    Parameters
    ----------
    df_a, df_b : pandas.DataFrame
        Frames with an ``'entity'`` column of candidate ids.
    criteria : {'property', 'instance'}
        ``'property'`` compares the sets of claim property ids,
        ``'instance'`` compares the sets of ``P31`` value ids.

    Returns
    -------
    tuple
        ``(ent_id_a, ent_id_b, ent_info_a, ent_info_b)``. If the
        ``wbgetentities`` response for one side has no ``'entities'`` key,
        that side is returned as ``''`` with ``{'claims': {}}`` and the other
        side falls back to its first candidate.

    Raises
    ------
    ValueError
        If ``criteria`` is not a supported value (previously this surfaced
        later as an unbound-variable error).
    """
    ent_a = df_a['entity'].unique()
    ent_b = df_b['entity'].unique()
    json_a = get_entities(ent_a)
    json_b = get_entities(ent_b)
    if 'entities' not in json_a:
        if 'entities' not in json_b:
            print('No entities for objects a and b')
            return '', '', {'claims': {}}, {'claims': {}}
        else:
            print('No entities for object a')
            return '', ent_b[0], {'claims': {}}, json_b['entities'][ent_b[0]]
    elif 'entities' not in json_b:
        print('No entities for object b')
        return ent_a[0], '', json_a['entities'][ent_a[0]], {'claims': {}}
    if criteria == 'instance':
        cr_a = get_insts_from_json(json_a)
        cr_b = get_insts_from_json(json_b)
    elif criteria == 'property':
        cr_a = get_props_from_json(json_a)
        cr_b = get_props_from_json(json_b)
    else:
        raise ValueError(
            f"criteria must be 'instance' or 'property', got {criteria!r}"
        )

    # conc_table[i, j] = number of criteria values shared by candidate i of A
    # and candidate j of B.
    # NOTE(review): cr_a / cr_b are assumed to be ordered like ent_a / ent_b,
    # i.e. the API returns entities in request order - confirm.
    conc_table = np.zeros((ent_a.shape[0], ent_b.shape[0]))
    for ind, val in np.ndenumerate(conc_table):
        cur_cr_a = cr_a[ind[0]]
        cur_cr_b = cr_b[ind[1]]
        common_cr = list(set(cur_cr_a) & set(cur_cr_b))
        conc_table[ind] = len(common_cr)
    # argmax returns the first maximum, so ties go to the earlier candidates.
    pair = list(np.unravel_index(np.argmax(conc_table), conc_table.shape))
    ent_info_a = json_a['entities'][ent_a[pair[0]]]
    ent_info_b = json_b['entities'][ent_b[pair[1]]]
    return ent_a[pair[0]], ent_b[pair[1]], ent_info_a, ent_info_b

def strings2ids(obj_a, obj_b, criteria='property', n_retries=10):
    """Map two object names to the best-matching entity pair.

    Each name is searched in turn (first ``obj_a``, then ``obj_b``); a search
    that comes back with no bindings is repeated up to ``n_retries`` times.
    Returns ``(ent_id_a, ent_id_b, ent_info_a, ent_info_b)`` as produced by
    ``get_best_pair`` with the given ``criteria``.
    """
    responses = []
    for obj in (obj_a, obj_b):
        found = search_entities(obj)
        if not found["results"]["bindings"]:
            print(f"zero entities for {obj}: retrying")
            for retry in range(n_retries):
                print(f"Attempt {retry + 1}")
                found = search_entities(obj)
                if found["results"]["bindings"]:
                    print("ok")
                    break
        responses.append(found)

    frame_a = res2df(responses[0])
    frame_b = res2df(responses[1])
    id_a, id_b, info_a, info_b = get_best_pair(frame_a, frame_b, criteria=criteria)
    return id_a, id_b, info_a, info_b

def get_prop_names(ids, json):
    """Return the lower-cased English labels of the given property ids.

    Labels are read from ``json['entities'][id]['labels']['en']['value']``.
    Any label containing the substring ``" id"`` is dropped.
    """
    labels = (
        json['entities'][prop_id]['labels']['en']['value'].lower()
        for prop_id in ids
    )
    return [label for label in labels if " id" not in label]

In [94]:
res = search_entities("Python")

Zero entities for python.
retrying: attempt 1
ok


Uncommon props -> original for A, original for B

In [3]:
def compare_props(ent_info_a, ent_info_b, prop_filter=lambda p: True):
    """Name the shared properties and the properties unique to each entity.

    Parameters
    ----------
    ent_info_a, ent_info_b : dict
        Entity records with a ``'claims'`` mapping keyed by property id.
    prop_filter : callable
        Called with the claim value stored under each property id; the
        property is kept only when the call returns a truthy value.

    Returns
    -------
    tuple of three lists of str
        Names of the common properties, of those only in A, and of those
        only in B. Only the first 50 ids of each group are looked up.
    """
    claims_a = ent_info_a['claims']
    props_a = [k for k in claims_a.keys() if prop_filter(claims_a[k])]

    claims_b = ent_info_b['claims']
    props_b = [k for k in claims_b.keys() if prop_filter(claims_b[k])]

    common_props = list(set(props_a) & set(props_b))
    only_a = list(set(props_a) - set(common_props))[:50]
    only_b = list(set(props_b) - set(common_props))[:50]
    shared = common_props[:50]

    # All three lookups are issued before any label is read.
    shared_json = get_entities(shared)
    only_a_json = get_entities(only_a)
    only_b_json = get_entities(only_b)
    cp_names = get_prop_names(shared, shared_json)
    up_names_a = get_prop_names(only_a, only_a_json)
    up_names_b = get_prop_names(only_b, only_b_json)
    return cp_names, up_names_a, up_names_b

In [56]:
ent_id_a, ent_id_b, ent_info_a, ent_info_b = strings2ids("Bill Gates", "Steve Jobs")
cp, up_a, up_b = compare_props(ent_info_a, ent_info_b)

In [57]:
cp

['described by source',
 'spouse',
 'sex or gender',
 'commons category',
 'commons gallery',
 'name in native language',
 'languages spoken, written or signed',
 'mother',
 'given name',
 'owner of',
 "topic's main category",
 'position held',
 'residence',
 'child',
 'birth name',
 'present in work',
 'educated at',
 'father',
 'signature',
 'place of birth',
 'ethnic group']

In [58]:
up_a

['instagram username',
 'twitter username',
 'handedness',
 'writing language',
 'ria novosti reference',
 'vehicle normally used',
 'work period (start)',
 'british museum person-institution',
 'reddit username',
 'name in kana',
 'height',
 'official website',
 'sport',
 'net worth',
 'erdős number']

In [59]:
up_b

['place of burial',
 'medical condition',
 'cause of death',
 'giphy username',
 'last words',
 'place of death',
 'date of death',
 'unmarried partner',
 'lifestyle',
 'relative',
 'member of political party',
 'sibling',
 'manner of death']

## Filtering
Currently filtering is done only using the "id" substring in the names of properties.  
A better way is to filter properties by their type. For example, id properties have the type "external-id" or "string". I have not found any aspect-like properties of type "string", so I decided to filter out all the properties of this type. Other property types can be found here:  
https://www.wikidata.org/wiki/Wikidata:Database_reports/List_of_properties/all

In [4]:
# Filtering removed from this function
def get_prop_names(ids, json):
    """Return the lower-cased English label of every given property id.

    Labels are read from ``json['entities'][id]['labels']['en']['value']``;
    no label is filtered out.
    """
    return [
        json['entities'][prop_id]['labels']['en']['value'].lower()
        for prop_id in ids
    ]

# Filtering function
def prop_filter(prop):
    """Tell whether a property should be kept, judging by its datatype.

    ``prop`` is the sequence of claims recorded for one property; only the
    first claim's ``mainsnak.datatype`` is inspected.

    Returns ``False`` when that datatype is one of the excluded ones listed
    below, and ``True`` otherwise.
    """
    excluded_datatypes = (
        "external-id",
        "commonsMedia",
        "url",
        "globe-coordinate",
        "wikibase-sense",
        "wikibase-property",
        "wikibase-lexeme",
        "string",
    )
    first_datatype = prop[0]['mainsnak']['datatype']
    return first_datatype not in excluded_datatypes

In [62]:
ent_id_a, ent_id_b, ent_info_a, ent_info_b = strings2ids("Bill Gates", "Steve Jobs")

Results without filtering

In [63]:
cp, up_a, up_b = compare_props(ent_info_a, ent_info_b)

In [64]:
cp

['described by source',
 'nkcr aut id',
 'library of congress authority id',
 'quora topic id',
 'spouse',
 'encyclopædia universalis id',
 'cinii author id (books)',
 'sex or gender',
 'geni.com profile id',
 'comic vine id',
 'national diet library id',
 'commons category',
 'open library id',
 'gtaa id',
 'commons gallery',
 'name in native language',
 'languages spoken, written or signed',
 'bibliothèque nationale de france id',
 'conor id',
 'mother',
 'selibr id',
 'spanish vikidia id',
 'given name',
 'cultureel woordenboek id',
 'owner of',
 'freebase id',
 'nationale thesaurus voor auteurs id',
 'snac ark id',
 'nukat id',
 "topic's main category",
 'position held',
 'imdb id',
 'residence',
 'child',
 'birth name',
 'biblioteca nacional de españa id',
 'viaf id',
 'the times of india topic id',
 'wikitree person id',
 'present in work',
 'educated at',
 'idref id',
 'father',
 'libraries australia id',
 'giant bomb id',
 'munzinger person id',
 'international standard name id

In [65]:
up_a

['instagram username',
 'allociné person id',
 'twitter username',
 'dblp id',
 'openmlol author id',
 'handedness',
 'national library of russia id',
 'writing language',
 'ria novosti reference',
 'vehicle normally used',
 'port person id',
 'work period (start)',
 'kinopoisk person id',
 'linkedin personal profile id',
 'littlesis people id',
 'youtube channel id',
 'klexikon article id',
 'british museum person-institution',
 'čsfd person id',
 'new york times topic id',
 'reddit username',
 'great russian encyclopedia online id',
 'wsj topic id',
 'egaxa id',
 'isfdb author id',
 'name in kana',
 'c-span person id',
 'sina weibo user id',
 'facebook id',
 'height',
 'politifact people and groups id',
 'bnb person id',
 'official website',
 'dagens nyheter topic id',
 'swedish film database person id',
 'sport',
 'net worth',
 'genealogics.org person id',
 'erdős number',
 'abart person id',
 'national portrait gallery (london) person id',
 'share catalogue author id',
 'university

In [66]:
up_b

['museum of modern art artist id',
 'ne.se id',
 'english vikidia id',
 'know your meme id',
 'place of burial',
 'medical condition',
 'pm20 folder id',
 'cause of death',
 'giphy username',
 'last words',
 'place of death',
 'date of death',
 'unmarried partner',
 'worldcat identities id',
 'crunchbase person id',
 'lifestyle',
 'disney a to z id',
 'sbn author id',
 'relative',
 'member of political party',
 'gran enciclopèdia catalana id',
 'cantic id',
 'sibling',
 'justia patents inventor id',
 'portuguese national library id',
 'national inventors hall of fame id',
 'bna authority id',
 'treccani id',
 'find a grave memorial id',
 'manner of death']

Results after filtering

In [72]:
# NOTE(review): this call unpacks three values, but the compare_props
# definition further down in this notebook (the one with a prop_filter
# parameter) returns four. Presumably this cell was executed against an
# intermediate three-value version that is no longer in the notebook --
# confirm that it still runs under Restart Kernel -> Run All.
cp, up_a, up_b = compare_props(ent_info_a, ent_info_b, prop_filter=prop_filter)

In [73]:
cp

['described by source',
 'given name',
 'present in work',
 'owner of',
 'instance of',
 'spouse',
 'educated at',
 "topic's main category",
 'sex or gender',
 'award received',
 'position held',
 'father',
 'occupation',
 'family name',
 'residence',
 'child',
 'birth name',
 'place of birth',
 'ethnic group',
 'name in native language',
 'country of citizenship',
 'languages spoken, written or signed',
 'member of',
 'employer',
 'date of birth',
 'religion',
 'mother']

In [74]:
up_a

['handedness',
 'sport',
 'net worth',
 'writing language',
 'erdős number',
 'height',
 'vehicle normally used',
 'work period (start)']

In [75]:
up_b

['relative',
 'member of political party',
 'last words',
 'place of death',
 'sibling',
 'place of burial',
 'unmarried partner',
 'medical condition',
 'date of death',
 'manner of death',
 'lifestyle',
 'cause of death']

## Further aspect splitting:
- a list of aspects with matching names + matching values
- a list of aspects with matching names + non-matching values
- for each object: a list of aspects which don’t exist for the other object

common properties -> matching values, non-matching values

In [5]:
def get_values(ent_info, prop_name):
    """Collect the item ids that an entity holds for one property.

    Parameters
    ----------
    ent_info : dict
        Entity record with a ``'claims'`` mapping of property id -> claims.
    prop_name : str
        Property id to read; must be a key of ``ent_info['claims']``.

    Returns
    -------
    list of str
        The ``id`` of every claim value whose datatype is ``'wikibase-item'``,
        in claim order. Claims of any other datatype (time, quantity, ...)
        are ignored, so the result is empty for such properties.
    """
    vals = []
    for claim in ent_info['claims'][prop_name]:
        mainsnak = claim['mainsnak']
        if mainsnak['datatype'] != 'wikibase-item':
            # Only item-valued claims are compared; everything else is ignored.
            continue
        # "unknown value" / "no value" statements carry no 'datavalue' key at
        # all, so indexing it directly would raise KeyError.
        datavalue = mainsnak.get('datavalue')
        if datavalue is None:
            continue
        vals.append(datavalue['value']['id'])
    return vals

def compare_props(ent_info_a, ent_info_b, prop_filter=lambda p: True):
    """Split the properties of two entities into four named groups.

    Parameters
    ----------
    ent_info_a, ent_info_b : dict
        Entity records with a ``'claims'`` mapping of property id -> claims.
    prop_filter : callable, optional
        Called with the claims list of each property; the property is kept
        only when it returns a truthy value. By default everything is kept.

    Returns
    -------
    tuple of four lists of str
        ``(mv_names, nmv_names, up_names_a, up_names_b)``:
        names of common properties whose item values overlap, names of common
        properties whose item values do not overlap, names of properties only
        entity a has, and names of properties only entity b has.

    Notes
    -----
    * Values are compared through ``get_values``; a common property for which
      it returns nothing on either side therefore lands in the
      "non-matching" group.
    * Each group is cut to its first 50 property ids before the name lookup
      (presumably the per-request id limit of the API behind
      ``get_entities`` -- TODO confirm).
    * Groups keep the key order of the entities' ``'claims'`` mappings, so the
      output (and which ids survive the cut) is the same on every run.
    """
    max_lookup = 50

    def accepted_props(ent_info):
        # Property ids whose claims pass the caller-supplied filter.
        claims = ent_info['claims']
        return [prop_id for prop_id in claims if prop_filter(claims[prop_id])]

    def lookup_names(prop_ids):
        # Resolve ids to names; nothing is requested for an empty group.
        prop_ids = prop_ids[:max_lookup]
        if not prop_ids:
            return []
        return get_prop_names(prop_ids, get_entities(prop_ids))

    props_a = accepted_props(ent_info_a)
    props_b = accepted_props(ent_info_b)

    in_b = set(props_b)
    common_props = [prop_id for prop_id in props_a if prop_id in in_b]
    in_common = set(common_props)

    m_vals = []
    nm_vals = []
    for common_prop in common_props:
        vals_a = get_values(ent_info_a, common_prop)
        vals_b = get_values(ent_info_b, common_prop)
        if set(vals_a) & set(vals_b):
            m_vals.append(common_prop)
        else:
            nm_vals.append(common_prop)

    uncommon_props_a = [prop_id for prop_id in props_a if prop_id not in in_common]
    uncommon_props_b = [prop_id for prop_id in props_b if prop_id not in in_common]

    mv_names = lookup_names(m_vals)
    nmv_names = lookup_names(nm_vals)
    up_names_a = lookup_names(uncommon_props_a)
    up_names_b = lookup_names(uncommon_props_b)
    return mv_names, nmv_names, up_names_a, up_names_b

In [101]:
obj_a = "Python"
obj_b = "Java"

In [104]:
ent_id_a, ent_id_b, ent_info_a, ent_info_b = strings2ids(obj_a, obj_b)

In [105]:
mv, nmv, up_a, up_b = compare_props(ent_info_a, ent_info_b, prop_filter=prop_filter)

In [106]:
mv

['influenced by', 'programming paradigm', 'instance of']

In [107]:
nmv

['inception',
 'copyright license',
 'developer',
 "topic's main category",
 'typing discipline',
 'designed by',
 'different from',
 'named after']

In [108]:
up_a

["topic's main template",
 'has part',
 'history of topic',
 'operating system',
 'movement',
 'use',
 'programming language']

In [109]:
up_b

['has quality',
 'described by source',
 'country of origin',
 'subclass of',
 'part of',
 'mascot']

Printing the results of aspect search as one string:

In [118]:
def print_findings(obj_a, obj_b, mv, nmv, up_a, up_b):
    """Print a plain-text report of the aspect comparison of two objects.

    ``obj_a`` / ``obj_b`` are the object names shown in the header; ``mv``,
    ``nmv``, ``up_a`` and ``up_b`` are lists of property names that are
    joined with ", " under their respective headings.
    """
    rule = '-' * 50
    print(rule, f"{obj_a} vs {obj_b}", rule, sep='\n')

    sections = (
        ("Common properties with matching values:", mv),
        ("Common properties with non-matching values:", nmv),
        (f"Original properties for {obj_a}:", up_a),
        (f"Original properties for {obj_b}:", up_b),
    )
    for heading, names in sections:
        joined = ', '.join(names)
        print(f"{heading}\n{joined}.\n")

In [119]:
print_findings(obj_a, obj_b, mv, nmv, up_a, up_b)

--------------------------------------------------
Python vs Java
--------------------------------------------------
Common properties with matching values:
influenced by, programming paradigm, instance of.

Common properties with non-matching values:
inception, copyright license, developer, topic's main category, typing discipline, designed by, different from, named after.

Original properties for Python:
topic's main template, has part, history of topic, operating system, movement, use, programming language.

Original properties for Java:
has quality, described by source, country of origin, subclass of, part of, mascot.



# Ranking of Wikidata aspects

In [191]:
import json

# 'mined_bow_str.json' is parsed line by line: every non-blank line must be a
# complete JSON document of its own.
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding.
with open('mined_bow_str.json', 'r', encoding='utf-8') as f:
    contents = f.readlines()

# Blank lines (e.g. a trailing newline at the end of the file) are skipped
# instead of being handed to json.loads, which would raise on them.
data = [json.loads(line) for line in contents if line.strip()]

In [192]:
len(data)

539

In [198]:
num = 192

data[num]['object1']['name'], data[num]['object2']['name']

('chevrolet', 'ford')

## Using Elasticsearch

In [95]:
def request_elasticsearch(objects, user='reader', password='reader', size=10000, timeout=None):
    """Query the depcc-index search endpoint for text containing all objects.

    Parameters
    ----------
    objects : iterable of str
        Phrases to search for; each one is quoted and they are combined with
        ``AND`` in a ``q=text:...`` query string.
    user, password : str, optional
        HTTP basic-auth credentials sent with the request.
    size : int, optional
        Value of the ``size`` query parameter (number of hits requested);
        defaults to the previously hard-coded 10000.
    timeout : float or tuple or None, optional
        Passed through to ``requests.get``. ``None`` (the default) keeps the
        previous behaviour of waiting indefinitely.

    Returns
    -------
    requests.Response
        The raw response; call ``.json()`` on it to get the parsed hits.
    """
    from urllib.parse import quote

    # NOTE(review): the endpoint is plain HTTP, so the basic-auth credentials
    # travel unencrypted.
    url = 'http://ltdemos.informatik.uni-hamburg.de/depcc-index/_search?q=text:"'

    encoded_objects = []
    for obj in objects:
        # Backslash-escape characters that would end the quoted phrase early,
        # then percent-encode so '&', '#', '"' etc. cannot alter the URL.
        escaped = obj.replace('\\', '\\\\').replace('"', '\\"')
        encoded_objects.append(quote(escaped, safe=''))

    url += '"%20AND%20"'.join(encoded_objects)
    url += '"&from=0&size={}'.format(size)

    response = requests.get(url, auth=HTTPBasicAuth(user, password), timeout=timeout)
    return response

In [97]:
request_elasticsearch(["python", "java"]).json()

{'took': 41031,
 'timed_out': False,
 '_shards': {'total': 58, 'successful': 58, 'skipped': 0, 'failed': 0},
 'hits': {'total': 316150,
  'max_score': 25.914246,
  'hits': [{'_index': 'depcc',
    '_type': 'text',
    '_id': 'S-3-DmcBYGo7kjl35x1m',
    '_score': 25.914246,
    '_source': {'sentence_hash': 419696394,
     'document_id': 'http://www.files32.com/Solar-Cells.asp',
     'insert_id': '',
     'text': 'Software Terms:\xa0 Java, Programming, Source Code, Java Programming, java tutorial, Python, python tutorial, python examples, java examples, python help .',
     'sentence_id': 80}},
   {'_index': 'depcc',
    '_type': 'text',
    '_id': '_FrOFGcBfM0fMjkQuh5d',
    '_score': 25.905891,
    '_source': {'sentence_hash': 419696394,
     'document_id': 'http://www.files32.com/Solar-Cells.asp',
     'insert_id': '',
     'text': 'Software Terms:\xa0 Java, Programming, Source Code, Java Programming, java tutorial, Python, python tutorial, python examples, java examples, python help 

### Aspect NAMES for a pair of objects