#### 1. Importing Libraries

In [1]:
import json
from qwikidata.entity import WikidataItem, WikidataLexeme, WikidataProperty
from qwikidata.linked_data_interface import get_entity_dict_from_api
import json
import time

#### 2. Reading Monument List

In [3]:
# Load every Wikidata dump partition (files 1..119) into one flat list.
# extend() appends in place; the previous `a = a + b` form re-copied the whole
# accumulated list for every partition (quadratic in the number of monuments).
monument_list = []
for i in range(1, 120):
    with open('../Data/Wikidata_JSON/en_monument_dump_part'+str(i)+'.json') as f:
        monument_list.extend(json.load(f))

# Index the monuments by their 'id' field for O(1) lookup in later cells.
# If an id occurs more than once, the last occurrence wins (same as before).
monument_dict = {monument['id']: monument for monument in monument_list}

# Merge the English wiki-page partitions (files 1..57) into a single dict.
# Later cells use its keys to look monuments up in monument_dict.
wikipage_monument_dict = {}
for i in range(1, 58):
    with open('../Data/English_Wikipages/en_articles_part'+str(i)+'.json') as f:
        wikipage_monument_dict.update(json.load(f))

#### 3. Retrieving Labels for Property Value pairs for each monument

In [25]:
# Labelled output for the current batch: monument id -> {property label: [values]}.
# It is written to disk and emptied every 20 monuments (see the checkpoint below).
labelled_monument_list = {}
# Cache of English labels keyed by Wikidata id; shared by properties and by
# value entities so that each id is fetched from the API at most once.
label_list = {}
# Properties that could not be labelled (per the original note: properties that
# have no Wikidata page). They are skipped without issuing another API call.
non_labelled_props = ['P727']

t0 = time.time()

index = 0

# The first 1420 keys are skipped and the counter starts at the same value, so
# checkpoint numbers continue from there (1440 // 20 == 72).
# NOTE(review): presumably this resumes an earlier, interrupted run - confirm.
count_monuments = 1420

for monument_id in list(wikipage_monument_dict.keys())[count_monuments:]:

    monument = monument_dict[monument_id]

    monument_labelled_prop_val = {}
    list_prop_value = monument['claims']

    # Build the list of usable properties as a NEW list. The previous version
    # aliased the list (`copy = original`) and removed entries from it while
    # iterating over it, which made the loop skip the property that followed
    # every failed lookup; that property then had no entry in label_list and
    # could raise KeyError further down.
    list_properties = []
    for prop in list_prop_value.keys():
        if prop in non_labelled_props:
            continue
        if prop not in label_list:
            try:
                prop_details = get_entity_dict_from_api(prop)
                label_list[prop] = prop_details['labels']['en']['value']
            except Exception:
                # Best effort: any lookup failure (or a missing English label)
                # marks the property as unlabelled for the rest of the run.
                # `Exception` instead of a bare except keeps Ctrl-C working.
                non_labelled_props.append(prop)
                continue
        list_properties.append(prop)

    # For all values per property, resolve value IDs [Q##### format] to labels.
    for prop in list_properties:

        labelled_values = []

        for value in list_prop_value[prop]:

            mainsnak = value['mainsnak']

            # Only snaks of type 'value' are read; other snak types are skipped.
            if mainsnak['snaktype'] != 'value':
                continue

            datavalue = mainsnak['datavalue']['value']

            # Plain string values are stored as they are.
            if isinstance(datavalue, str):
                labelled_values.append(datavalue)

            elif isinstance(datavalue, dict):

                # A dict with an 'id' refers to another entity: store its label.
                if 'id' in datavalue:

                    value_id = datavalue['id']

                    # NOTE: a failing lookup here is not caught and stops the
                    # run; monuments processed since the last checkpoint are
                    # then only available in memory.
                    if value_id not in label_list:
                        value_details = get_entity_dict_from_api(value_id)
                        if 'en' in value_details['labels']:
                            label_list[value_id] = value_details['labels']['en']['value']

                    # Entities without an English label are left out.
                    if value_id in label_list:
                        labelled_values.append(label_list[value_id])

                # Any other dict value is stored unchanged.
                else:
                    labelled_values.append(datavalue)

        if labelled_values:
            monument_labelled_prop_val[label_list[prop]] = labelled_values

    labelled_monument_list[monument['id']] = monument_labelled_prop_val

    count_monuments = count_monuments + 1

    # Checkpoint: every 20th monument, dump the batch to its own numbered file
    # and start a fresh batch.
    if count_monuments % 20 == 0:
        partition_num = str(count_monuments // 20)
        with open('../Data/English_Labelled_Wikidata/en_labelled_part'+ partition_num + '.json', 'w') as fout:
            json.dump(labelled_monument_list, fout)

        labelled_monument_list = {}

        print("Checkpoint %d reached, JSON dumps saved |" % (count_monuments // 20), end = ' ')
        print("Time Elapsed:", end = ' ')
        print(time.time()-t0)

# A final batch of fewer than 20 monuments is NOT written here; it stays in
# labelled_monument_list after the loop ends.
t1 = time.time()
total = t1-t0

Checkpoint 72 reached, JSON dumps saved | Time Elapsed: 53.94593405723572
Checkpoint 73 reached, JSON dumps saved | Time Elapsed: 97.10320663452148
Checkpoint 74 reached, JSON dumps saved | Time Elapsed: 186.917578458786
Checkpoint 75 reached, JSON dumps saved | Time Elapsed: 224.09359312057495
Checkpoint 76 reached, JSON dumps saved | Time Elapsed: 266.08176708221436
Checkpoint 77 reached, JSON dumps saved | Time Elapsed: 367.77900671958923
Checkpoint 78 reached, JSON dumps saved | Time Elapsed: 459.12752771377563
Checkpoint 79 reached, JSON dumps saved | Time Elapsed: 519.670024394989
Checkpoint 80 reached, JSON dumps saved | Time Elapsed: 581.9177811145782
Checkpoint 81 reached, JSON dumps saved | Time Elapsed: 654.8344459533691
Checkpoint 82 reached, JSON dumps saved | Time Elapsed: 729.4829385280609
Checkpoint 83 reached, JSON dumps saved | Time Elapsed: 763.3744251728058
Checkpoint 84 reached, JSON dumps saved | Time Elapsed: 800.7547416687012
Checkpoint 85 reached, JSON dumps sa

In [26]:
# Wall-clock duration of the labelling loop, in seconds (t1 - t0 from time.time()).
print(total)
# Number of monuments still held in memory: the batch is emptied at every
# checkpoint, so this counts the monuments processed after the last one.
len(labelled_monument_list)

1441.9020428657532


11

#### 4. Extracting Labels for left over IDs

In [4]:
# Merge the labelled checkpoint files (parts 1..101) back into one dict.
labelled_wikidata_en = {}
for part_no in range(1, 102):
    part_path = '../Data/English_Labelled_Wikidata/en_labelled_part' + str(part_no) + '.json'
    with open(part_path) as part_file:
        part_labels = json.load(part_file)
    labelled_wikidata_en.update(part_labels)

In [5]:
# Wiki-page entries whose key has no entry in the labelled checkpoint files yet.
left_over = {
    monument_key: wiki_page
    for monument_key, wiki_page in wikipage_monument_dict.items()
    if monument_key not in labelled_wikidata_en
}

In [7]:
# Labelled output for the left-over monuments: monument id -> {property label: [values]}.
labelled_monument_list_left_over = {}
index = 0
# Cache of English labels keyed by Wikidata id (properties and value entities);
# it is started empty again in this cell.
label_list = {}
# Properties that could not be labelled (per the original note: properties that
# have no Wikidata page). They are skipped without issuing another API call.
non_labelled_props = ['P727']

for monument_id in left_over.keys():

    monument = monument_dict[monument_id]

    monument_labelled_prop_val = {}
    list_prop_value = monument['claims']

    # Build the list of usable properties as a NEW list. The previous version
    # aliased the list (`copy = original`) and removed entries from it while
    # iterating over it, which made the loop skip the property that followed
    # every failed lookup; that property then had no entry in label_list and
    # could raise KeyError further down.
    list_properties = []
    for prop in list_prop_value.keys():
        if prop in non_labelled_props:
            continue
        if prop not in label_list:
            try:
                prop_details = get_entity_dict_from_api(prop)
                label_list[prop] = prop_details['labels']['en']['value']
            except Exception:
                # Best effort: any lookup failure (or a missing English label)
                # marks the property as unlabelled for the rest of the run.
                # `Exception` instead of a bare except keeps Ctrl-C working.
                non_labelled_props.append(prop)
                continue
        list_properties.append(prop)

    # For all values per property, resolve value IDs [Q##### format] to labels.
    for prop in list_properties:

        labelled_values = []

        for value in list_prop_value[prop]:

            mainsnak = value['mainsnak']

            # Only snaks of type 'value' are read; other snak types are skipped.
            if mainsnak['snaktype'] != 'value':
                continue

            datavalue = mainsnak['datavalue']['value']

            # Plain string values are stored as they are.
            if isinstance(datavalue, str):
                labelled_values.append(datavalue)

            elif isinstance(datavalue, dict):

                # A dict with an 'id' refers to another entity: store its label.
                if 'id' in datavalue:

                    value_id = datavalue['id']

                    # NOTE: a failing lookup here is not caught and stops the
                    # loop; nothing is written to disk by this cell.
                    if value_id not in label_list:
                        value_details = get_entity_dict_from_api(value_id)
                        if 'en' in value_details['labels']:
                            label_list[value_id] = value_details['labels']['en']['value']

                    # Entities without an English label are left out.
                    if value_id in label_list:
                        labelled_values.append(label_list[value_id])

                # Any other dict value is stored unchanged.
                else:
                    labelled_values.append(datavalue)

        if labelled_values:
            monument_labelled_prop_val[label_list[prop]] = labelled_values

    # Progress counter: one line of output per processed monument.
    index+=1
    print(index)

    labelled_monument_list_left_over[monument['id']] = monument_labelled_prop_val

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27


In [8]:
# Persist the left-over labels as the next numbered partition (part 102).
left_over_path = '../Data/English_Labelled_Wikidata/en_labelled_part102.json'
with open(left_over_path, 'w') as left_over_file:
    json.dump(labelled_monument_list_left_over, left_over_file)