In [63]:
import re
from wikimapper import WikiMapper
import rdflib
from string import capwords
from titlecase import titlecase

In [2]:
from templater3 import Templater

# Sample documents: identical markup wrapped around two varying words.
documents_to_learn = [
    '<b> spam and eggs </b>',
    '<b> ham and spam </b>',
    '<b> white and black </b>',
]

# Feed the samples to the templater one at a time.
template = Templater()
for document in documents_to_learn:
    template.learn(document)

# NOTE: `_template` is a private attribute; it is read here only to inspect
# what was learned.
print('Template created:', template._template)

# Parse a document that was not part of the learning set.
document_to_parse = '<b> yellow and blue </b>'
print('Parsing other document:', template.parse(document_to_parse))

# Go the other way: build a document from a list of values.
print('Filling the blanks:', template.join(['', 'red', 'orange', '']))

Template created: [None, '<b> ', None, ' and ', None, ' </b>', None]
Parsing other document: ['', 'yellow', 'blue', '']
Filling the blanks: <b> red and orange </b>


In [90]:
from SPARQLWrapper import SPARQLWrapper, JSON
# Module-level client pointed at the Wikidata SPARQL endpoint; get_instance_of()
# sends its queries through this object.
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

def get_instance_of(entity: str) -> 'list[str]':
    """Return the English labels of the classes `entity` is an instance of.

    Sends a query through the module-level ``sparql`` client selecting the
    ``wdt:P31`` ("instance of") values of ``wd:<entity>``. The label service is
    included so that readable labels come back instead of opaque ids such as
    Q123.

    Args:
        entity: identifier placed directly after the ``wd:`` prefix in the
            query (e.g. "Q42"). It is interpolated verbatim.

    Returns:
        The ``classLabel`` value of every result binding, in response order.
    """
    # The continuation lines of the query are part of the string literal, so
    # their leading whitespace is kept exactly as is.
    sparql.setQuery(f'''SELECT ?classLabel WHERE {{
   wd:{entity} wdt:P31 ?class.
      SERVICE wikibase:label {{
      bd:serviceParam wikibase:language "en" .
      }}
   }}''')
    sparql.setReturnFormat(JSON)

    response = sparql.query().convert()
    return [binding['classLabel']['value']
            for binding in response['results']['bindings']]

def preprocess_entity(entity: str) -> 'tuple[str, str, str]':
    """Build the candidate page titles to try for one wiki-link body.

    Everything from the first "|" onwards is dropped, and three spellings of
    the remaining text are produced with spaces replaced by underscores.

    Args:
        entity: text found between ``[[`` and ``]]``, optionally of the form
            ``target|label``.

    Returns:
        A ``(title_case, capitalized, joined)`` tuple:
        * ``title_case``  - ``titlecase()`` applied to the target;
        * ``capitalized`` - ``str.capitalize()`` applied to the target
          (first character upper-cased, the rest lower-cased);
        * ``joined``      - the target as written.
    """
    # str.find() returns -1 (truthy) when "|" is absent and 0 (falsy) when it
    # is the first character, so it must not be used as a boolean test.
    # Splitting unconditionally is correct in every case: without a pipe the
    # whole string is returned unchanged.
    target = entity.split("|", 1)[0]

    joined = target.replace(' ', '_')
    title_case_entity = titlecase(target).replace(' ', '_')
    capitalized_entity = joined.capitalize()

    return title_case_entity, capitalized_entity, joined
      


In [101]:
# Measure the share of [[wiki-linked]] entities in the lists dump for which
# none of the title variants from preprocess_entity() resolves to an id.
# NOTE(review): the dump is simplewiki while the index is enwiki - confirm
# that this pairing is intended.
with open('simplewiki-20211120-lists-1k.tsv') as lists_file:
    # The context manager closes the file as soon as it has been read.
    lines = lists_file.readlines()
mapper = WikiMapper("index_enwiki-latest.db")
counter = 0            # entities for which every variant lookup returned None
entities_counter = 0   # all entities encountered
for line in lines:
    # Each line is "<file>\t<items>"; only the items column is used here.
    _, word_list = line.split('\t')
    # Items are separated by a literal backslash-n sequence, not a newline.
    for word in word_list.split('\\n'):
        for entity in re.findall(r'\[\[(.+?)\]\]', word):
            entities_counter += 1
            # Variants are tried in order (title case, capitalized, as
            # written); all() stops at the first one that resolves, so no
            # extra lookups are made.
            if all(mapper.title_to_id(title) is None
                   for title in preprocess_entity(entity)):
                counter += 1
                #print(entity)

# Fraction of unmapped entities; guarded so an input without any entity does
# not raise ZeroDivisionError.
print(counter / entities_counter if entities_counter else 0.0)

        

0.024498545398866943


In [87]:
# Spot-check a single title lookup (no output was recorded for this cell).
mapper.title_to_id("Amir_Khan_Mutaqqi")

In [93]:
# Reverse lookup: list the page titles the index associates with this id.
mapper.id_to_titles("Q4503831")

['Object_(task)',
 'Goal_(management)',
 'Short_term_goal',
 'Short-term_goal',
 'Long_term_goal',
 'Long-term_goal',
 'Objective_(goal)',
 'Goal',
 'Objective_(military)',
 'Primary_objective',
 'Subgoal',
 'Personal_goals',
 'Goal_management_in_organizations',
 'Goal_displacement']