In [119]:
import requests
import importlib
import data_utils
import tools_utils
from llama_cpp import Llama
import re

#  Reload the helper modules so that edits made to data_utils.py / tools_utils.py
#  are picked up without re-importing them from scratch.
importlib.reload(data_utils)
importlib.reload(tools_utils)

<module 'tools_utils' from 'c:\\Users\\paolo\\Desktop\\gotriple-keyword-translation-main\\tools_utils.py'>

This notebook contains functions to use tools to map GoTriple keywords to WikiData pages. Each function takes as input an item from the list produced by the function get_sample in data_utils.py (each item contains information about the title, the abstract and the keywords of the article, see data_utils.py for further details)

- The first function uses DBPedia Spotlight. It maps keywords to DBPedia resources; when the language of the keywords is not 'en' (English), each keyword is mapped to the DBPedia edition corresponding to the keywords' language.
    - The parameter 'context' specifies the text that is given as input to DBPedia Spotlight. If True, the title and abstract are given as additional context (this slows execution since DBPedia Spotlight annotates the whole input). If False, only keywords are given as input
    - If keywords are in English, a further mapping from DBPedia to Wikidata resources is performed (using SPARQL). This feature is not available for other languages because of what seems to be a lack of semantic annotation (the SPARQL engine is not available for the DBPedia editions of some languages, and there is a shortage of annotated links between pages in different languages). Moreover, it should be noted that the performance of DBPedia Spotlight seems to be poorer when it is used in languages other than English (THIS NEEDS VERIFICATION).
    - The function returns a list where each element corresponds to an annotation. Each element of the returned list has three keys: 'Form' (the surface form that DBPedia Spotlight has linked to the URI), 'DBPediaURI' and 'WikidataURI' (the latter is None if the language of the keywords is not English).

In [2]:
"""
example of calling the function:
data = data_utils.get_sample(['en', 'fr', 'es'], 300)
test_item = data[0]
output = useDBPediaSpotlight(test_item, False)
"""

def useDBPediaSpotlight(item, context):
    """Map the keywords of an article to DBPedia (and, for English, Wikidata) URIs.

    Args:
        item: mapping describing one article. The function reads the keys
            'Keywords' (list of strings), 'Title_or', 'Abstract_or' (either
            may be empty/None) and 'Language'.
        context: if truthy, title and abstract are sent to DBPedia Spotlight
            together with the keywords, and only annotations whose surface
            form is one of the keyword tokens are kept. If falsy, only the
            keywords are sent and every annotation is kept.

    Returns:
        A list of dicts, one per distinct surface form (first occurrence
        wins), each with the keys 'Form', 'DBPediaURI' and 'WikidataURI'.
        'WikidataURI' is None when item['Language'] is not 'en'.
    """
    keywords = item['Keywords']

    #  tokens of the keywords: used to keep only the entities relevant to the
    #  keywords (and not to the context) when searching with context
    keyword_tokens = {token for kw in keywords for token in kw.split(' ')}

    abstract = item['Abstract_or'] if item['Abstract_or'] else ""
    title = item['Title_or'] if item['Title_or'] else ""

    #  prepare the text for the query
    if context:
        text = 'Title: ' + title + '. ' + 'Abstract: ' + abstract + '. Keywords: ' + ", ".join(keywords)
    else:
        text = ", ".join(keywords)

    print(text)

    #  send a request to DBPedia Spotlight API
    response = tools_utils.queryAPIDBpediaSpotlight(text, item['Language'])
    try:
        annotations = response['Resources']
    except KeyError:
        #  NOTE(review): the 'Resources' key is presumably absent when
        #  Spotlight annotates nothing; treat that as "no annotations".
        annotations = []

    is_english = item['Language'] == 'en'

    #  keep only entities corresponding to keywords, drop repeated surface
    #  forms, and resolve the Wikidata URI only for the entries that are kept
    seen_forms = set()
    final_results = []
    for entity in annotations:
        form = entity['@surfaceForm']
        if context and form not in keyword_tokens:
            continue
        if form in seen_forms:
            continue
        seen_forms.add(form)

        dbpedia_uri = entity['@URI']
        final_results.append({
            'Form': form,
            'DBPediaURI': dbpedia_uri,
            #  conversion of DBPedia URIs in Wikidata URIs (English only)
            'WikidataURI': tools_utils.get_wikidata_uri(dbpedia_uri) if is_english else None,
        })

    return final_results

- The second function uses LLMs. In this notebook, we use Llama.cpp, a library where various LLMs are implemented in C++ in order to allow faster inference times even on CPU. The library allows inference on a variety of quantized LLMs available on HuggingFace in GGUF format. We use the library in order to allow replicability of the code without requiring specialized software. Specifically, we use llama-cpp-python, a Python wrapper of the library.
    - The function takes as input item (an item from the list produced by the function get_sample in data_utils.py), model (a model loaded via the Python wrapper of the Llama.cpp library), and context (controls the context that we provide to the model: "Title" if we want to provide the title with no abstract, "All" if we want to provide the title and the abstract, otherwise we will provide only the keywords). Providing the abstract slows performance since the prompt is longer. A good compromise is to provide only the title in order to limit the length of the prompt and still provide context to the model
    - The function prompts the model for the Wikidata entities corresponding to the keywords and then performs a query on Wikidata using the WikiData API (the requests and the method for searching the best fit are in a function in tools_utils.py). It returns a list of dictionaries where each dictionary has a field 'Keyword' and a field 'URI' (the latter is an empty string if the query gives no result)

NB: This feature is only for experimentation since it has very slow response time.  

In [129]:
"""
example of calling the function:
data = data_utils.get_sample(['en', 'fr', 'es'], 300)
test_item = data[0]
llm = tools_utils.loadLLM() (load by default 4-bit quantization of Mistral-7B-Instruct)
useLLM(data[35], llm, context="Title")
"""


def useLLM(item, model, context):
    """Ask an LLM for Wikidata entities matching the keywords and look them up.

    Args:
        item: mapping describing one article. The function reads 'Keywords'
            (list of strings) and, depending on ``context``, 'Title_or' and
            'Abstract_or' (a missing/None value is rendered as an empty string).
        model: callable invoked as ``model(prompt, max_tokens=200)``; its
            result is read as ``output['choices'][0]['text']``.
        context: "Title" to include the title in the prompt, "All" to include
            title and abstract; any other value sends only the keywords.

    Returns:
        A list of dicts, one per bracketed entity found in the model answer,
        with the keys 'Keyword' (the entity text as written by the model) and
        'URI' (the 'concepturi' of the Wikidata match, or '' if none).
    """
    keyword_list = ", ".join(item['Keywords'])

    #  the three branches are mutually exclusive: exactly one prompt is built
    if context == "Title":
        title = item['Title_or'] if item['Title_or'] else ""
        prompt = """<s>[INST] {{Map each keyword of the article to one or more relevant WikiData entities.
        Keywords are from a scientific article. 
        The title of the article is {}.
        The keyword list is: {}. 
        An example of answer for the article with the title "Russian formalists and Russian literature"
        and the list of keywords: literary life, literary fact, doing things
        is: literary life: [literature]; literary fact: [literature], [fact]; doing things: [activity]
        INCLUDE EACH SEPARATE ENTITY BETWEEN [] IN THE ANSWER }} [/INST]
    """.format(title, keyword_list)
    elif context == "All":
        title = item['Title_or'] if item['Title_or'] else ""
        abstract = item['Abstract_or'] if item['Abstract_or'] else ""
        prompt = """<s>[INST] {{Map each keyword of the article to one or more relevant WikiData entities.
        Keywords are from a scientific article. 
        The title of the article is {}.
        The abstract of the article is {}.
        The keyword list is: {}. 
        An example of answer for the article with the title "Russian formalists and Russian literature"
        and the list of keywords: literary life, literary fact, doing things
        is: literary life: [literature]; literary fact: [literature], [fact]; doing things: [activity]
        INCLUDE EACH SEPARATE ENTITY BETWEEN [] IN THE ANSWER }} [/INST]
    """.format(title, abstract, keyword_list)
    else:
        prompt = """<s>[INST] {{Map each keyword to one or more relevant WikiData entities.
        Keywords are from a scientific article. 
        The keyword list is: {}. 
        An example of answer for the list of keywords: literary life, literary fact, doing things
        is: literary life: [literature]; literary fact: [literature], [fact]; doing things: [activity]
        INCLUDE EACH SEPARATE ENTITY BETWEEN [] IN THE ANSWER }} [/INST]
    """.format(keyword_list)

    output = model(
        prompt,
        max_tokens=200,
    )

    #  formatting of the answer (this procedure is dependent on the output form we impose via the prompt.)
    entities = re.findall(r'\[([^\]]+)\]', output['choices'][0]['text'])

    results = []
    for entity in entities:
        #  NOTE(review): split("()") only splits on the literal two-character
        #  string "()"; if the intent was to strip a parenthetical suffix this
        #  should probably split on "(" instead -- confirm before changing.
        search_term = entity.lower().split("()")[0]
        print(search_term)

        match = tools_utils.query_wikidata(search_term)
        results.append({
            'Keyword': entity,
            'URI': match['concepturi'] if match else '',
        })

    return results