In [1]:
import spacy
nlp = spacy.load('en_core_web_lg')

In [2]:
import json
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import re 
import requests
import sparql
import spacy 
import spotlight


from pathlib import Path
import warnings

from spacy import displacy
import networkx as nx
from transformers import BertTokenizer, BertModel
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf 
# Enable on-demand GPU memory allocation for every visible GPU.
# Looping (instead of indexing [0]) keeps the cell runnable on CPU-only
# machines, where the device list is empty, and applies the same setting to
# all GPUs when several are present.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, enable=True)

In [3]:
import torch
torch.cuda.is_available()

True

In [4]:
import os

# TagMe gcube-token used by tagme_annotation.
# Prefer the TAGME_TOKEN environment variable so the credential does not have
# to live in the notebook; the literal below is kept only as a fallback so
# existing runs keep working.
# NOTE(review): this token is committed in clear text — consider rotating it
# and removing the fallback.
TOKEN = os.environ.get("TAGME_TOKEN", "f70577ab-c371-4fb8-83f3-e82791f9b1a1-843339462")

In [5]:
def tagme_annotation(token, question):
    """Link ``question`` to a DBpedia resource using the TagMe REST API.

    Parameters
    ----------
    token : str
        Value sent as the ``gcube-token`` query parameter.
    question : str
        Text to annotate.

    Returns
    -------
    str
        ``'http://dbpedia.org/resource/<Title>'`` (spaces replaced by ``_``)
        for the annotation with the highest ``rho`` value, or ``''`` when the
        service does not answer with HTTP 200 or returns no usable annotation.
    """
    response = requests.get(
        "https://tagme.d4science.org/tagme/tag",
        # Let requests build the query string so that spaces, '&', '#', ...
        # inside the text are percent-encoded instead of corrupting the URL.
        params={"lang": "en", "gcube-token": token, "text": question},
    )
    if response.status_code != 200:
        return ''

    annotations = {}
    for annotation in json.loads(response.text).get('annotations', []):
        title = annotation.get('title')
        if title is None:
            # Skip entries without a title rather than raising KeyError.
            continue
        annotations['http://dbpedia.org/resource/' + title.replace(' ', '_')] = annotation['rho']

    if not annotations:
        return ''

    # Ascending sort on rho; the last item is the best-scored resource.
    return sorted(annotations.items(), key=lambda x: x[1])[-1][0]

In [6]:
def NEL_spotlight(question, confidence=0.4, support=20):
    """Return the first DBpedia Spotlight annotation URI for ``question``.

    Parameters
    ----------
    question : str
        Text to annotate.
    confidence : float, optional
        Forwarded to ``spotlight.annotate`` (default 0.4, the value that was
        previously hard-coded).
    support : int, optional
        Forwarded to ``spotlight.annotate`` (default 20, the value that was
        previously hard-coded).

    Returns
    -------
    list
        A one-element list holding the ``'URI'`` of the first annotation, or
        an empty list when the lookup fails or yields nothing.
    """
    question_annotations = []
    try:
        annotations = spotlight.annotate(
            'https://api.dbpedia-spotlight.org/en/annotate',
            question,
            confidence=confidence,
            support=support,
        )
        question_annotations.append(annotations[0]['URI'])
    except Exception:
        # Best-effort lookup: any failure (service error, no annotation,
        # unexpected payload) yields an empty list. ``Exception`` instead of a
        # bare ``except`` so KeyboardInterrupt/SystemExit still propagate.
        pass

    return question_annotations

In [7]:
tagme_annotation(TOKEN, 'drama')

'http://dbpedia.org/resource/Drama'

In [8]:
import os
import tarfile
 
def unpack_model(model_name):
    """Extract ``<model_name>.tar.gz`` into the current working directory.

    The archive is opened in a ``with`` block so the file handle is released
    even when extraction raises.

    NOTE(review): ``extractall`` trusts the member paths stored in the
    archive; only use this with archives from a trusted source.

    Parameters
    ----------
    model_name : str
        Archive path without the ``.tar.gz`` suffix.

    Returns
    -------
    None
    """
    with tarfile.open(f"{model_name}.tar.gz", "r:gz") as tar:
        tar.extractall()
 
# unpack_model has no return statement, so model_transf is None after this
# line; the call matters only for its side effect of extracting
# model_intent.tar.gz into the working directory.
model_transf = unpack_model('model_intent')

In [9]:
model_transf

In [10]:
from transformers import BertForSequenceClassification

In [11]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
# Arguments for the simpletransformers classifier; only the number of
# training epochs is set explicitly here.
model_args = ClassificationArgs(num_train_epochs=3)
# BERT sequence classifier with 8 labels (class_dict below lists eight
# intents), replacing the earlier None value of model_transf.
# NOTE(review): 'outputs' is presumably the directory holding the fine-tuned
# weights extracted above — confirm.
model_transf = ClassificationModel("bert", 'outputs', num_labels = 8, args = model_args)

In [12]:
predictions, raw_outputs = model_transf.predict(["what is the genre of a book written by j.k rowling"])

100%|██████████| 1/1 [00:00<00:00, 102.43it/s]
100%|██████████| 1/1 [00:00<00:00, 33.67it/s]


In [13]:
predictions

array([5])

In [14]:
# Intent id -> intent name, in label order.
class_dict = dict(enumerate([
    'author_of_book',
    'book_from_author',
    'books_in_genre',
    'country',
    'date',
    'genre',
    'language',
    'publisher',
]))

# Intent id -> DBpedia ontology/property name used for that intent.
dbpedia_dict = dict(enumerate([
    'author',
    'author',
    'literaryGenre',
    'country',
    'releaseDate',
    'genre',
    'language',
    'publisher',
]))

# Subset of intent ids kept in a separate list.
dbo_book = [1, 2, 7]

In [15]:
def get_predictions(question):
    """Classify ``question`` with ``model_transf`` and return the intent name.

    The first predicted label is looked up in ``class_dict``.
    """
    labels, _ = model_transf.predict([question])
    top_label = labels[0]
    return class_dict[top_label]

In [16]:
get_predictions("what type of book was written by stephen king in 2010")

100%|██████████| 1/1 [00:00<00:00, 83.08it/s]
100%|██████████| 1/1 [00:00<00:00, 38.04it/s]


'genre'

In [17]:
def get_ent_char(question):
    """Run the ``nlp`` pipeline on ``question`` and list its named entities.

    Returns a list of ``(text, label, start_char, end_char)`` tuples, one per
    entity in ``doc.ents``, in document order.
    """
    return [
        (span.text, span.label_, span.start_char, span.end_char)
        for span in nlp(question).ents
    ]

In [18]:
get_ent_char("what type of book was written by stephen king in 2010")

[('stephen king', 'PERSON', 33, 45), ('2010', 'DATE', 49, 53)]

In [19]:
q_1 = ["who is the author of alice in wonderland",  "what book has Stephen King written", "what is the genre of harry potter", "who published the little prince", "in what language was adventures in narnia written", "what is a book of drama"]

In [20]:
[(q, get_ent_char(q)) for q in q_1]

[('who is the author of alice in wonderland', [('alice', 'PERSON', 21, 26)]),
 ('what book has Stephen King written', [('Stephen King', 'PERSON', 14, 26)]),
 ('what is the genre of harry potter', [('harry potter', 'PERSON', 21, 33)]),
 ('who published the little prince', []),
 ('in what language was adventures in narnia written',
  [('narnia', 'PERSON', 35, 41)]),
 ('what is a book of drama', [])]

spaCy does not recognize genres or some book titles.
Books are identified as PERSON entities.

In [21]:
# Load a second spaCy pipeline from a directory on disk.
# NOTE(review): this is an absolute, machine-specific path, so the cell fails
# on any other machine; nlp2 is not referenced by the other cells shown in
# this notebook.
output_dir = Path('/home/aliciescont/Documents/tfm_code/QA_eval/SimpleDBpediaQA/V1')
nlp2 = spacy.load(output_dir)

In [22]:
def get_match_entity(question):
    """Combine spaCy named entities with object nouns found in ``question``.

    Starts from ``get_ent_char(question)`` and appends every token whose
    dependency label is ``dobj`` or ``pobj`` as ``(lowercased_text, '', start,
    end)``, unless the word is one of ``book``/``books``/``genre`` or is
    already contained in the text of a collected entity.

    When the question has more than one such object token, the first one is
    skipped (presumably it is the thing being asked about, e.g. "author" or
    "genre" — TODO confirm); a single object token is always considered.

    Offsets come from the token itself (``Token.idx``) rather than from a
    regex search over the question. The previous search failed on
    capitalised words (the lower-cased pattern did not match, so ``.start()``
    was called on ``None``), treated characters such as ``.`` as regex
    metacharacters, and could hit the word as a substring of an earlier word.

    Parameters
    ----------
    question : str

    Returns
    -------
    list of tuple
        ``(text, label, start_char, end_char)`` entries.
    """
    ignored_words = ('book', 'books', 'genre')
    object_tokens = [tok for tok in nlp(question) if tok.dep_ in ('dobj', 'pobj')]

    entities = get_ent_char(question)
    first = 0 if len(object_tokens) == 1 else 1
    for tok in object_tokens[first:]:
        word = tok.text.lower()
        if word in ignored_words:
            continue
        # Case-insensitive containment against entity texts only, so labels
        # and offsets in the tuples can never produce accidental matches.
        if any(word in str(ent[0]).lower() for ent in entities):
            continue
        entities.append((word, '', tok.idx, tok.idx + len(tok.text)))

    return entities

In [23]:
[(q, get_match_entity(q)) for q in q_1]

[('who is the author of alice in wonderland',
  [('alice', 'PERSON', 21, 26), ('wonderland', '', 30, 40)]),
 ('what book has Stephen King written', [('Stephen King', 'PERSON', 14, 26)]),
 ('what is the genre of harry potter', [('harry potter', 'PERSON', 21, 33)]),
 ('who published the little prince', [('prince', '', 25, 31)]),
 ('in what language was adventures in narnia written',
  [('narnia', 'PERSON', 35, 41)]),
 ('what is a book of drama', [('drama', '', 18, 23)])]

In [24]:
q_2 = ["who is the author of the black cat and the raven", "what books have Stephen King and Petter Straub written", "what is the genre of HArry potter and the little prince", "in what language was harry potter and narnia written", "what are some books of kids and drama", "in what country was piblisher harry potter and lord of the rings" ]

In [25]:
[(q, get_ent_char(q)) for q in q_2]

[('who is the author of the black cat and the raven', []),
 ('what books have Stephen King and Petter Straub written',
  [('Stephen King', 'PERSON', 16, 28), ('Petter Straub', 'PERSON', 33, 46)]),
 ('what is the genre of HArry potter and the little prince',
  [('HArry potter', 'PERSON', 21, 33)]),
 ('in what language was harry potter and narnia written',
  [('harry potter', 'PERSON', 21, 33), ('narnia', 'PERSON', 38, 44)]),
 ('what are some books of kids and drama', []),
 ('in what country was piblisher harry potter and lord of the rings',
  [('piblisher harry potter', 'PERSON', 20, 42)])]

In [26]:
[(q, get_match_entity(q)) for q in q_2]

[('who is the author of the black cat and the raven', [('cat', '', 31, 34)]),
 ('what books have Stephen King and Petter Straub written',
  [('Stephen King', 'PERSON', 16, 28), ('Petter Straub', 'PERSON', 33, 46)]),
 ('what is the genre of HArry potter and the little prince',
  [('HArry potter', 'PERSON', 21, 33)]),
 ('in what language was harry potter and narnia written',
  [('harry potter', 'PERSON', 21, 33), ('narnia', 'PERSON', 38, 44)]),
 ('what are some books of kids and drama', [('kids', '', 23, 27)]),
 ('in what country was piblisher harry potter and lord of the rings',
  [('piblisher harry potter', 'PERSON', 20, 42), ('rings', '', 59, 64)])]

In [27]:
q_3 = ["who is the author of a book of drama written in 2000", "what books has the author of the shinning written in english", "what is the genre of a book written by stephen king", "what is the publication language of alice in wonderland", "in what country was published a book written by Lewis Caroll", "what are some books of drama published in 2015", "when was a book written by edgar allan poe published", "where was a book of drama written in 2000 published"]

In [28]:
[(q, get_ent_char(q)) for q in q_3]

[('who is the author of a book of drama written in 2000',
  [('2000', 'DATE', 48, 52)]),
 ('what books has the author of the shinning written in english',
  [('english', 'LANGUAGE', 53, 60)]),
 ('what is the genre of a book written by stephen king',
  [('stephen king', 'PERSON', 39, 51)]),
 ('what is the publication language of alice in wonderland',
  [('alice', 'PERSON', 36, 41)]),
 ('in what country was published a book written by Lewis Caroll',
  [('Lewis Caroll', 'PERSON', 48, 60)]),
 ('what are some books of drama published in 2015',
  [('2015', 'DATE', 42, 46)]),
 ('when was a book written by edgar allan poe published',
  [('edgar allan poe', 'PERSON', 27, 42)]),
 ('where was a book of drama written in 2000 published',
  [('2000', 'DATE', 37, 41)])]

In [29]:
def get_dependencies(question):
    """Map each token position in ``question`` to its dependency information.

    Returns a dict ``{position: (dep_label, [child token indices])}`` built
    from the ``nlp`` parse of ``question``.
    """
    return {
        position: (token.dep_, [kid.i for kid in token.children])
        for position, token in enumerate(nlp(question))
    }

In [30]:
get_dependencies("where was ppublished a book of drama")

{0: ('advmod', []),
 1: ('auxpass', []),
 2: ('ROOT', [0, 1, 4]),
 3: ('det', []),
 4: ('dobj', [3, 5]),
 5: ('prep', [6]),
 6: ('pobj', [])}

In [31]:
get_match_entity("where was published a book of drama")

[('drama', '', 30, 35)]

In [32]:
[(q, get_match_entity(q)) for q in q_2]

[('who is the author of the black cat and the raven', [('cat', '', 31, 34)]),
 ('what books have Stephen King and Petter Straub written',
  [('Stephen King', 'PERSON', 16, 28), ('Petter Straub', 'PERSON', 33, 46)]),
 ('what is the genre of HArry potter and the little prince',
  [('HArry potter', 'PERSON', 21, 33)]),
 ('in what language was harry potter and narnia written',
  [('harry potter', 'PERSON', 21, 33), ('narnia', 'PERSON', 38, 44)]),
 ('what are some books of kids and drama', [('kids', '', 23, 27)]),
 ('in what country was piblisher harry potter and lord of the rings',
  [('piblisher harry potter', 'PERSON', 20, 42), ('rings', '', 59, 64)])]

In [33]:
#get shortest path in dependency tree to find secondary triples

def get_graph(question, entity1, entity2):
    """Shortest path between two words in the dependency tree of ``question``.

    An undirected graph is built with one edge per (head, child) pair, using
    the lower-cased token text as node id, and the node list returned by
    ``nx.shortest_path`` from ``entity1`` to ``entity2`` is handed back.
    """
    head_child_pairs = [
        ('{0}'.format(head.lower_), '{0}'.format(kid.lower_))
        for head in nlp(question)
        for kid in head.children
    ]
    dependency_graph = nx.Graph(head_child_pairs)
    return nx.shortest_path(dependency_graph, source=entity1, target=entity2)
    
    

In [34]:
def get_name_dep_dict(question):
    """Map each token text in ``question`` to its dependency label and children.

    The question is parsed once (the previous version ran the ``nlp``
    pipeline twice on the same text).

    Parameters
    ----------
    question : str

    Returns
    -------
    dict
        ``{token_text: (dep_label, [child token texts])}``. Keys are token
        texts, so when a word occurs several times the last occurrence wins.
    """
    doc = nlp(question)
    return {
        token.text: (token.dep_, [child.text for child in token.children])
        for token in doc
    }

In [35]:
def get_pos_dict(question):
    """Map each token text in ``question`` to its coarse part-of-speech tag.

    Keys are token texts, so a repeated word keeps the tag of its last
    occurrence.
    """
    return {token.text: token.pos_ for token in nlp(question)}

In [36]:

# Intent id -> role of the triple's subject: 'entity' when the linked entity
# is the subject, 'unk' when the subject is the unknown being queried.
resource_dict = {0 : 'entity', 1: 'unk', 2: 'unk', 3: 'entity', 4: 'entity', 5: 'entity', 6: 'entity', 7: 'entity'}
# Intent id -> SPARQL predicate fragment, padded with spaces so it can be
# concatenated directly between subject and object.
# Fixes: 'dbo_literaryGenre' was not a valid prefixed name (now
# 'dbo:literaryGenre'), and ' dbo:publisher' lacked the trailing space.
predicate_dict = {0 : ' dbo:author ', 1 : ' dbo:author ', 2 : ' dbo:literaryGenre ', 3 : ' rdf:type dbo:Book; dbp:country ', 4 : ' dbo:releaseDate ', 5 : ' rdf:type dbo:Book; dbp:genre ', 6 : ' rdf:type dbo:Book; dbp:language ', 7: ' dbo:publisher '}
# Intent id -> role of the triple's object (mirror of resource_dict).
# Fix: entry 1 had stray text pasted into the middle of 'entity'.
object_dict = { 0 : 'unk' , 1: 'entity', 2: 'entity', 3 : 'unk', 4 : 'unk', 5 : 'unk', 6: 'unk', 7: 'unk'}

#{0 : 'author', 1 : 'author', 2 : 'literaryGenre', 3 : 'country', 4 : 'releaseDate', 5 : 'genre', 6 : 'language', 7: 'publisher'}

In [37]:
def ask_one_pred(intent, entity):
    """Build a single-predicate SPARQL ``SELECT DISTINCT ?o`` query.

    The predicate comes from ``predicate_dict[intent]``. When
    ``resource_dict[intent]`` is ``'entity'`` the linked entity is the triple's
    subject and ``?o`` its object; otherwise ``?o`` is the subject and the
    entity is the object.

    Changes from the previous version:
    * the stray ``>`` after ``?o`` in the second UNION branch is gone;
    * with no entity, an unbound ``?s`` variable takes the entity's place so
      the query is still valid SPARQL (the old branches produced a triple
      with a missing term);
    * every entity contributes a UNION branch, not just the first two;
    * a bare string is accepted as a single entity, and empty strings/``None``
      are ignored;
    * ``dbp:`` is declared, since some predicates in ``predicate_dict`` use it.

    Parameters
    ----------
    intent : int
        Key into ``resource_dict`` / ``predicate_dict``.
    entity : list of str or str or None
        Resource IRIs without angle brackets.

    Returns
    -------
    str
        The complete query, prefixes included.

    Raises
    ------
    KeyError
        If ``intent`` is not a known intent id.
    """
    prefix = (
        'PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> '
        'PREFIX dbo: <http://dbpedia.org/ontology/> '
        'PREFIX dbp: <http://dbpedia.org/property/> '
        'SELECT DISTINCT ?o WHERE { '
    )
    entity_is_subject = resource_dict[intent] == 'entity'
    predicate = predicate_dict[intent]

    if entity is None:
        entity = []
    elif isinstance(entity, str):
        entity = [entity]
    terms = ['<' + uri + '>' for uri in entity if uri]

    def pattern(term):
        # One triple pattern with ``term`` on the entity side of the predicate.
        if entity_is_subject:
            return term + predicate + '?o'
        return '?o' + predicate + term

    if not terms:
        body = pattern('?s')
    elif len(terms) == 1:
        body = pattern(terms[0])
    else:
        body = ' UNION '.join('{ ' + pattern(term) + ' }' for term in terms)

    return prefix + body + ' }'




In [38]:
def create_query(triples):
    """Build the SPARQL query for a list of ``(entity, intent)`` triples.

    With exactly one triple the query is delegated to ``ask_one_pred``, whose
    result already contains the PREFIX/SELECT header; it is returned as is
    (the previous version prepended the header a second time, producing an
    invalid query).

    Parameters
    ----------
    triples : sequence
        Each item is indexed as ``item[0]`` (entity) and ``item[1]`` (intent).

    Returns
    -------
    str
    """
    if len(triples) == 1:
        entity, intent = triples[0][0], triples[0][1]
        return ask_one_pred(intent, entity)

    prefix = 'PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> PREFIX dbo: <http://dbpedia.org/ontology/> SELECT DISTINCT ?o WHERE { '
    # NOTE(review): ask_two_pred is not defined in the cells visible here, so
    # this branch raises NameError unless it is defined elsewhere; it is
    # assumed to return only the WHERE body — confirm.
    return prefix + ask_two_pred(triples)

In [39]:
def make_query(query):
    """Run ``query`` against the DBpedia SPARQL endpoint.

    Each result row is passed through ``sparql.unpack_row`` and the unpacked
    rows are returned as a list, in result order.
    """
    endpoint = 'http://dbpedia.org/sparql'
    return [sparql.unpack_row(record) for record in sparql.query(endpoint, query)]

In [40]:
def clean_entity(entity):
    """Turn a DBpedia resource IRI into a readable label.

    Removes every occurrence of the ``http://dbpedia.org/resource/`` prefix
    and replaces underscores with spaces.
    """
    return entity.replace("http://dbpedia.org/resource/", '').replace("_", " ")

In [41]:
#answer block
def get_answer(intent, entity, result):
    """Format the natural-language answer for ``intent``.

    The date (4) and country (3) answers now start with ``entity``; before,
    the parameter was ignored and those sentences began with a dangling
    space and no subject. Passing ``''`` as ``entity`` reproduces the old
    text exactly.

    Parameters
    ----------
    intent : int
        Intent id; must be a key of ``dbpedia_dict``.
    entity : str
        Readable name of the entity the question is about.
    result : str
        Already formatted query result.

    Returns
    -------
    str

    Raises
    ------
    KeyError
        If ``intent`` is not a key of ``dbpedia_dict``.
    """
    subject = dbpedia_dict[intent]

    if intent == 4:
        # date
        answer = f"{entity} was released on " + result
    elif intent == 3:
        # country
        answer = f"{entity} was published in " + result
    elif intent == 1:
        # books_from_author
        answer = "Here are some books that match your criteria : " + result
    elif intent == 2:
        # books_from_genre
        answer = "Here are some books that belongs to that genre : " + result
    else:
        # author / genre / language / publisher of a book
        answer = "The " + subject + " is : " + result

    return answer

In [53]:
def qa_book(question):
    """Answer a book-related ``question`` end to end.

    Steps: detect entity mentions, link them to DBpedia, classify the intent,
    build and run the SPARQL query, then format and print the answer.

    Fixes over the previous version:
    * the mention text (``ent[0]``) is sent to the linker, not ``str`` of the
      whole ``(text, label, start, end)`` tuple;
    * conjunct words are appended one by one as entity tuples, and before
      linking, so they are actually linked;
    * DBpedia Spotlight is used as a per-mention fallback when TagMe returns
      nothing (the old ``entity == ' '`` test on a list was never true and
      the fallback call passed an unsupported argument);
    * result rows from ``make_query`` are lists, so every value in a row is
      cleaned individually instead of calling ``.replace`` on the row;
    * the linked entities are joined into one readable string before being
      passed to ``get_answer``;
    * the answer, which used to be computed and dropped, is printed;
    * no query is sent when no entity could be linked.

    Parameters
    ----------
    question : str

    Returns
    -------
    tuple
        ``(question, entity, intent, intent_name, query)`` where ``entity`` is
        the list of linked DBpedia IRIs.
    """
    entities = get_match_entity(question)
    question_dep = get_name_dep_dict(question)

    # Two entities joined by a conjunction but only one detected: add the
    # conjunct words that are not already part of a detected mention.
    for word, dep in question_dep.items():
        if dep[0] != 'conj':
            continue
        if any(word.lower() in str(ent[0]).lower() for ent in entities):
            continue
        start = question.find(word)  # offset of the word's first occurrence
        entities.append((word, '', start, start + len(word)))
    print(entities)

    # Entity linking: TagMe first, DBpedia Spotlight as fallback.
    entity = []
    for ent in entities:
        mention = str(ent[0])
        uri = tagme_annotation(TOKEN, mention)
        if not uri or not uri.strip():
            fallback = NEL_spotlight(mention)
            uri = fallback[0] if fallback else ''
        if uri:
            entity.append(uri)

    # Intent classification; predict returns (predictions, raw_outputs).
    predictions, _ = model_transf.predict([question])
    intent = predictions[0]
    print(intent)

    query = ask_one_pred(intent, entity)
    print(query)

    # Without a linked entity the query has nothing to anchor on; skip it.
    rows = make_query(query) if entity else []
    result_list = []
    for row in rows:
        values = row if isinstance(row, (list, tuple)) else [row]
        for value in values:
            if value is not None:
                result_list.append(clean_entity(str(value)))
    result = ' , '.join(result_list)

    if result == '':
        answer = "Sorry, I'm not able to answer this question"
    else:
        entity_label = ' , '.join(clean_entity(uri) for uri in entity)
        answer = get_answer(intent, entity_label, result)
    print(answer)

    return question, entity, intent, class_dict[intent], query

In [55]:
get_match_entity("when was publish a book written by j.k rowling")


[('j.k rowling', 'PERSON', 35, 46)]

In [56]:
from spacy import displacy
displacy.render(nlp("what is the genre of a book written by j.k rowling in english"), style="dep")