In [None]:
import json
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import re 
import spotlight
import requests
import tensorflow as tf
from random import sample
import sparql
# Let TensorFlow allocate GPU memory on demand instead of reserving it all.
# Looping over the list (instead of indexing element 0) keeps the notebook
# runnable on CPU-only machines, where the list is empty, and applies the
# same setting to every visible GPU.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, enable=True)

In [None]:
import spacy
nlp = spacy.load('en_core_web_lg')

# SimpleDBpediaQA

In [None]:
#Load data
# Read the three SimpleDBpediaQA splits. The encoding is given explicitly so
# the result does not depend on the platform's default text encoding.
splits = {}
for split_name in ('train', 'test', 'valid'):
    with open('data/SimpleDBpediaQA/{}.json'.format(split_name), encoding='utf-8') as f:
        splits[split_name] = json.load(f)

train = splits['train']
test = splits['test']
valid = splits['valid']



In [None]:
train_df = pd.DataFrame(train['Questions'])
valid_df = pd.DataFrame(valid['Questions'])
test_df = pd.DataFrame(test['Questions'])

In [None]:
#Merge all data from SimpleDBpediaQA to filter questions related to books
simple_df = pd.concat([train_df, valid_df, test_df])

In [None]:
simple_df

In [None]:
simple_df.isna().sum()

In [None]:
#dictionary of conversion from Freebase to DBpedia
# Each Freebase predicate is mapped to the 'Predicate' field of the first
# entry in its PredicateList; when a Freebase predicate appears on several
# rows, the last row wins.
pred_dict = {
    freebase_pred: candidates[0]['Predicate']
    for freebase_pred, candidates in zip(simple_df['FreebasePredicate'], simple_df['PredicateList'])
}

In [None]:
pred_dict

In [None]:
#DBpedia predicates to be filtered
ont_list = ['http://dbpedia.org/ontology/notableWork', 'http://dbpedia.org/ontology/WrittenWork', 'http://dbpedia.org/ontology/author' , 'http://dbpedia.org/ontology/publisher',  'http://dbpedia.org/ontology/subsequentWork', 'http://dbpedia.org/ontology/Country', 'http://dbpedia.org/property/author', 'http://dbpedia.org/ontology/illustrator', 'http://dbpedia.org/property/notableworks>', 'http://dbpedia.org/ontology/nationality', 'http://dbpedia.org/ontology/releaseDate', 'http://dbpedia.org/ontology/birthDate', 'http://dbpedia.org/ontology/birthName', 'http://dbpedia.org/ontology/birthPlace', 'http://dbpedia.org/ontology/occupation', 'http://dbpedia.org/ontology/influencedBy', 'http://dbpedia.org/ontology/literaryGenre', 'http://dbpedia.org/ontology/country','http://dbpedia.org/ontology/notableWork', 'http://dbpedia.org/ontology/previousWork']


In [None]:
#dictionary of freebase and DBpedia translation filtered
for key, value in pred_dict.items():
    if value in ont_list:  
        print(key + ' --> ' + value)
    

In [None]:
#replace entity by their label
def replace_entity(data_list, ent_label='ENT', nlp_model=None):
    """Return each question with its named-entity spans replaced by a placeholder.

    Args:
        data_list: iterable of question strings.
        ent_label: text substituted for every entity span (default ``'ENT'``).
        nlp_model: callable that turns a string into an object exposing
            ``ents``, each with ``start_char`` and ``end_char``. Defaults to
            the notebook-level ``nlp`` pipeline.

    Returns:
        A list with exactly one string per input question, in input order.
        Questions without entities are returned unchanged.
    """
    pipeline = nlp if nlp_model is None else nlp_model
    replace_list = []
    for question in data_list:
        doc = pipeline(question)
        masked = question
        # Replace right-to-left: the offsets refer to the original text, and
        # editing from the end keeps the offsets of earlier spans valid.
        for ent in reversed(list(doc.ents)):
            masked = masked[:ent.start_char] + ent_label + masked[ent.end_char:]
        # Append once per question (not once per entity) so the output stays
        # aligned with the input.
        replace_list.append(masked)
    return replace_list

In [None]:
#function to filter predicate based on regex exp
def regex_filter(exp,val):
    """Tell whether the regular expression ``exp`` matches somewhere in ``val``.

    Args:
        exp: regular expression pattern passed to ``re.search``.
        val: value to test; anything that is not a non-empty string
            (``None``, ``''``, NaN, numbers) is treated as "no match".

    Returns:
        ``True`` if ``re.search`` finds a match, ``False`` otherwise. The
        result is always a bool, never ``None``.
    """
    if not isinstance(val, str) or not val:
        return False
    return re.search(exp, val) is not None
    

In [None]:
#transform predicate to a domain filter by the first word after www.freebase.com/
# regex=False: the prefix is a literal string, so '.' must not be read as a
# regex wildcard (the default of `regex` differs between pandas versions).
# The column is built in a single assignment from 'FreebasePredicate', so
# re-running the cell always gives the same result.
simple_df['theme'] = (
    simple_df['FreebasePredicate']
    .str.replace("www.freebase.com/", "", regex=False)
    .apply(lambda x : re.sub(r"(/.*)", '', x))  # drop everything from the first remaining '/'
)

In [None]:
#distribution plot of simple_df by domain
# Number of questions per theme, drawn as a bar chart sorted by theme name.
fig, ax = plt.subplots(figsize=(16,8))
theme_counts = simple_df['theme'].value_counts().sort_index(ascending=True)
theme_counts.plot(kind='bar', ax=ax)
fig.savefig('images/simple_theme.png', bbox_inches = 'tight')

In [None]:
#sentence length of SimpleDbpediaQA dataset
fig = plt.figure(figsize=(12,6))
plt.hist([len(question.split()) for question in simple_df['Query'] ], bins= 25)
fig.savefig('images/len_question_all.png')

In [None]:
max([len(question.split()) for question in simple_df['Query'] ])

## Book questions of SimpleDBpediaQA

In [None]:
#predicates related to books but not contained in the Freebase book domain. Some of them are not directly related to books, but the amount of available questions shouldn't affect the classification
pred_book = ['www.freebase.com/media_common/literary_genre/books_in_this_genre', 'www.freebase.com/cvg/computer_videogame/publisher', 'www.freebase.com/film/film/country']

In [None]:
#select predicates related to freebase domain books
# Distinct predicates whose string starts with the Freebase book prefix.
book_set = {
    pred
    for pred in simple_df['FreebasePredicate']
    if regex_filter(r'^www.freebase.com/book(.*)', pred)
}

In [None]:
publisher_list = simple_df['Query'][simple_df['FreebasePredicate'] == 'www.freebase.com/cvg/computer_videogame/publisher']

In [None]:
#get similarity between a target type of questions and an input question
def make_similar(reference, input_q, nlp_model=None):
    """Return the highest similarity between ``input_q`` and any reference question.

    Args:
        reference: iterable of reference question strings.
        input_q: the question to score.
        nlp_model: callable turning a string into an object with a
            ``similarity(other)`` method. Defaults to the notebook-level
            ``nlp`` pipeline.

    Returns:
        The maximum of ``similarity`` over all reference questions, or
        ``0.0`` when ``reference`` is empty.
    """
    pipeline = nlp if nlp_model is None else nlp_model
    input_vec = pipeline(input_q)
    # The maximum is taken over ALL reference questions; returning from inside
    # the loop would only ever score the first one.
    return max(
        (input_vec.similarity(pipeline(question)) for question in reference),
        default=0.0,
    )

In [None]:
#filtering predicates relating to newspaper.
book_set = ['www.freebase.com/book/written_work/original_language',
 'www.freebase.com/book/book/genre',
 'www.freebase.com/book/author/works_written',
 'www.freebase.com/book/written_work/author']

In [None]:
#Drop predicates related to newspapers and add other types of predicates that are not directly related to the book domain but can still be asked about books
book_filter = book_set + pred_book
book_filter

In [None]:
#filtering dataset to the selected predicates
# Keep, in every split, only the rows whose Freebase predicate is in book_filter.
train_book, valid_book, test_book = (
    split_df[split_df['FreebasePredicate'].isin(book_filter)]
    for split_df in (train_df, valid_df, test_df)
)

In [None]:
#merge train, valid and test of the subset
simple_df_book = pd.concat([train_book, test_book, valid_book])
simple_df_book = simple_df_book.reset_index(drop=True)
simple_df_book.head()

In [None]:
simple_df_book['FreebasePredicate'].value_counts()

In [None]:
#use predicate from Freebase to identify intents
# Freebase predicate -> short intent label.
intent_by_predicate = {
    'www.freebase.com/book/book/genre': 'genre',
    'www.freebase.com/media_common/literary_genre/books_in_this_genre': 'books_in_genre',
    'www.freebase.com/book/written_work/author': 'author_of_book',
    'www.freebase.com/book/author/works_written': 'book_from_author',
    'www.freebase.com/book/written_work/original_language': 'language',
    'www.freebase.com/cvg/computer_videogame/publisher': 'publisher',
    'www.freebase.com/film/film/country': 'country',
}
# Assign the relabelled column back to the frame in one step. Chained
# assignment of the form df[col].loc[mask] = value writes through an
# intermediate object, triggers SettingWithCopyWarning and is not guaranteed
# to modify the frame.
simple_df_book['FreebasePredicate'] = simple_df_book['FreebasePredicate'].replace(intent_by_predicate)


In [None]:
len(simple_df_book)/len(simple_df)

In [None]:
simple_df_book['FreebasePredicate'].value_counts()

In [None]:
#distribution plot of filtered dataset
fig = plt.figure(figsize=(12,6))
ax = simple_df_book['FreebasePredicate'].value_counts().sort_index(ascending=True)
ax.plot(kind='bar')
fig.savefig('images/data_book.png', bbox_inches = 'tight')

In [None]:
#question length for subset dataset -> change to complete dataset
fig = plt.figure(figsize=(10,6))
plt.hist([len(question.split()) for question in simple_df_book['Query']], bins= 15)
fig.savefig('images/len_question.png')

In [None]:
max([len(question.split()) for question in simple_df_book['Query']])

In [None]:
simple_df_book.to_csv('data_outputs/book_df.csv', index=False)

## Improving the book dataset with questions generated from DBpedia book abstracts

In [None]:
#create a dataframe from abstracts
abstract_query = sparql.query('http://dbpedia.org/sparql', """SELECT DISTINCT ?s ?label
WHERE {  ?s rdf:type dbo:Book .
  ?s dbo:abstract ?label .
    FILTER (lang(?label) = 'en')}
""") 

In [None]:
# Unpack every SPARQL result row and collect the rows into a two-column
# frame named ('book', 'abstract').
abstract_list = [sparql.unpack_row(row) for row in abstract_query]
abstract_df = pd.DataFrame(abstract_list, columns=['book', 'abstract'])
abstract_df.head()

In [None]:
abstract_df.shape

In [None]:
#load question generated from abstracts
import json

with open('data_outputs/abstract.json') as f:
    abstract = json.load(f)

In [None]:
#get similarity between a target type of questions and an input question
# NOTE: the notebook defines make_similar more than once; this later
# definition is the one in effect for the cells below.
def make_similar(reference, input_q, nlp_model=None):
    """Return the highest similarity between ``input_q`` and any reference question.

    Args:
        reference: iterable of reference question strings.
        input_q: the question to score.
        nlp_model: callable turning a string into an object with a
            ``similarity(other)`` method. Defaults to the notebook-level
            ``nlp`` pipeline.

    Returns:
        The maximum of ``similarity`` over all reference questions, or
        ``0.0`` when ``reference`` is empty.
    """
    pipeline = nlp if nlp_model is None else nlp_model
    input_vec = pipeline(input_q)
    # The maximum is taken over ALL reference questions; returning from inside
    # the loop would only ever score the first one.
    return max(
        (input_vec.similarity(pipeline(question)) for question in reference),
        default=0.0,
    )
        

In [None]:
#get generated question
question_list = [question for qg in abstract.values() for question in qg]


In [None]:
#replace entities by their label
delex_questions = []
for question in question_list:
    new_question = question
    # Entities are visited right-to-left so that character offsets computed
    # on the original text remain valid while the string is edited.
    for ent in reversed(nlp(question).ents):
        span_start = ent.start_char
        span_end = span_start + len(ent.text)
        new_question = "".join([new_question[:span_start], ent.label_, new_question[span_end:]])
    delex_questions.append(new_question)

In [None]:
new_df = pd.DataFrame(zip(question_list, delex_questions), columns = ['question', 'delex_question'])
new_df.head(20)

In [None]:
date_target = ['In date was published','in date was first published', 'when was the book published', 'when was the book released', 'when was work of art first published', 'in date was originally published', 'when was the novel published', "what is the released year","when was oublished"]
date_similarity = [make_similar(date_target, input_) for input_ in new_df['delex_question']]


In [None]:
new_df['date_similarity'] = date_similarity
date_df = list(set(new_df['question'][new_df['date_similarity']>0.90].to_list()))

In [None]:

# Candidates containing one of these words are treated as not asking for a
# date and are removed from date_df.
excluded_words = {'where', 'who', 'many','which', 'country', 'city'}
# Each offending question is listed once, even if it contains several of the
# excluded words.
others_date = [q for q in date_df if excluded_words.intersection(w.lower() for w in q.split())]
# A set makes the membership test constant-time instead of a list scan per question.
excluded_questions = set(others_date)
date_df = [q for q in date_df if q not in excluded_questions]

In [None]:
len(date_df)

In [None]:
date_df = pd.DataFrame(date_df, columns=['Query'])
date_df['pred'] = 'date'

In [None]:
# Questions labelled 'language' in the book subset, with every named entity
# replaced by its spaCy entity label.
simple_lang = simple_df_book['Query'][simple_df_book['FreebasePredicate']=='language']
lang_target_questions = []
for question in simple_lang:
    new_question = question
    # Right-to-left so the offsets of earlier entities stay valid.
    for ent in reversed(nlp(question).ents):
        span_start = ent.start_char
        span_end = span_start + len(ent.text)
        new_question = "".join([new_question[:span_start], ent.label_, new_question[span_end:]])
    lang_target_questions.append(new_question)

In [None]:
#lang_question = simple_df_book['Query'][simple_df_book['FreebasePredicate']=='language'].to_list()
lang_question = [ "what language was book ", "in which language did the book appear","what is the original language of the book", "in what language was realeased", "in what language was originally published", "what language is book written", "what language was book originally?","what language was the original edition?","what language was the novel written in?"]
new_df['lang_similarity'] = [make_similar(lang_question, input_) for input_ in new_df['delex_question']]

In [None]:
# Generated questions whose similarity to the language reference questions exceeds 0.90.
lang_list = new_df['question'][new_df['lang_similarity']>0.90].to_list()
# Candidates containing one of these words are treated as not asking for a
# language and are removed.
excluded_words = {'where', 'who', 'how','many', 'when', 'country'}
# Each offending question is listed once, even if it contains several of the
# excluded words.
others = [q for q in lang_list if excluded_words.intersection(w.lower() for w in q.split())]
# A set makes the membership test constant-time instead of a list scan per question.
excluded_questions = set(others)
lang_df = [q for q in lang_list if q not in excluded_questions]

In [None]:
# When the notebook is run top to bottom, lang_df is still a plain list here
# (it only becomes a DataFrame in the next cell), so indexing it with
# 'Query' raises TypeError. Accept both forms, then drop the last three
# questions as the original cell did.
# NOTE(review): the reason for discarding the last three entries is not
# visible in the notebook - confirm it is still wanted.
if isinstance(lang_df, pd.DataFrame):
    lang_df = lang_df['Query'].to_list()
lang_df = lang_df[:-3]

In [None]:
lang_df = pd.DataFrame(lang_df, columns=['Query'])
lang_df['pred'] = 'language'

In [None]:
country_df = pd.DataFrame(simple_df['Query'][simple_df['FreebasePredicate'] == 'www.freebase.com/film/film/country'])
country_df['pred'] = 'country'


In [None]:
# Histogram of how similar each country question is to the date reference
# questions. The original cell referenced `date_questions`, a name that is
# never defined in the notebook; the date reference questions are stored in
# `date_target`.
fig = plt.figure(figsize=(10,6))
plt.hist([make_similar(date_target, input_) for input_ in country_df['Query']], bins= 15)
fig.savefig('images/country.png')

In [None]:
# Two-column view of the book subset: the question text and its intent
# label, with the label column exposed under the name 'pred'.
intent_simple_df = simple_df_book[['Query', 'FreebasePredicate']].rename(
    columns={'FreebasePredicate': 'pred'}
)

In [None]:
final_df = pd.concat([intent_simple_df,  date_df])

In [None]:
intent_simple_df['pred'].value_counts()

In [None]:
final_df['pred'].value_counts()

In [None]:
#distribution plot of filtered dataset -> change to complete dataset
fig = plt.figure(figsize=(12,6))
ax = final_df['pred'].value_counts().sort_index(ascending=True)
ax.plot(kind='bar')
fig.savefig('images/data_book_final.png', bbox_inches = 'tight')

In [None]:
final_df

In [None]:
np.mean([len(question) for question in final_df['Query']])

In [None]:
#question length for subset dataset -> change to complete dataset
fig = plt.figure(figsize=(10,6))
plt.hist([len(question.split()) for question in final_df['Query']], bins= 50)
fig.savefig('len_question_book.png')

In [None]:
final_df['pred'].value_counts()

In [None]:
final_df.to_csv('data_outputs/new_intent_df.csv', index = False)

In [None]:
intent_simple_df.to_csv('data_outputs/simple_book_df.csv', index=False)

In [None]:
intent_simple_df['pred'].value_counts()

# NEL

In [None]:
def NEL_spotlight(question, support, confidence=0.4):
    """Link a question to a DBpedia resource with DBpedia Spotlight.

    Args:
        question: text sent to the Spotlight ``annotate`` endpoint.
        support: value forwarded as Spotlight's ``support`` argument.
            (The earlier version accepted this parameter but always sent 20.)
        confidence: value forwarded as Spotlight's ``confidence`` argument;
            defaults to 0.4, the value that was previously hard-coded.

    Returns:
        The ``'URI'`` of the first annotation, or ``''`` when the request
        fails or Spotlight returns no annotation. ``''`` is the same "no
        entity" marker the evaluation cells of this notebook compare against.
    """
    try:
        annotations = spotlight.annotate(
            'https://api.dbpedia-spotlight.org/en/annotate',
            question,
            confidence=confidence,
            support=support,
        )
    except (spotlight.SpotlightException, requests.exceptions.RequestException, ValueError):
        # No resource found, HTTP/network failure, or an unparsable response:
        # report "no entity" instead of hiding every possible error.
        return ''

    if not annotations:
        return ''
    return annotations[0].get('URI', '')

In [None]:
#https://github.com/ram-g-athreya/RNN-Question-Answering/blob/master/slot_filling.ipynb
def tagme_annotation(token, question):
    """Annotate a question with TagMe and return its highest-rho annotation.

    Args:
        token: gcube token sent to the TagMe service.
        question: text to annotate.

    Returns:
        A ``(dbpedia_uri, rho)`` tuple for the annotation with the largest
        ``rho``. ``('', 0.0)`` is returned when the service does not answer
        with HTTP 200 or yields no usable annotation.
    """
    no_annotation = ('', 0.0)

    # Passing the query through `params` lets requests URL-encode the question,
    # so characters such as '&', '#' or '+' cannot corrupt the query string.
    response = requests.get(
        "https://tagme.d4science.org/tagme/tag",
        params={'lang': 'en', 'gcube-token': token, 'text': question},
    )
    if response.status_code != 200:
        return no_annotation

    annotations = {}
    for annotation in json.loads(response.text).get('annotations', []):
        # Skip entries that lack the fields needed to build a result.
        if 'title' not in annotation or 'rho' not in annotation:
            continue
        uri = 'http://dbpedia.org/resource/' + annotation['title'].replace(' ', '_')
        annotations[uri] = annotation['rho']

    if not annotations:
        return no_annotation
    # Ascending sort by rho; the last element carries the highest score.
    return sorted(annotations.items(), key=lambda x: x[1])[-1]

In [None]:
nel_df = pd.read_csv('data_outputs/nel_data.csv')
nel_df.head()

In [None]:
# Missing Spotlight predictions are stored as NaN in the CSV; turn them into
# empty strings in each of the four Spotlight columns.
for spot_col in ('spot0.3', 'spot0.4', 'spot0.5', 'spot0.6'):
    nel_df[spot_col] = nel_df[spot_col].replace(np.nan, '', regex=True)

In [None]:
nel_df.isna().sum()

In [None]:

def qa_pred (data, pred):
    """Compute recall, precision and F1 of entity predictions against gold entities.

    An empty-string prediction means "no entity predicted".

    Args:
        data: sequence of gold entities.
        pred: sequence of predicted entities, aligned with ``data`` by position.

    Returns:
        ``(recall, precision, f1)`` where
        recall    = correct / number of gold items,
        precision = correct / number of non-empty predictions,
        f1        = harmonic mean of the two.
        A ratio whose denominator is zero is reported as ``0.0`` instead of
        raising ``ZeroDivisionError``.
    """
    total = 0      # gold items seen
    answered = 0   # non-empty predictions
    correct = 0    # predictions equal to the gold entity
    # zip pairs the items by position, so pandas Series with a non-default
    # index work the same way as plain lists.
    for gold, predicted in zip(data, pred):
        total += 1
        if predicted == '':
            continue
        answered += 1
        if predicted == gold:
            correct += 1

    recall = correct / total if total else 0.0
    precision = correct / answered if answered else 0.0
    if precision + recall:
        f1 = 2*((precision*recall)/(precision+recall))
    else:
        f1 = 0.0
    return recall, precision, f1

In [None]:
qa_pred(nel_df['Subject'], nel_df['spot0.3'])

In [None]:
qa_pred(nel_df['Subject'], nel_df['tagme'])

In [None]:
# 2x2 grid: distribution of the TagMe rho score over the questions for which
# DBpedia Spotlight returned no entity, one panel per Spotlight column.
# (A stray shell command that had been pasted at the end of this cell, and
# made it a syntax error, has been removed.)
fig, ((ax1,ax2), (ax3, ax4))= plt.subplots(nrows=2, ncols=2, figsize=(12,5))
panels = [
    (ax1, 'spot0.3', 'Tagme goodness DBpedia Spotlight confidence of 0.3'),
    (ax2, 'spot0.4', 'Tagme goodness DBpedia Spotlight confidence of 0.4'),
    (ax3, 'spot0.5', 'Tagme goodness vs DBpedia Spotlight confidence of 0.5'),
    (ax4, 'spot0.6', 'Tagme goodness DBpedia Spotlight confidence of 0.6'),
]
for panel_ax, spot_col, panel_title in panels:
    panel_ax.hist(nel_df['tagme_rho'][nel_df[spot_col]== '' ], bins= 15)
    panel_ax.set_title(panel_title)
plt.tight_layout()

In [None]:
fig, ax= plt.subplots(1,1, figsize=(10,5))
ax.hist(nel_df['tagme_rho'][nel_df['spot0.3']== '' ], bins= 15)
ax.set_title('Tagme goodness for DBpedia Spotlight annotations withc a confidence score of 0.3')
fig.savefig('images/dbpedia.png')

In [None]:
nel_df[nel_df['spot0.6']=='']

# Intent recognition

In [None]:
final_df = pd.read_csv('data_outputs/final_intent_df.csv')

In [None]:
final_df['pred'].value_counts()

In [None]:
#load libraries for BERT
from transformers import BertTokenizer, BertModel
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [None]:
vocab_size = tokenizer.vocab_size
# max_len is later handed to the tokenizer as `max_length`, which counts
# token ids (special tokens included), not whitespace-separated words.
# Measuring it on the encoded questions keeps the longest question from
# being cut short when it is padded to max_len.
max_len = max(len(tokenizer.encode(question, add_special_tokens=True)) for question in final_df['Query'])

In [None]:
max_len

In [None]:
#sequence embedding with BERT
# Turn every question into a fixed-length list of BERT token ids plus the
# matching attention mask. `padding='max_length'` replaces the deprecated
# `pad_to_max_length=True`; `truncation=True` makes the cut at max_len
# explicit instead of relying on an implicit default.
input_ids = []
attention_masks = []

for question in final_df['Query']:
  enc_question = tokenizer.encode_plus(
      question,
      add_special_tokens= True,
      max_length=max_len,
      padding='max_length',
      truncation=True,
      return_attention_mask = True,
  )
  input_ids.append(enc_question['input_ids'])
  attention_masks.append(enc_question['attention_mask'])

input_ids = np.array(input_ids)
attention_masks = np.array(attention_masks)


In [None]:
encoder = LabelEncoder()
intent_val = encoder.fit_transform(final_df['pred'])

In [None]:
set(intent_val)

In [None]:
#split data in train and test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test, train_mask, val_mask = train_test_split(input_ids , intent_val, attention_masks , random_state=42, test_size = 0.2)

In [None]:
vocab_size = tokenizer.vocab_size +1 
embedding_dim = 128

In [None]:
#model without class weights
import tensorflow as tf
# Size the output layer from the fitted label encoder instead of a
# hard-coded 8, so the model follows the number of intents in the data.
num_classes = len(encoder.classes_)
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length = max_len),
    tf.keras.layers.Dropout(0.2), 
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation = 'softmax')], name = 'LSTM')

# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
model.compile(loss = 'sparse_categorical_crossentropy', optimizer = tf.keras.optimizers.Adam(learning_rate=2e-4), metrics=['acc'])
model.summary()

In [None]:
history = model.fit(x_train, y_train,epochs = 10, validation_data=(x_test, y_test), verbose = 1, batch_size = 32)

In [None]:
#model with class weights
import tensorflow as tf
# Size the output layer from the fitted label encoder instead of a
# hard-coded 8, so the model follows the number of intents in the data.
num_classes = len(encoder.classes_)
lstm_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length = max_len),
    tf.keras.layers.Dropout(0.2), 
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation = 'softmax')], name = 'LSTM')

# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
lstm_model.compile(loss = 'sparse_categorical_crossentropy', optimizer = tf.keras.optimizers.Adam(learning_rate=2e-4), metrics=['acc'])
lstm_model.summary()

In [None]:
from sklearn.utils.class_weight import compute_class_weight
# One 'balanced' weight per intent, ordered like np.unique(final_df['pred']).
# The arguments are passed by keyword because current scikit-learn releases
# no longer accept `classes` and `y` positionally. Importing the function
# directly also avoids rebinding the name of the imported module.
class_weight = list(
    compute_class_weight(
        class_weight='balanced',
        classes=np.unique(final_df['pred']),
        y=final_df['pred'],
    )
)

In [None]:
final_df['pred'].value_counts()

In [None]:
# Map each position in the class_weight list to its weight, i.e. the
# {class index: weight} dict form passed to fit(class_weight=...).
weights = dict(enumerate(class_weight))

In [None]:
weights

In [None]:
history_weights = lstm_model.fit(x_train, y_train,epochs = 10, validation_data=(x_test, y_test), verbose = 1, batch_size = 8, class_weight=weights)

In [None]:
#plot accuracy
# Left panel: run without class weights; right panel: run with class weights.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12,4))
accuracy_panels = (
    (ax1, history, 'model accuracy for LSTM without weights'),
    (ax2, history_weights, 'model accuracy for LSTM with class weights'),
)
for panel_ax, run, panel_title in accuracy_panels:
    panel_ax.plot(run.history['acc'])
    panel_ax.plot(run.history['val_acc'])
    panel_ax.set_title(panel_title)
    panel_ax.legend(['train','val'], loc='right')
    panel_ax.set_ylabel('accuracy')
    panel_ax.set_xlabel('epoch')
fig.savefig('images/lstm_accuracy.jpeg')


In [None]:
#plot loss
# Left panel: run without class weights; right panel: run with class weights.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12,4))
loss_panels = (
    (ax1, history, 'Loss for LSTM without weights'),
    (ax2, history_weights, 'Loss for LSTM with class weights'),
)
for panel_ax, run, panel_title in loss_panels:
    panel_ax.plot(run.history['loss'])
    panel_ax.plot(run.history['val_loss'])
    panel_ax.set_title(panel_title)
    panel_ax.legend(['train','val'], loc='right')
    panel_ax.set_ylabel('loss')
    panel_ax.set_xlabel('epoch')
fig.savefig('images/lstm_loss.jpeg')

In [None]:
#prediction of test cases
from sklearn.metrics import classification_report
# Sequential.predict_classes is not available in current Keras releases;
# taking the argmax of the predicted probabilities gives the same class ids.
preds = np.argmax(model.predict(x_test), axis=-1)
# Labels and names come from the fitted encoder instead of a hard-coded 8,
# and passing `labels` keeps the report aligned with `target_names` even if
# a class is absent from the test split.
print(classification_report(
    y_test,
    preds,
    labels=np.arange(len(encoder.classes_)),
    target_names=encoder.classes_,
))


In [None]:
from sklearn.metrics import classification_report
# Sequential.predict_classes is not available in current Keras releases;
# taking the argmax of the predicted probabilities gives the same class ids.
preds = np.argmax(lstm_model.predict(x_test), axis=-1)
# Labels and names come from the fitted encoder instead of a hard-coded 8,
# and passing `labels` keeps the report aligned with `target_names` even if
# a class is absent from the test split.
print(classification_report(
    y_test,
    preds,
    labels=np.arange(len(encoder.classes_)),
    target_names=encoder.classes_,
))


In [None]:
from keras.utils import to_categorical
question_test  = "which type of book is alice in wonderland "
q_test_enc = tokenizer.encode(question_test, add_special_tokens= True)



In [None]:

# The model expects a batch of id sequences padded to max_len, so the test
# question is encoded the same way as the training data and wrapped in a
# batch of size one.
q_test_ids = tokenizer.encode_plus(
    question_test,
    add_special_tokens=True,
    max_length=max_len,
    padding='max_length',
    truncation=True,
)['input_ids']
# predict() returns class probabilities; argmax picks the most likely class id.
preds = lstm_model.predict(np.array([q_test_ids]))
prediction_ = np.argmax(preds, axis=-1)
encoder.inverse_transform(prediction_)

In [None]:
# Sequential.predict_classes is not available in current Keras releases;
# the argmax over the predicted probabilities yields the class ids.
y_pred = np.argmax(model.predict(x_test), axis=-1)

In [None]:
#save model
model.save('models/bilstm')

In [None]:
# Load from the same location the model was saved to ('models/bilstm').
bilstm_model = tf.keras.models.load_model('models/bilstm')

In [None]:
  # Encode one question with the same settings as the training data:
  # padded/truncated to max_len rather than to a hard-coded 24, using
  # `padding='max_length'` in place of the deprecated `pad_to_max_length`.
  question = "what is the language of Moby dick"
  enc_question = tokenizer.encode_plus(question, add_special_tokens= True, max_length=max_len, padding='max_length', truncation=True, return_attention_mask = True)


In [None]:
# enc_question is the tokenizer's dict-like output; the model takes only the
# token ids, as a batch of size one.
preds = bilstm_model.predict(np.array([enc_question['input_ids']]))

# Decode the predicted class id (the original cell always decoded class 0).
prediction_ = np.argmax(preds, axis=-1)
encoder.inverse_transform(prediction_)

