In [1]:
import os
from ast import literal_eval
import pandas as pd
from joblib import load

from cdqa.retriever.tfidf_doc_ranker import train_document_retriever
from cdqa.utils.converter import filter_paragraphs

In [2]:
# Read the newsroom CSV. The 'paragraphs' column is passed through
# literal_eval while loading, so each cell is parsed from its text form into
# a Python object instead of staying a raw string.
# NOTE(review): the recorded preview of this frame shows an 'Unnamed: 0'
# column, which suggests the CSV carries a saved index column; consider
# index_col=0 once it is confirmed that nothing downstream relies on it.
df = pd.read_csv(
    '../data/bnpp_newsroom_v1.0/bnpp_newsroom-v1.0.csv',
    converters={'paragraphs': literal_eval},
)

# Pass every article's paragraphs through cdqa's filter_paragraphs
# (its exact filtering rules live in cdqa.utils.converter).
df['paragraphs'] = df['paragraphs'].apply(filter_paragraphs)
# Build one space-joined text per article from the remaining paragraphs.
df['content'] = df['paragraphs'].apply(' '.join)

In [3]:
# Preview the first five rows to sanity-check the parsed columns.
df.head(n=5)

Unnamed: 0,date,title,category,link,abstract,paragraphs,content
0,18.02.2019,Corporate philanthropy & Scientific research: ...,Corporate philanthropy,https://group.bnpparibas/en/news/corporate-phi...,"On Monday, February 11th 2019, the BNP Paribas...","[On Monday, February 11th 2019, the BNP Pariba...","On Monday, February 11th 2019, the BNP Paribas..."
1,15.02.2019,Notification by the ECB of the 2018 Supervisor...,Press release,https://group.bnpparibas/en/press-release/noti...,,[BNP Paribas has received the notification by ...,BNP Paribas has received the notification by t...
2,15.02.2019,7 days of Economics: Foreign versus domestic d...,Economy,https://group.bnpparibas/en/news/7-days-econom...,,"[7 days of Economics: Eurozone, what does weak...","7 days of Economics: Eurozone, what does weake..."
3,13.02.2019,"Portrait of Jann Gallois, bold choregrapher",Corporate philanthropy,https://group.bnpparibas/en/news/portrait-jann...,"The choreographer Jann Gallois, supported by t...",[Corporate philanthropy & Scientific research:...,Corporate philanthropy & Scientific research: ...
4,12.02.2019,Participate in the 2nd edition of Roland-Garro...,Tennis,https://group.bnpparibas/en/news/participate-2...,"On 12th of February 2019, Roland Garros, BNP P...",[BNP Paribas Young Talent Team: the first ten ...,BNP Paribas Young Talent Team: the first ten y...


In [4]:
# Fit the TF-IDF document retriever on the joined article texts. The call
# returns two objects, kept here as the vectorizer and the corpus matrix
# (names follow the assignment; the concrete types are defined by cdqa).
(article_vectorizer, article_tfidf_matrix) = train_document_retriever(
    corpus=df['content'],
)

In [5]:
from cdqa.utils.converter import generate_squad_examples
from cdqa.retriever.tfidf_doc_ranker import predict_document_retriever
from cdqa.reader.bertqa_sklearn import BertProcessor, BertQA



In [6]:
# Question fed to the retriever and the reader further down.
# NOTE(review): the text contains a duplicated "the"; it is kept verbatim
# because the recorded outputs in this notebook were produced with exactly
# this string.
question = "Since when does the the Excellence Program of BNP Paribas exist?"

In [7]:
# Rank the articles against the question with the fitted vectorizer/matrix
# and keep the three best matches. With verbose=True the call also prints a
# rank / index / title table and the elapsed time, as shown in the output.
article_indices = predict_document_retriever(
    question=question,
    paragraphs=None,
    vectorizer=article_vectorizer,
    tfidf_matrix=article_tfidf_matrix,
    top_n=3,
    metadata=df,
    verbose=True,
)

+------+-------+--------------------------------------------------------------------+
| rank | index |                               title                                |
+------+-------+--------------------------------------------------------------------+
|  1   |  788  | Green is the new blue: SRI news by BNP Paribas Investment Partners |
|  2   |  408  |        BNP Paribas’ commitment to universities and schools         |
|  3   |  2419 |        Federer challenges Stan to remain in big time (AFP)         |
+------+-------+--------------------------------------------------------------------+
Time: 0.01145 seconds


In [8]:
# Turn the retrieved articles plus the question into SQuAD-style examples
# that the BERT processor below consumes.
squad_examples = generate_squad_examples(
    question=question,
    article_indices=article_indices,
    metadata=df,
)

3it [00:00, 2192.91it/s]


In [9]:
# Processor configured for inference (is_training=False) with the uncased
# base BERT model, lower-casing its input to match.
test_processor = BertProcessor(
    bert_model='bert-base-uncased',
    do_lower_case=True,
    is_training=False,
)

In [10]:
%%time
# Convert the SQuAD-style examples into the (examples, features) pair that
# model.predict expects further down.
(test_examples, test_features) = test_processor.fit_transform(
    X=squad_examples,
)

CPU times: user 496 ms, sys: 8 ms, total: 504 ms
Wall time: 797 ms


In [11]:
# Location of the serialized reader model.
model_path = os.path.join(
    '../models/bert_qa_squad_v1.1_sklearn',
    'bert_qa_squad_v1.1_sklearn.joblib',
)
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files that come from a trusted source.
model = load(model_path)

# Override prediction-time settings on the loaded model: where it writes its
# output, the maximum sequence length, and the batch size used by predict.
model.output_dir = '../logs/bert_qa_squad_v1.1_sklearn'
model.max_seq_length = 128
model.predict_batch_size = 128

In [12]:
%%time
# Run the reader on the prepared examples/features. predict returns four
# values; only final_prediction is used by the remaining cells shown here.
(
    final_prediction,
    all_predictions,
    all_nbest_json,
    scores_diff_json,
) = model.predict(X=(test_examples, test_features))

HBox(children=(IntProgress(value=0, description='Evaluating', max=2, style=ProgressStyle(description_width='in…


CPU times: user 2.43 s, sys: 600 ms, total: 3.03 s
Wall time: 3.03 s


In [13]:
# Show the question next to the answer picked by the reader, one per line.
print('\n'.join([
    'question: {}'.format(question),
    'answer: {}'.format(final_prediction),
]))

question: Since when does the the Excellence Program of BNP Paribas exist?
answer: January 2016
