In [1]:
import os
from ast import literal_eval
import pandas as pd
from joblib import load

from cdqa.utils.converter import filter_paragraphs
from cdqa.retriever.tfidf_sklearn import TfidfRetriever
from cdqa.utils.converter import generate_squad_examples
from cdqa.reader.bertqa_sklearn import BertProcessor, BertQA



In [2]:
# Load the BNP Paribas newsroom dump. The 'paragraphs' column is stored as the
# textual repr of a Python list, so it is parsed back with ast.literal_eval.
csv_path = '../data/bnpp_newsroom_v1.0/bnpp_newsroom-v1.0.csv'
df = pd.read_csv(csv_path, converters={'paragraphs': literal_eval})

# Run every article's paragraph list through cdqa's filter_paragraphs helper,
# then concatenate what is left into one space-separated string per article.
filtered_paragraphs = df['paragraphs'].apply(filter_paragraphs)
df['paragraphs'] = filtered_paragraphs
df['content'] = df['paragraphs'].apply(' '.join)

In [3]:
# Preview the first five rows (the default for head) to sanity-check the load.
df.head(5)

Unnamed: 0,date,title,category,link,abstract,paragraphs,content
0,18.02.2019,Corporate philanthropy & Scientific research: ...,Corporate philanthropy,https://group.bnpparibas/en/news/corporate-phi...,"On Monday, February 11th 2019, the BNP Paribas...","[On Monday, February 11th 2019, the BNP Pariba...","On Monday, February 11th 2019, the BNP Paribas..."
1,15.02.2019,Notification by the ECB of the 2018 Supervisor...,Press release,https://group.bnpparibas/en/press-release/noti...,,[BNP Paribas has received the notification by ...,BNP Paribas has received the notification by t...
2,15.02.2019,7 days of Economics: Foreign versus domestic d...,Economy,https://group.bnpparibas/en/news/7-days-econom...,,"[7 days of Economics: Eurozone, what does weak...","7 days of Economics: Eurozone, what does weake..."
3,13.02.2019,"Portrait of Jann Gallois, bold choregrapher",Corporate philanthropy,https://group.bnpparibas/en/news/portrait-jann...,"The choreographer Jann Gallois, supported by t...",[Corporate philanthropy & Scientific research:...,Corporate philanthropy & Scientific research: ...
4,12.02.2019,Participate in the 2nd edition of Roland-Garro...,Tennis,https://group.bnpparibas/en/news/participate-2...,"On 12th of February 2019, Roland Garros, BNP P...",[BNP Paribas Young Talent Team: the first ten ...,BNP Paribas Young Talent Team: the first ten y...


In [4]:
# Natural-language query used for both document retrieval and answer extraction.
# NOTE(review): the text contains a doubled "the"; it is kept verbatim because
# the recorded output of the final cell echoes this exact string.
question = (
    'Since when does the the Excellence Program of BNP Paribas exist?'
)

In [5]:
# Fit the TF-IDF retriever on the joined article text and rank the articles
# against the question. The recorded output of this cell shows a table with
# the three top-ranked titles and the elapsed time.
corpus = df['content']
retriever = TfidfRetriever(metadata=df)
retriever.fit(X=corpus)
closest_docs_indices = retriever.predict(X=question)

+------+-------+--------------------------------------------------------------------+
| rank | index |                               title                                |
+------+-------+--------------------------------------------------------------------+
|  1   |  788  | Green is the new blue: SRI news by BNP Paribas Investment Partners |
|  2   |  408  |        BNP Paribas’ commitment to universities and schools         |
|  3   |  2419 |        Federer challenges Stan to remain in big time (AFP)         |
+------+-------+--------------------------------------------------------------------+
Time: 0.00823 seconds


In [6]:
%%time
squad_examples = generate_squad_examples(question=question,
                                         closest_docs_indices=closest_docs_indices,
                                         metadata=df)

test_processor = BertProcessor(bert_model='bert-base-uncased', do_lower_case=True, is_training=False)
test_examples, test_features = test_processor.fit_transform(X=squad_examples)

3it [00:00, 2152.77it/s]


In [9]:
# Restore the pre-trained reader that was serialised with joblib.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files obtained from a trusted source.
model_dir = '../models/bert_qa_squad_v1.1_sklearn'
model_path = os.path.join(model_dir, 'bert_qa_squad_v1.1_sklearn.joblib')
reader = load(model_path)

# Override prediction-time settings on the loaded reader.
reader.output_dir = '../logs/bert_qa_squad_v1.1_sklearn'
reader.max_seq_length = 128
reader.predict_batch_size = 128

In [10]:
%%time
final_prediction = reader.predict(X=(test_examples, test_features))

CPU times: user 2.49 s, sys: 536 ms, total: 3.03 s
Wall time: 3.02 s


In [11]:
# Show the query next to the answer extracted by the reader.
report_lines = (
    ('question: {}', question),
    ('answer: {}', final_prediction),
)
for template, value in report_lines:
    print(template.format(value))

question: Since when does the the Excellence Program of BNP Paribas exist?
answer: January 2016
