From 3932d69ec80e0590b8657de9abbd179db150a3be Mon Sep 17 00:00:00 2001
From: fmikaelian <39884124+fmikaelian@users.noreply.github.com>
Date: Sun, 28 Apr 2019 11:51:24 +0200
Subject: [PATCH 1/4] Add github badges #87

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 9fd6464..e450864 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,11 @@
 # cdQA
 [![Build Status](https://travis-ci.com/fmikaelian/cdQA.svg?token=Vzy9RRKRZ41ynd9q2BRX&branch=develop)](https://travis-ci.com/fmikaelian/cdQA)
 [![codecov](https://codecov.io/gh/fmikaelian/cdQA/branch/develop/graph/badge.svg?token=F16X0IU6RT)](https://codecov.io/gh/fmikaelian/cdQA)
+[![PyPI Downloads](https://img.shields.io/pypi/dm/tensorflow.svg)](https://pypi.org/project/tensorflow/)
+[![PyPI Version](https://img.shields.io/pypi/v/tensorflow.svg)](https://pypi.org/project/tensorflow/)
 [![Binder](https://mybinder.org/badge.svg)]()
 [![Colab](https://colab.research.google.com/assets/colab-badge.svg)]()
-[![License](
-https://img.shields.io/badge/License-MIT-yellow.svg)](https://choosealicense.com/licenses/mit/)
+[![License](https://img.shields.io/badge/License-MIT-yellow.svg)](https://choosealicense.com/licenses/mit/)
 
 An end-to-end closed-domain question answering system with BERT and classic IR methods 📚
 
@@ -170,7 +171,6 @@ python cdqa/pipeline/download.py
 
 The data is saved in `/data` and the models in `/models`. You can load the models with `joblib.load()`.
 
-
 ### Practical examples
 
 A complete worfklow is described in our [`examples`](examples) notebook.
From 3311885988c7fbeca12dfc0b640ee22d7367fe26 Mon Sep 17 00:00:00 2001
From: fmikaelian <39884124+fmikaelian@users.noreply.github.com>
Date: Sun, 28 Apr 2019 11:58:04 +0200
Subject: [PATCH 2/4] Disable verbose during predictions #103

---
 cdqa/reader/bertqa_sklearn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cdqa/reader/bertqa_sklearn.py b/cdqa/reader/bertqa_sklearn.py
index 5c11fab..5cf0dde 100644
--- a/cdqa/reader/bertqa_sklearn.py
+++ b/cdqa/reader/bertqa_sklearn.py
@@ -1186,7 +1186,7 @@ def predict(self, X):
         self.model.eval()
         all_results = []
         logger.info("Start evaluating")
-        for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating", disable=self.local_rank not in [-1, 0]):
+        for input_ids, input_mask, segment_ids, example_indices in eval_dataloader:
             if len(all_results) % 1000 == 0:
                 logger.info("Processing example: %d" % (len(all_results)))
             input_ids = input_ids.to(self.device)
From 95931376009f9f3ef6c60fa124ee933967a2685b Mon Sep 17 00:00:00 2001
From: fmikaelian <39884124+fmikaelian@users.noreply.github.com>
Date: Mon, 29 Apr 2019 10:02:36 +0200
Subject: [PATCH 3/4] fix typos and tests #95

---
 cdqa/retriever/tfidf_doc_ranker.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cdqa/retriever/tfidf_doc_ranker.py b/cdqa/retriever/tfidf_doc_ranker.py
index bfade32..d39d76a 100644
--- a/cdqa/retriever/tfidf_doc_ranker.py
+++ b/cdqa/retriever/tfidf_doc_ranker.py
@@ -55,12 +55,12 @@ class TfidfRetriever(BaseEstimator):
     """
 
     def __init__(self,
+                 metadata,
                  ngram_range=(1, 2),
                  max_df=0.85,
                  stop_words='english',
                  paragraphs=None,
                  top_n=3,
-                 metadata,
                  verbose=True):
 
         self.ngram_range = ngram_range
@@ -85,10 +85,10 @@ def predict(self, X):
         t0 = time.time()
         question_vector = self.vectorizer.transform([X])
         scores = pd.DataFrame(self.tfidf_matrix.dot(question_vector.T).toarray())
-        closest_docs_indices = scores.sort_values(by=0, ascending=False).index[:top_n].values
+        closest_docs_indices = scores.sort_values(by=0, ascending=False).index[:self.top_n].values
 
         # inspired from https://github.com/facebookresearch/DrQA/blob/50d0e49bb77fe0c6e881efb4b6fe2e61d3f92509/scripts/reader/interactive.py#L63
-        if verbose:
+        if self.verbose:
             rank = 1
             table = prettytable.PrettyTable(['rank', 'index', 'title'])
             for i in range(len(closest_docs_indices)):
From db3fa6697870a098948adafed85eb27e4e6502cf Mon Sep 17 00:00:00 2001
From: fmikaelian <39884124+fmikaelian@users.noreply.github.com>
Date: Mon, 29 Apr 2019 10:06:41 +0200
Subject: [PATCH 4/4] Rename variables and scripts #108

---
 cdqa/utils/converter.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cdqa/utils/converter.py b/cdqa/utils/converter.py
index d589029..9e78b82 100644
--- a/cdqa/utils/converter.py
+++ b/cdqa/utils/converter.py
@@ -54,7 +54,7 @@ def df2squad(df, squad_version='v2.0', output_dir=None, filename=None):
 
     return json_data
 
-def generate_squad_examples(question, article_indices, metadata):
+def generate_squad_examples(question, closest_docs_indices, metadata):
     """
     Creates a SQuAD examples json object for a given for a given question using outputs of retriever and document database.
 
@@ -62,7 +62,7 @@ def generate_squad_examples(question, article_indices, metadata):
     ----------
     question : [type]
         [description]
-    article_indices : [type]
+    closest_docs_indices : [type]
         [description]
     metadata : [type]
         [description]
@@ -76,7 +76,7 @@ def generate_squad_examples(question, article_indices, metadata):
     --------
     >>> from cdqa.utils.converter import generate_squad_examples
     >>> squad_examples = generate_squad_examples(question='Since when does the the Excellence Program of BNP Paribas exist?',
-                                                 article_indices=[788, 408, 2419],
+                                                 closest_docs_indices=[788, 408, 2419],
                                                  metadata=df)
 
     """
@@ -84,7 +84,7 @@ def generate_squad_examples(question, article_indices, metadata):
 
     squad_examples = []
 
-    metadata_sliced = metadata.loc[article_indices]
+    metadata_sliced = metadata.loc[closest_docs_indices]
 
     for index, row in tqdm(metadata_sliced.iterrows()):
         temp = {'title': row['title'],