#71 #65 (#73)

Small fixes
cdqa-suite · Mar 9, 2019 · 009caff · 009caff
1 parent 05a49ea
commit 009caff
Show file tree

Hide file tree

Showing 4 changed files with 28 additions and 11 deletions.
diff --git a/README.md b/README.md
@@ -35,6 +35,7 @@ pip install .
 │   │   └── train.py --> trains a model given an input dataset already processed
 │   ├── reader
 │   │   ├── __init__.py
+│   │   ├── bertqa_sklearn.py --> A BertForQuestionAnswering sklearn wrapper based on run_squad.py's main() function
 │   │   └── run_squad.py --> a mirror of the pytorch-pretrained-BERT example (used for pipeline steps)
 │   ├── retriever
 │   │   ├── __init__.py
@@ -61,13 +62,26 @@ pip install .
 
 ## Getting started
 
-Download existing data and models with the `download.py` script:
+To download existing data and models automatically from the GitHub releases, you will need a personal GitHub token. You can find out [how to create one here](https://github.com/settings/tokens). You only need to select the `repo` scope.
 
 ```shell
 export token='YOUR_GITHUB_TOKEN'
+```
+
+You can now execute the `download.py` script to get all GitHub release assets:
+
+```shell
 python cdqa/pipeline/download.py
 ```
 
+To speed up training and prediction, you will need to install [`apex`](https://github.com/nvidia/apex):
+
+```shell
+git clone https://github.com/NVIDIA/apex.git
+cd apex/
+python setup.py install --cuda_ext --cpp_ext
+```
+
 You can now execute the [`examples`](examples) or the [`pipeline`](cdqa/pipeline) steps to use the application.
 
 ## Contributing

diff --git a/cdqa/pipeline/predict.py b/cdqa/pipeline/predict.py
@@ -1,3 +1,4 @@
+import os
 import pandas as pd
 from ast import literal_eval
 from joblib import load
@@ -25,9 +26,9 @@
                                          article_indices=article_indices,
                                          metadata=df)
 
-test_processor = BertProcessor(bert_model='bert-base-uncased', is_training=False)
+test_processor = BertProcessor(bert_model='bert-base-uncased', do_lower_case=True, is_training=False)
 test_examples, test_features = test_processor.fit_transform(X=squad_examples)
-model = load('model.joblib') 
+model = load(os.path.join('models/bert_qa_squad_v1.1_sklearn', 'bert_qa_squad_v1.1_sklearn.joblib'))
 predictions = model.predict(X=test_features)
 
 print(question)

diff --git a/cdqa/pipeline/train.py b/cdqa/pipeline/train.py
@@ -1,3 +1,4 @@
+import os
 import pandas as pd
 from ast import literal_eval
 from joblib import dump
@@ -15,16 +16,17 @@
 dump(article_tfidf_matrix, 'models/article_tfidf_matrix.joblib')
 
 # train document reader
-train_processor = BertProcessor(bert_model='bert-base-uncased', is_training=True)
+train_processor = BertProcessor(bert_model='bert-base-uncased', do_lower_case=True, is_training=True)
 train_examples, train_features = train_processor.fit_transform(X='data/train-v1.1.json')
 
 model = BertQA(bert_model='bert-base-uncased',
-               custom_weights=False,
                train_batch_size=12,
                learning_rate=3e-5,
                num_train_epochs=2,
-               output_dir='logs/bert_qa_squad_v1.1_sklearn')
+               do_lower_case=True,
+               fp16=True,
+               output_dir='models/bert_qa_squad_v1.1_sklearn')
 
 model.fit(X=(train_examples, train_features))
 
-dump(model, 'model.joblib')
+dump(model, os.path.join(model.output_dir, 'bert_qa_squad_v1.1_sklearn.joblib'))
diff --git a/cdqa/reader/bertqa_sklearn.py b/cdqa/reader/bertqa_sklearn.py
@@ -775,7 +775,7 @@ class BertProcessor(BaseEstimator, TransformerMixin):
 
     def __init__(self,
                  bert_model,
-                 do_lower_case=True,
+                 do_lower_case=False,
                  is_training=False,
                  version_2_with_negative=False,
                  max_seq_length=384,
@@ -814,7 +814,7 @@ class BertQA(BaseEstimator):
 
     def __init__(self,
                  bert_model,
-                 custom_weights=True,
+                 custom_weights=False,
                  train_batch_size=32,
                  predict_batch_size=8,
                  learning_rate=5e-5,
@@ -826,9 +826,9 @@ def __init__(self,
                  no_cuda=False,
                  seed=42,
                  gradient_accumulation_steps=1,
-                 do_lower_case=True,
+                 do_lower_case=False,
                  local_rank=-1,
-                 fp16=True,
+                 fp16=False,
                  loss_scale=0,
                  version_2_with_negative=False,
                  null_score_diff_threshold=0.0,