Skip to content

Commit

Permalink
#71 #65 (#73)
Browse files Browse the repository at this point in the history
Small fixes
  • Loading branch information
fmikaelian committed Mar 9, 2019
1 parent 05a49ea commit 009caff
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 11 deletions.
16 changes: 15 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ pip install .
│ │ └── train.py --> trains a model given an input dataset already processed
│ ├── reader
│ │ ├── __init__.py
│ │ ├── bertqa_sklearn.py --> A BertForQuestionAnswering sklearn wrapper based on run_squad.py's main() function
│ │ └── run_squad.py --> a mirror of the pytorch-pretrained-BERT example (used for pipeline steps)
│ ├── retriever
│ │ ├── __init__.py
Expand All @@ -61,13 +62,26 @@ pip install .

## Getting started

Download existing data and models with the `download.py` script:
To download existing data and models automatically from the GitHub releases, you will need a personal GitHub token. You can find out [how to create one here](https://github.com/settings/tokens). You only need to select the `repo` scope.

```shell
export token='YOUR_GITHUB_TOKEN'
```

You can now execute the `download.py` script to get all GitHub release assets:

```shell
python cdqa/pipeline/download.py
```

To speed up training and prediction, you will need to install [`apex`](https://github.com/nvidia/apex):

```shell
git clone https://github.com/NVIDIA/apex.git
cd apex/
python setup.py install --cuda_ext --cpp_ext
```

You can now execute the [`examples`](examples) or the [`pipeline`](cdqa/pipeline) steps to use the application.

## Contributing
Expand Down
5 changes: 3 additions & 2 deletions cdqa/pipeline/predict.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import pandas as pd
from ast import literal_eval
from joblib import load
Expand Down Expand Up @@ -25,9 +26,9 @@
article_indices=article_indices,
metadata=df)

test_processor = BertProcessor(bert_model='bert-base-uncased', is_training=False)
test_processor = BertProcessor(bert_model='bert-base-uncased', do_lower_case=True, is_training=False)
test_examples, test_features = test_processor.fit_transform(X=squad_examples)
model = load('model.joblib')
model = load(os.path.join('models/bert_qa_squad_v1.1_sklearn', 'bert_qa_squad_v1.1_sklearn.joblib'))
predictions = model.predict(X=test_features)

print(question)
Expand Down
10 changes: 6 additions & 4 deletions cdqa/pipeline/train.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import pandas as pd
from ast import literal_eval
from joblib import dump
Expand All @@ -15,16 +16,17 @@
dump(article_tfidf_matrix, 'models/article_tfidf_matrix.joblib')

# train document reader
train_processor = BertProcessor(bert_model='bert-base-uncased', is_training=True)
train_processor = BertProcessor(bert_model='bert-base-uncased', do_lower_case=True, is_training=True)
train_examples, train_features = train_processor.fit_transform(X='data/train-v1.1.json')

model = BertQA(bert_model='bert-base-uncased',
custom_weights=False,
train_batch_size=12,
learning_rate=3e-5,
num_train_epochs=2,
output_dir='logs/bert_qa_squad_v1.1_sklearn')
do_lower_case=True,
fp16=True,
output_dir='models/bert_qa_squad_v1.1_sklearn')

model.fit(X=(train_examples, train_features))

dump(model, 'model.joblib')
dump(model, os.path.join(model.output_dir, 'bert_qa_squad_v1.1_sklearn.joblib'))
8 changes: 4 additions & 4 deletions cdqa/reader/bertqa_sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -775,7 +775,7 @@ class BertProcessor(BaseEstimator, TransformerMixin):

def __init__(self,
bert_model,
do_lower_case=True,
do_lower_case=False,
is_training=False,
version_2_with_negative=False,
max_seq_length=384,
Expand Down Expand Up @@ -814,7 +814,7 @@ class BertQA(BaseEstimator):

def __init__(self,
bert_model,
custom_weights=True,
custom_weights=False,
train_batch_size=32,
predict_batch_size=8,
learning_rate=5e-5,
Expand All @@ -826,9 +826,9 @@ def __init__(self,
no_cuda=False,
seed=42,
gradient_accumulation_steps=1,
do_lower_case=True,
do_lower_case=False,
local_rank=-1,
fp16=True,
fp16=False,
loss_scale=0,
version_2_with_negative=False,
null_score_diff_threshold=0.0,
Expand Down

0 comments on commit 009caff

Please sign in to comment.