Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
armancohan committed Apr 2, 2019
1 parent 1c2fda5 commit e144288
Show file tree
Hide file tree
Showing 9 changed files with 36 additions and 41 deletions.
11 changes: 6 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,18 +51,19 @@ Use pip to install dependencies in your desired python environment
Download one of the pre-trained models and run the following command:

```bash
allennlp predict [path-to-data.jsonl] [path-to-model.tar.gz] \
--predictor predictor_aclarc \
allennlp predict [path-to-model.tar.gz] [path-to-data.jsonl] \
--predictor [predictor-type] \
--include-package scicite \
--output-file [out-path.jsonl] \
--overrides "{'model':{'data_format':''}}"
```

Where
* `[path-to-data.jsonl]` contains the data in the same format as the training data.
* `[path-to-model.tar.gz]` is the path to the pretrained model
* `[predictor-type]` is one of `predictor_scicite` (for the SciCite dataset format) or `predictor_aclarc` (for the ACL-ARC dataset format).
* `--output-file [out-path.jsonl]` is an optional argument specifying the path of the output file. If you omit it, the output is printed to stdout.

You need to convert your data to be according to the training data.
If you are using your own data, you first need to convert it to the SciCite data format.

#### Pretrained models

Expand Down Expand Up @@ -106,7 +107,7 @@ Where the model output and logs will be stored in `[path-to-serialization-dir/]`

## Citing

If you found our dataset, or code useful, please cite our NAACL 2019 paper:
If you found our dataset or code useful, please cite [Structural Scaffolds for Citation Intent Classification in Scientific Publications](https://arxiv.org).

```
@InProceedings{Cohan2019Structural,
Expand Down
9 changes: 7 additions & 2 deletions experiment_configs/aclarc-experiment.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
// Converts the string "true" or "false" into the corresponding boolean.
// Any other value aborts evaluation with an "invalid boolean: ..." error
// whose message includes the JSON rendering of the offending value.
local stringToBool(s) =
if s == "true" then true
else if s == "false" then false
else error "invalid boolean: " + std.manifestJson(s);

local DIM =
if std.extVar("elmo")) then
if stringToBool(std.extVar("elmo")) then
1124
else:
else
100;
{
"random_seed": std.extVar("SEED"),
Expand Down
9 changes: 7 additions & 2 deletions experiment_configs/scicite-experiment.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
// Converts the string "true" or "false" into the corresponding boolean.
// Any other value aborts evaluation with an "invalid boolean: ..." error
// whose message includes the JSON rendering of the offending value.
local stringToBool(s) =
if s == "true" then true
else if s == "false" then false
else error "invalid boolean: " + std.manifestJson(s);

local DIM =
if std.extVar("elmo")) then
if stringToBool(std.extVar("elmo")) then
1224
else:
else
200;
{
"random_seed": std.extVar("SEED"),
Expand Down
3 changes: 2 additions & 1 deletion scicite/models/scaffold_bilstm_attention_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,8 +185,9 @@ def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor
argmax_indices = np.argmax(predictions, axis=-1)
labels = [self.vocab.get_token_from_index(x, namespace="labels")
for x in argmax_indices]
output_dict['class_probs'] = class_probabilities
output_dict['probabilities'] = class_probabilities
output_dict['positive_labels'] = labels
output_dict['prediction'] = labels
citation_text = []
for batch_text in output_dict['citation_text']:
citation_text.append([self.vocab.get_token_from_index(token_id.item()) for token_id in batch_text])
Expand Down
17 changes: 4 additions & 13 deletions scicite/predictors/predictor_acl_arc.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,24 +30,15 @@ def predict_json(self, inputs: JsonDict) -> JsonDict:
intent=citation.intent,
citing_paper_id=citation.citing_paper_id,
cited_paper_id=citation.cited_paper_id,
citation_excerpt_index=citation.citation_excerpt_index,
citation_id=citation.citation_id
citation_excerpt_index=citation.citation_excerpt_index
)
outputs = self._model.forward_on_instance(instance)
predictions = {}

label_to_index = {v: k for k, v in outputs['all_labels'].items()}
for i, prob in enumerate(outputs['class_probs']):
predictions[outputs['all_labels'][i]] = prob

label = max(predictions.items(), key=operator.itemgetter(1))[0]
return_dict['citation_id'] = citation.citation_id
return_dict['citingPaperId'] = outputs['citing_paper_id']
return_dict['citedPaperId'] = outputs['cited_paper_id']
return_dict['probabilities'] = predictions
return_dict['prediction'] = label
return_dict['original_label'] = citation.intent
# return_dict['attention_dist'] = outputs['attn_dist']
return_dict['probabilities'] = outputs['probabilities']
return_dict['prediction'] = outputs['prediction']
return return_dict

@overrides
Expand All @@ -56,7 +47,7 @@ def dump_line(self, outputs: JsonDict) -> str: # pylint: disable=no-self-use
If you don't want your outputs in JSON-lines format
you can override this function to output them differently.
"""
keys = ['citation_id', 'prediction', 'probabilities', 'original_label', 'citation_text', 'attention_dist', 'original_citation_text']
keys = ['citation_id', 'prediction', 'probabilities', 'citation_text']
for k in outputs.copy():
if k not in keys:
outputs.pop(k)
Expand Down
22 changes: 7 additions & 15 deletions scicite/predictors/predictor_scicite.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,23 +40,15 @@ def predict_json(self, inputs: JsonDict) -> JsonDict:
citation_excerpt_index=citation.citation_excerpt_index
)
outputs = self._model.forward_on_instance(instance)
predictions = {}

label_to_index = {v: k for k, v in outputs['all_labels'].items()}
for i, prob in enumerate(outputs['class_probs']):
predictions[outputs['all_labels'][i]] = prob

label = max(predictions.items(), key=operator.itemgetter(1))[0]
return_dict['citation_id'] = citation.citation_id
return_dict['citingPaperId'] = outputs['citing_paper_id']
return_dict['citedPaperId'] = outputs['cited_paper_id']
return_dict['citingPaperId'] = outputs.get('citing_paper_id')
return_dict['citedPaperId'] = outputs.get('cited_paper_id')
return_dict['citation_id'] = citation.citation_id
return_dict['probabilities'] = predictions
return_dict['prediction'] = label
return_dict['probabilities'] = outputs.get('probabilities')
return_dict['prediction'] = outputs['prediction']
return_dict['original_label'] = citation.intent
return_dict['citation_text'] = outputs['citation_text']
return_dict['original_citation_text'] = citation.text
return_dict['attention_dist'] = outputs['attn_dist']
return_dict['citation_text'] = outputs.get('citation_text')
return_dict['attention_dist'] = outputs.get('attn_dist')
return return_dict

@overrides
Expand All @@ -65,7 +57,7 @@ def dump_line(self, outputs: JsonDict) -> str: # pylint: disable=no-self-use
If you don't want your outputs in JSON-lines format
you can override this function to output them differently.
"""
keys = ['citedPaperId', 'citingPaperId', 'excerptCitationIntents']
keys = ['citedPaperId', 'citingPaperId', 'excerptCitationIntents', 'prediction']
for k in outputs.copy():
if k not in keys:
outputs.pop(k)
Expand Down
2 changes: 1 addition & 1 deletion tests/models/model_test_aclarc.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

sys.path.append(str(Path('.').absolute()))

from scicite.allennlp_modules.train_multitask_two_tasks import train_model_from_file
from scicite.training.train_multitask_two_tasks import train_model_from_file
from scicite.constants import root_path
from scicite.models.scaffold_bilstm_attention_classifier import ScaffoldBilstmAttentionClassifier
from scicite.dataset_readers.citation_data_reader_scicite import SciciteDatasetReader
Expand Down
2 changes: 1 addition & 1 deletion tests/models/model_test_scicite.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

sys.path.append(str(Path('.').absolute()))

from scicite.allennlp_modules.train_multitask_two_tasks import train_model_from_file
from scicite.training.train_multitask_two_tasks import train_model_from_file
from scicite.constants import root_path
from scicite.models.scaffold_bilstm_attention_classifier import ScaffoldBilstmAttentionClassifier
from scicite.dataset_readers.citation_data_reader_scicite import SciciteDatasetReader
Expand Down
2 changes: 1 addition & 1 deletion tests/predictors/aclarc_predictor_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import sys
sys.path.append(str(Path('.').absolute()))

from scicite.allennlp_modules.train_multitask_two_tasks import train_model_from_file
from scicite.training.train_multitask_two_tasks import train_model_from_file
from scicite.constants import root_path

sys.path.append(root_path)
Expand Down

0 comments on commit e144288

Please sign in to comment.