Code and data associated with the EACL 2021 paper "Process-Level Representation of Scientific Protocols with Interactive Annotation".
@inproceedings{tamari-etal-2021-process-level,
    title = "Process-Level Representation of Scientific Protocols with Interactive Annotation",
    author = "Tamari, Ronen  and
      Bai, Fan  and
      Ritter, Alan  and
      Stanovsky, Gabriel",
    booktitle = "Proceedings of the 16th Conference of the {E}uropean Chapter of the Association for Computational Linguistics: Volume 1, Long Papers",
    year = "2021",
    publisher = "Association for Computational Linguistics",
}
git clone https://github.com/bflashcp3f/textlabs-xwlp-code.git
cd textlabs-xwlp-code
conda env create -f environment.yml
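After the environment is created, activate it with conda activate <env-name>; the environment name is the one defined at the top of environment.yml.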
data/xwlp.json
contains our experimental data, generated by pre-processing the 279 AMR files of the X-WLP (eXecutable Wet Lab Protocols) dataset into the SciERC format.
data/cross_validation.json
contains the data splits for the 5-fold cross-validation used in our experiments.
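As a quick sanity check, both files can be loaded with a few lines of Python. The sketch below assumes nothing about the internal schema beyond the files being JSON (or JSON-lines, since SciERC-style data is often stored one document per line):

import json

def load_json_or_jsonl(path):
    # Load either a single JSON value or JSON-lines, whichever the file uses.
    with open(path) as f:
        text = f.read().strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        return [json.loads(line) for line in text.splitlines() if line.strip()]

xwlp = load_json_or_jsonl("data/xwlp.json")               # pre-processed X-WLP protocols
folds = load_json_or_jsonl("data/cross_validation.json")  # 5-fold cross-validation splits
print(len(xwlp), "records in xwlp.json")
print(type(folds).__name__, "holding the cross-validation folds")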
The scibert-scivocab-uncased checkpoint can be downloaded here, and the WLP dataset is available here (note that we use an older version of the dataset).
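Alternatively, the same SciBERT checkpoint is published on the Hugging Face Hub as allenai/scibert_scivocab_uncased. One option is to save it locally with transformers and point $SCIBERT_PATH at the saved directory; this assumes the training scripts accept a standard Hugging Face model directory, so the checkpoint linked above remains the safe choice:

from transformers import AutoModel, AutoTokenizer

# Download SciBERT from the Hugging Face Hub and save it in HF format.
model_name = "allenai/scibert_scivocab_uncased"
AutoTokenizer.from_pretrained(model_name).save_pretrained("scibert_scivocab_uncased/")
AutoModel.from_pretrained(model_name).save_pretrained("scibert_scivocab_uncased/")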
# Training / Evaluation
export WLP_PATH=<WLP_PATH>
export XWLP_PATH=<XWLP_PATH>
export SCIBERT_PATH=<SCIBERT_PATH>
export OUTPUT_DIR=<OUTPUT_DIR>
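Here <XWLP_PATH> is presumably data/xwlp.json from this repository, <WLP_PATH> the directory holding the downloaded WLP dataset, <SCIBERT_PATH> the unpacked SciBERT checkpoint, and <OUTPUT_DIR> any writable directory where models and predictions will be stored.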
python code/pipeline_men_iden.py \
--wlp_path $WLP_PATH \
--xwlp_path $XWLP_PATH \
--lm $SCIBERT_PATH \
--max_len 512 \
--batch_size 16 \
--gpu_ids 0,1,2,3 \
--learning_rate 2e-5 \
--epochs 5 \
--output_dir $OUTPUT_DIR/models/pipeline_mi \
--save_model
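If fewer GPUs are available, passing a single id (e.g. --gpu_ids 0) and reducing --batch_size should work; the same flags appear in all the commands below.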
# Inference
python code/pipeline_men_iden_infer.py \
--model_path $OUTPUT_DIR/models/pipeline_mi \
--xwlp_path $XWLP_PATH \
--max_len 512 \
--batch_size 16 \
--gpu_ids 0,1,2,3 \
--output_dir $OUTPUT_DIR/predictions/pipeline_mi
# Training
python code/pipeline_pred_ground.py \
--xwlp_path $XWLP_PATH \
--lm $SCIBERT_PATH \
--max_len 512 \
--batch_size 16 \
--gpu_ids 0,1,2,3 \
--learning_rate 2e-5 \
--epochs 20 \
--output_dir $OUTPUT_DIR/models/pipeline_pred_ground \
--save_model
# Inference
python code/pipeline_pred_ground_infer.py \
--model_path $OUTPUT_DIR/models/pipeline_pred_ground \
--data_path $OUTPUT_DIR/predictions/pipeline_mi \
--max_len 512 \
--batch_size 16 \
--gpu_ids 0,1,2,3 \
--output_dir $OUTPUT_DIR/predictions/pipeline_pred_ground
# Evaluation
python code/pipeline_pred_ground_eval.py \
--gold_input_path $OUTPUT_DIR/models/pipeline_pred_ground \
--pred_input_path $OUTPUT_DIR/predictions/pipeline_pred_ground
# Training
python code/pipeline_re.py \
--xwlp_path $XWLP_PATH \
--lm $SCIBERT_PATH \
--max_len 512 \
--batch_size 36 \
--gpu_ids 0,1,2,3 \
--learning_rate 2e-5 \
--epochs 5 \
--output_dir $OUTPUT_DIR/models/pipeline_re \
--save_model
# Inference
python code/pipeline_re_infer.py \
--xwlp_path $XWLP_PATH \
--model_path $OUTPUT_DIR/models/pipeline_re \
--data_path $OUTPUT_DIR/predictions/pipeline_pred_ground \
--max_len 512 \
--batch_size 128 \
--gpu_ids 0,1,2,3 \
--output_dir $OUTPUT_DIR/predictions/pipeline_re
# Evaluation
python code/pipeline_re_eval.py \
--xwlp_path data/xwlp.json \
--gold_input_pred_path $OUTPUT_DIR/models/pipeline_re \
--pred_input_pred_path $OUTPUT_DIR/predictions/pipeline_re
We adapt the DyGIE++ code (Wadden et al., 2019) for the multi-task experiments.
# Clone the DyGIE++ repo
cd ..
git clone https://github.com/dwadden/dygiepp.git
cd dygiepp
git checkout f59dcf7ccd
# Create conda env
conda create --name dygiepp python=3.7
conda activate dygiepp
pip install -r requirements.txt
# Download SciBERT checkpoint
python scripts/pretrained/get_scibert.py
# Copy scripts to right paths
cd ../textlabs-xwlp-code
cp scripts/train_xwlp_sliding.sh ../dygiepp/scripts/train/
cp scripts/predict_xwlp_sliding.sh ../dygiepp/scripts/train/
cp config/xwlp_sliding.jsonnet ../dygiepp/training_config/
# Generate data for experiments with the Multi-Task model
export MULTI_TASK_DATA=<MULTI_TASK_DATA>
python code/preprocess/generate_multi-task_data.py \
--xwlp_path $XWLP_PATH \
--window_size 5 \
--seq_len_limit 150 \
--output_dir $MULTI_TASK_DATA
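The --window_size and --seq_len_limit flags split each protocol into overlapping sentence windows so that DyGIE++ sees bounded-length inputs. The sketch below only illustrates the idea; it is not the repository's preprocessing code, and sliding_windows is a made-up name:

def sliding_windows(sentences, window_size=5, seq_len_limit=150):
    # Illustration only: take every run of up to `window_size` consecutive
    # sentences, dropping windows whose total token count exceeds `seq_len_limit`.
    windows = []
    for start in range(len(sentences)):
        window = sentences[start:start + window_size]
        if sum(len(s) for s in window) <= seq_len_limit:
            windows.append(window)
    return windows

# Toy example: 7 one-token "sentences" yield windows of decreasing size at the end.
toy = [[f"sent{i}"] for i in range(7)]
print([len(w) for w in sliding_windows(toy)])  # [5, 5, 5, 4, 3, 2, 1]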
cd ../dygiepp
export MULTI_TASK_MODEL=<MULTI_TASK_MODEL>
sh scripts/train/train_xwlp_sliding.sh $MULTI_TASK_DATA $MULTI_TASK_MODEL
sh scripts/train/predict_xwlp_sliding.sh $MULTI_TASK_MODEL $MULTI_TASK_DATA
cd ../textlabs-xwlp-code
# Predicate Grounding
python code/multi-task_pred_ground_eval.py \
--gold_path $MULTI_TASK_DATA \
--pred_path $MULTI_TASK_MODEL
# Argument Role Labeling + Temporal Ordering (RE)
python code/multi-task_re_eval.py \
--gold_path $MULTI_TASK_DATA \
--pred_path $MULTI_TASK_MODEL
If you have any questions or suggestions, feel free to email me at
<fan.bai@cc.gatech.edu>
or open a GitHub issue.
[Wadden et al., 2019] David Wadden, Ulan Wennberg, Yi Luan, and Hannaneh Hajishirzi. Entity, Relation, and Event Extraction with Contextualized Span Representations. EMNLP-IJCNLP 2019.