Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix text NER evaluation #7

Merged
merged 7 commits into from Feb 16, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
7 changes: 6 additions & 1 deletion .gitignore
@@ -1,2 +1,7 @@

# general
.DS_Store
dataset/
manifest/
save/
slue_toolkit.egg-info/
__pycache__/
3 changes: 2 additions & 1 deletion baselines/ner/nlp_scripts/eval-deberta.sh
Expand Up @@ -8,4 +8,5 @@ python slue_toolkit/text_ner/ner_deberta.py eval \
--model_type $model_type \
--eval_asr False \
--eval_subset $eval_set \
--eval_label $eval_label
--eval_label $eval_label \
--save_results True
2 changes: 1 addition & 1 deletion baselines/sentiment/README.md
Expand Up @@ -31,6 +31,6 @@ To evaluate the fine-tuned nlp model, run following command or run `baselines/se

First, the ASR transcription needs to be prepared in the manifest dir, and then evaluation can be done using the same evaluation script as the NLP topline.
```sh
python slue_toolkit/prepare/prepare_voxceleb_asr_pred.py --data manifest/slue-voxceleb --pred-data datasets/slue-voxceleb/preds/vc1/w2v2-large-lv60k-ft-slue-vc1-12h-lr1e-5-s1-mt800000-8gpu-update280000
python slue_toolkit/prepare/prepare_voxceleb_asr_pred.py --data manifest/slue-voxceleb --pred-data dataset/slue-voxceleb/preds/vc1/w2v2-large-lv60k-ft-slue-vc1-12h-lr1e-5-s1-mt800000-8gpu-update280000
python slue_toolkit/eval/eval_nlp_sentiment.py --save-dir save/sentiment/nlp_topline_bert-base-cased --data manifest/slue-voxceleb --subset test.asr-pred
```
4 changes: 2 additions & 2 deletions baselines/sentiment/pipeline_scripts/eval.sh
@@ -1,11 +1,11 @@
#!/bin/bash

python3 slue_toolkit/prepare/prepare_voxceleb_asr_pred.py --data manifest/slue-voxceleb --pred-data datasets/slue-voxceleb/preds/vc1/w2v2-large-lv60k-ft-slue-vc1-12h-lr1e-5-s1-mt800000-8gpu-update280000
python3 slue_toolkit/prepare/prepare_voxceleb_asr_pred.py --data manifest/slue-voxceleb --pred-data dataset/slue-voxceleb/preds/vc1/w2v2-large-lv60k-ft-slue-vc1-12h-lr1e-5-s1-mt800000-8gpu-update280000

python3 slue_toolkit/eval/eval_nlp_sentiment.py \
--data manifest/slue-voxceleb \
--subset test.asr-pred \
--save-dir save/sentiment/nlp_topline_bert-base-cased \
--use-gpu \
--eval \


8 changes: 4 additions & 4 deletions scripts/download_datasets.sh
@@ -1,12 +1,12 @@
#!/bin/bash

#1. Download
wget https://papers-slue.awsdev.asapp.com/slue-voxceleb_blind.tar.gz -P datasets/
wget https://papers-slue.awsdev.asapp.com/slue-voxpopuli_blind.tar.gz -P datasets/
wget https://papers-slue.awsdev.asapp.com/slue-voxceleb_blind.tar.gz -P dataset/
wget https://papers-slue.awsdev.asapp.com/slue-voxpopuli_blind.tar.gz -P dataset/

#2. Extract
tar -xzvf datasets/slue-voxceleb_blind.tar.gz -C datasets/
tar -xzvf datasets/slue-voxpopuli_blind.tar.gz -C datasets/
tar -xzvf dataset/slue-voxceleb_blind.tar.gz -C dataset/
tar -xzvf dataset/slue-voxpopuli_blind.tar.gz -C dataset/

#3. preprocess

Expand Down
3 changes: 3 additions & 0 deletions setup.py
Expand Up @@ -23,6 +23,9 @@
"fire",
"editdistance",
"soundfile",
"transformers",
"datasets",
"seqeval",
],
entry_points={},
include_package_data=True,
Expand Down
2 changes: 1 addition & 1 deletion slue_toolkit/prepare/prepare_voxceleb.py
Expand Up @@ -106,7 +106,7 @@ def create_split(


def create_manifest(
data_dir="datasets/slue-voxceleb",
data_dir="dataset/slue-voxceleb",
manifest_dir="manifest/slue-voxceleb",
is_blind=True,
):
Expand Down
2 changes: 1 addition & 1 deletion slue_toolkit/prepare/prepare_voxceleb_asr_pred.py
Expand Up @@ -18,7 +18,7 @@ def main():
"--pred-data",
type=str,
required=True,
default="datasets/slue-voxceleb/preds/vc1/w2v2-large-lv60k-ft-slue-vc1-12h-lr1e-5-s1-mt800000-8gpu-update280000",
default="dataset/slue-voxceleb/preds/vc1/w2v2-large-lv60k-ft-slue-vc1-12h-lr1e-5-s1-mt800000-8gpu-update280000",
help="Root directory containing voxceleb1_slue data files,"
"This dir should contain audio/ voxceleb1_slue_{finetune,dev,test} folders ",
)
Expand Down
2 changes: 1 addition & 1 deletion slue_toolkit/prepare/prepare_voxpopuli.py
Expand Up @@ -31,7 +31,7 @@ def create_split(


def create_manifest(
data_dir="datasets/slue-voxpopuli",
data_dir="dataset/slue-voxpopuli",
manifest_dir="manifest/slue-voxpopuli",
is_blind=True,
):
Expand Down
11 changes: 7 additions & 4 deletions slue_toolkit/text_ner/ner_deberta.py
Expand Up @@ -38,8 +38,8 @@ def eval(
label_list = read_lst(os.path.join(data_dir, f"{train_label}_tag_lst_ordered"))
if save_results:
ner_results_dir = os.path.join(log_dir, "error_analysis")
os.makedirs(ner_results_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)
os.makedirs(ner_results_dir, exist_ok=True)

data_obj = NDM.DataSetup(data_dir, model_type)
_ = data_obj.prep_data(
Expand All @@ -49,17 +49,20 @@ def eval(
"fine-tune", "combined", get_map_files=True
) # prepare tag-id mapping files

# TODO: please verify this function
if "combined" in eval_label:
tag_lst = read_lst(os.path.join(data_dir, "combined_tag_lst_ordered"))

val_texts, val_tags, _, _, _, _ = data_obj.prep_data(eval_subset, "raw")
val_texts, val_tags, _, _, _ = data_obj.prep_data(eval_subset)
if eval_asr:
asr_val_texts, _, _, _, val_dataset = data_obj.prep_data(
f"{eval_subset}-{asr_model_type}-asr-{lm}", "raw"
)
else:
asr_val_texts = None
eval_obj = NDM.Eval(model_dir, model_type, label_list, eval_label, eval_asr)
asr_val_texts, asr_val_dataset = None, None

label_list = read_lst(os.path.join(data_dir, f"{eval_label}_tag_lst_ordered"))
eval_obj = NDM.Eval(data_dir, model_dir, model_type, label_list, eval_label, eval_asr)
for score_type in ["standard", "label"]:
if eval_asr:
res_fn = "-".join(
Expand Down
23 changes: 13 additions & 10 deletions slue_toolkit/text_ner/ner_deberta_modules.py
Expand Up @@ -30,6 +30,7 @@
from slue_toolkit.eval import eval_utils



class VPDataset(torch.utils.data.Dataset):
def __init__(self, encodings, labels):
self.encodings = encodings
Expand Down Expand Up @@ -76,12 +77,14 @@ def read_data(self, file_path):

def align_labels(self, tag2id, tags, encodings, label_all_tokens=False):
"""
Align labels with appropriate padding labels for sub-tokens
Align labels with appropriate padding labels for sub-tokens

label_all_tokens: Whether to put the label for one word on all tokens of generated by that word or just on the
one (in which case the other tokens will have a padding index).
"""
# TODO : Check this line
labels = [[tag2id[tag] if tag in tag2id else tag2id['O'] for tag in doc] for doc in tags]

label_all_tokens: Whether to put the label for one word on all tokens of generated by that word or just on the
one (in which case the other tokens will have a padding index).
"""
labels = [[tag2id[tag] for tag in doc] for doc in tags]
encoded_labels = []
for idx, doc_labels in enumerate(labels):
word_ids = encodings.word_ids(batch_index=idx)
Expand Down Expand Up @@ -242,7 +245,7 @@ def compute_metrics(p, return_entity_level_metrics=True):
model=model, # the instantiated 🤗 Transformers model to be trained
args=training_args, # training arguments, defined above
train_dataset=train_dataset, # training dataset
eval_dataset=eval_dataset, # evaluation dataset
eval_dataset=val_dataset, # evaluation dataset
compute_metrics=compute_metrics,
)

Expand Down Expand Up @@ -288,7 +291,7 @@ def compute_metrics(p, return_entity_level_metrics=True):
if training_args.do_eval:
logger.info("*** Evaluate ***")
metrics = trainer.evaluate()
metrics["eval_samples"] = len(eval_dataset)
metrics["eval_samples"] = len(val_dataset)
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

Expand All @@ -298,11 +301,11 @@ def __init__(
self, data_dir, model_dir, model_type, label_list, eval_label, eval_asr=False
):
"""
Inference with batch size = 1
"""
Inference with batch size = 1
"""
self.data_dir = data_dir
self.model_dir = model_dir
best_model_ckpt_dir = os.path.join(self.model_dir, "best-checkpoint")
best_model_ckpt_dir = os.path.join(self.model_dir)
self.model = DebertaForTokenClassification.from_pretrained(
best_model_ckpt_dir, output_loading_info=False
)
Expand Down