From 281572aec594a77427477cef64b1b7bdef247cdd Mon Sep 17 00:00:00 2001 From: Siddharth Dalmia Date: Tue, 8 Feb 2022 17:16:23 -0500 Subject: [PATCH 1/6] run black --- slue_toolkit/eval/infer_asr.py | 5 +- .../fairseq_addon/data/add_label_dataset.py | 4 +- .../tasks/audio_classification.py | 8 +- slue_toolkit/text_ner/ner_deberta.py | 2 +- slue_toolkit/text_ner/ner_deberta_modules.py | 925 ++++++++++-------- slue_toolkit/text_ner/reformat_pipeline.py | 104 +- 6 files changed, 570 insertions(+), 478 deletions(-) diff --git a/slue_toolkit/eval/infer_asr.py b/slue_toolkit/eval/infer_asr.py index f7b15a7..80b3b36 100644 --- a/slue_toolkit/eval/infer_asr.py +++ b/slue_toolkit/eval/infer_asr.py @@ -89,10 +89,7 @@ def add_asr_eval_argument(parser): help="temperature scaling of the logits", ) parser.add_argument( - "--eval-upsample", - type=float, - default=1.0, - help="upsample factor", + "--eval-upsample", type=float, default=1.0, help="upsample factor", ) return parser diff --git a/slue_toolkit/fairseq_addon/data/add_label_dataset.py b/slue_toolkit/fairseq_addon/data/add_label_dataset.py index 8fb0196..db10e84 100644 --- a/slue_toolkit/fairseq_addon/data/add_label_dataset.py +++ b/slue_toolkit/fairseq_addon/data/add_label_dataset.py @@ -11,9 +11,7 @@ class AddLabelDataset(BaseWrapperDataset): def __init__( - self, - dataset, - labels, + self, dataset, labels, ): super().__init__(dataset) self.labels = labels diff --git a/slue_toolkit/fairseq_addon/tasks/audio_classification.py b/slue_toolkit/fairseq_addon/tasks/audio_classification.py index 2dbf59e..37ad1e5 100644 --- a/slue_toolkit/fairseq_addon/tasks/audio_classification.py +++ b/slue_toolkit/fairseq_addon/tasks/audio_classification.py @@ -37,8 +37,7 @@ class AudioClassificationTask(AudioPretrainingTask): cfg: AudioClassificationConfig def __init__( - self, - cfg: AudioClassificationConfig, + self, cfg: AudioClassificationConfig, ): super().__init__(cfg) self.blank_symbol = "" @@ -76,10 +75,7 @@ def load_dataset( f"({len(self.datasets[split])}) do not match" ) - self.datasets[split] = AddLabelDataset( - self.datasets[split], - labels, - ) + self.datasets[split] = AddLabelDataset(self.datasets[split], labels,) @property def label2id(self): diff --git a/slue_toolkit/text_ner/ner_deberta.py b/slue_toolkit/text_ner/ner_deberta.py index d516436..47420eb 100644 --- a/slue_toolkit/text_ner/ner_deberta.py +++ b/slue_toolkit/text_ner/ner_deberta.py @@ -1,4 +1,4 @@ -import os,fire +import os, fire import slue_toolkit.text_ner.ner_deberta_modules as NDM from slue_toolkit.generic_utils import read_lst, load_pkl, save_pkl diff --git a/slue_toolkit/text_ner/ner_deberta_modules.py b/slue_toolkit/text_ner/ner_deberta_modules.py index 992cf21..aec6fb7 100644 --- a/slue_toolkit/text_ner/ner_deberta_modules.py +++ b/slue_toolkit/text_ner/ner_deberta_modules.py @@ -1,4 +1,5 @@ -import logging,os,re +import logging, os, re + logger = logging.getLogger(__name__) import numpy as np from pathlib import Path @@ -9,307 +10,337 @@ import datasets import transformers from transformers import ( - set_seed, - Trainer, - TrainingArguments, - DebertaTokenizerFast, - DebertaForTokenClassification + set_seed, + Trainer, + TrainingArguments, + DebertaTokenizerFast, + DebertaForTokenClassification, ) from transformers.trainer_utils import get_last_checkpoint from slue_toolkit.generic_utils import read_lst, write_to_file, load_pkl, save_pkl + class VPDataset(torch.utils.data.Dataset): - def __init__(self, encodings, labels): - self.encodings = encodings - self.labels = labels - - def __getitem__(self, idx): - item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} - item['labels'] = torch.tensor(self.labels[idx]) - return item - - def __len__(self): - return len(self.labels) - -class DataSetup(): - def __init__(self, data_dir, model_type): - self.data_dir = data_dir - self.tokenizer = DebertaTokenizerFast.from_pretrained(f'microsoft/{model_type}', add_prefix_space=True, output_loading_info=False) - - def read_data(self, file_path): - file_path = Path(os.path.join(self.data_dir, file_path)) - - raw_text = file_path.read_text().strip() - raw_docs = re.split(r'\n\t?\n', raw_text) - token_docs = [] - tag_docs = [] - for doc in raw_docs: - tokens = [] - tags = [] - for line in doc.split('\n'): - token, tag = line.split('\t') - tokens.append(token) - tags.append(tag) - token_docs.append(tokens) - tag_docs.append(tags) - - return token_docs, tag_docs - - def align_labels(self, tag2id, tags, encodings, label_all_tokens=False): - """ + def __init__(self, encodings, labels): + self.encodings = encodings + self.labels = labels + + def __getitem__(self, idx): + item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} + item["labels"] = torch.tensor(self.labels[idx]) + return item + + def __len__(self): + return len(self.labels) + + +class DataSetup: + def __init__(self, data_dir, model_type): + self.data_dir = data_dir + self.tokenizer = DebertaTokenizerFast.from_pretrained( + f"microsoft/{model_type}", add_prefix_space=True, output_loading_info=False + ) + + def read_data(self, file_path): + file_path = Path(os.path.join(self.data_dir, file_path)) + + raw_text = file_path.read_text().strip() + raw_docs = re.split(r"\n\t?\n", raw_text) + token_docs = [] + tag_docs = [] + for doc in raw_docs: + tokens = [] + tags = [] + for line in doc.split("\n"): + token, tag = line.split("\t") + tokens.append(token) + tags.append(tag) + token_docs.append(tokens) + tag_docs.append(tags) + + return token_docs, tag_docs + + def align_labels(self, tag2id, tags, encodings, label_all_tokens=False): + """ Align labels with appropriate padding labels for sub-tokens label_all_tokens: Whether to put the label for one word on all tokens of generated by that word or just on the one (in which case the other tokens will have a padding index). """ - labels = [[tag2id[tag] for tag in doc] for doc in tags] - encoded_labels = [] - for idx, doc_labels in enumerate(labels): - word_ids = encodings.word_ids(batch_index=idx) - previous_word_idx = None - label_ids = [] - for word_idx in word_ids: - # Special tokens have a word id that is None. We set the label to -100 so they are automatically - # ignored in the loss function. - if word_idx is None: - label_ids.append(-100) - # We set the label for the first token of each word. - elif word_idx != previous_word_idx: - label_ids.append(doc_labels[word_idx]) - # For the other tokens in a word, we set the label to either the current label or -100, depending on - # the label_all_tokens flag. - else: - label_ids.append(doc_labels[word_idx] if label_all_tokens else -100) - previous_word_idx = word_idx - - encoded_labels.append(label_ids) - return encoded_labels - - def prep_data(self, split_name, label_type="raw", get_map_files=False): - texts, tags = self.read_data(f"{split_name}_{label_type}.tsv") - - tag_id_fn = os.path.join(self.data_dir, f"{label_type}_tag2id.pkl") - if not os.path.exists(tag_id_fn): - # Create encodings - unique_tags = set(tag for doc in tags for tag in doc) - tag2id = {tag: id for id, tag in enumerate(unique_tags)} - id2tag = {id: tag for tag, id in tag2id.items()} - save_pkl(tag_id_fn, tag2id) - save_pkl(os.path.join(self.data_dir, f"{label_type}_id2tag.pkl"), id2tag) - write_to_file("\n".join(list(unique_tags)), os.path.join(self.data_dir, f"{label_type}_tag_lst_ordered")) - else: - tag2id = load_pkl(tag_id_fn) - if get_map_files: - return 1 - # Tokenize data - encodings = self.tokenizer(texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True) - labels = self.align_labels(tag2id, tags, encodings) - encodings.pop("offset_mapping") # we don't want to pass this to the model - dataset = VPDataset(encodings, labels) - return texts, tags, encodings, labels, dataset - -def train_module(data_dir, model_dir, train_dataset, val_dataset, label_list, model_type): - def compute_metrics(p, return_entity_level_metrics=True): - predictions, labels = p - predictions = np.argmax(predictions, axis=2) - - # Remove ignored index (special tokens); does NOT filter out the I- labels - # but just any trailing non-labels due to tokenization - true_predictions = [ - [label_list[p] for (p, l) in zip(prediction, label) if l != -100] - for prediction, label in zip(predictions, labels) - ] - true_labels = [ - [label_list[l] for (p, l) in zip(prediction, label) if l != -100] - for prediction, label in zip(predictions, labels) - ] - - metric = datasets.load_metric("seqeval") - results = metric.compute(predictions=true_predictions, references=true_labels) - if return_entity_level_metrics: - # Unpack nested dictionaries - final_results = {} - for key, value in results.items(): - if isinstance(value, dict): - for n, v in value.items(): - final_results[f"{key}_{n}"] = v - else: - final_results[key] = value - return final_results - else: - return { - "precision": results["overall_precision"], - "recall": results["overall_recall"], - "f1": results["overall_f1"], - "accuracy": results["overall_accuracy"], - } - - model = DebertaForTokenClassification.from_pretrained(f'microsoft/{model_type}', num_labels=len(label_list)) - - logging_steps = 50 - eval_steps = 50 - accum_steps = 1 - warmup_steps = 50 - if "large" in model_type: - num_epochs = 50 - elif "base" in model_type: - num_epochs = 10 - - # Training - training_args = TrainingArguments( - output_dir=model_dir, # output directory - overwrite_output_dir=True, - num_train_epochs=num_epochs, # total number of training epochs - per_device_train_batch_size=16, # batch size per device during training - per_device_eval_batch_size=64, # batch size for evaluation - warmup_steps=warmup_steps, # number of warmup steps for learning rate scheduler - weight_decay=0.01, # strength of weight decay - logging_dir=f'{model_dir}/hf-logs', # directory for storing logs - logging_first_step = True, - logging_steps=logging_steps, - eval_steps=eval_steps, - logging_strategy="steps", - evaluation_strategy="steps", - gradient_accumulation_steps=accum_steps, - log_level="info", - load_best_model_at_end=True, - metric_for_best_model="eval_overall_f1", - greater_is_better=True, - report_to="none", - do_train=True, - do_eval=True, - save_total_limit=5, - ) - - # Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - - log_level = training_args.get_process_log_level() - logger.setLevel(log_level) - datasets.utils.logging.set_verbosity(log_level) - transformers.utils.logging.set_verbosity(log_level) - transformers.utils.logging.enable_default_handler() - transformers.utils.logging.enable_explicit_format() - - # Log on each process the small summary: - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" - ) - logger.info(f"Training/evaluation parameters {training_args}") - - # Set seed before initializing model. - set_seed(training_args.seed) - - trainer = Trainer( - model=model, # the instantiated 🤗 Transformers model to be trained - args=training_args, # training arguments, defined above - train_dataset=train_dataset, # training dataset - eval_dataset=eval_dataset, # evaluation dataset - compute_metrics=compute_metrics, - ) - - # Detecting last checkpoint. - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - else: - last_checkpoint = None - - # Training - if training_args.do_train: - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - else: - checkpoint = None - train_result = trainer.train(resume_from_checkpoint=checkpoint) - metrics = train_result.metrics - trainer.save_model() # Saves the tokenizer too for easy upload - metrics["train_samples"] = len(train_dataset) - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) - trainer.save_state() - - # Evaluation - if training_args.do_eval: - logger.info("*** Evaluate ***") - metrics = trainer.evaluate() - metrics["eval_samples"] = len(eval_dataset) - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - -class Eval(): - def __init__(self, data_dir, model_dir, model_type, label_list, eval_label, eval_asr=False): - """ + labels = [[tag2id[tag] for tag in doc] for doc in tags] + encoded_labels = [] + for idx, doc_labels in enumerate(labels): + word_ids = encodings.word_ids(batch_index=idx) + previous_word_idx = None + label_ids = [] + for word_idx in word_ids: + # Special tokens have a word id that is None. We set the label to -100 so they are automatically + # ignored in the loss function. + if word_idx is None: + label_ids.append(-100) + # We set the label for the first token of each word. + elif word_idx != previous_word_idx: + label_ids.append(doc_labels[word_idx]) + # For the other tokens in a word, we set the label to either the current label or -100, depending on + # the label_all_tokens flag. + else: + label_ids.append(doc_labels[word_idx] if label_all_tokens else -100) + previous_word_idx = word_idx + + encoded_labels.append(label_ids) + return encoded_labels + + def prep_data(self, split_name, label_type="raw", get_map_files=False): + texts, tags = self.read_data(f"{split_name}_{label_type}.tsv") + + tag_id_fn = os.path.join(self.data_dir, f"{label_type}_tag2id.pkl") + if not os.path.exists(tag_id_fn): + # Create encodings + unique_tags = set(tag for doc in tags for tag in doc) + tag2id = {tag: id for id, tag in enumerate(unique_tags)} + id2tag = {id: tag for tag, id in tag2id.items()} + save_pkl(tag_id_fn, tag2id) + save_pkl(os.path.join(self.data_dir, f"{label_type}_id2tag.pkl"), id2tag) + write_to_file( + "\n".join(list(unique_tags)), + os.path.join(self.data_dir, f"{label_type}_tag_lst_ordered"), + ) + else: + tag2id = load_pkl(tag_id_fn) + if get_map_files: + return 1 + # Tokenize data + encodings = self.tokenizer( + texts, + is_split_into_words=True, + return_offsets_mapping=True, + padding=True, + truncation=True, + ) + labels = self.align_labels(tag2id, tags, encodings) + encodings.pop("offset_mapping") # we don't want to pass this to the model + dataset = VPDataset(encodings, labels) + return texts, tags, encodings, labels, dataset + + +def train_module( + data_dir, model_dir, train_dataset, val_dataset, label_list, model_type +): + def compute_metrics(p, return_entity_level_metrics=True): + predictions, labels = p + predictions = np.argmax(predictions, axis=2) + + # Remove ignored index (special tokens); does NOT filter out the I- labels + # but just any trailing non-labels due to tokenization + true_predictions = [ + [label_list[p] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + true_labels = [ + [label_list[l] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + + metric = datasets.load_metric("seqeval") + results = metric.compute(predictions=true_predictions, references=true_labels) + if return_entity_level_metrics: + # Unpack nested dictionaries + final_results = {} + for key, value in results.items(): + if isinstance(value, dict): + for n, v in value.items(): + final_results[f"{key}_{n}"] = v + else: + final_results[key] = value + return final_results + else: + return { + "precision": results["overall_precision"], + "recall": results["overall_recall"], + "f1": results["overall_f1"], + "accuracy": results["overall_accuracy"], + } + + model = DebertaForTokenClassification.from_pretrained( + f"microsoft/{model_type}", num_labels=len(label_list) + ) + + logging_steps = 50 + eval_steps = 50 + accum_steps = 1 + warmup_steps = 50 + if "large" in model_type: + num_epochs = 50 + elif "base" in model_type: + num_epochs = 10 + + # Training + training_args = TrainingArguments( + output_dir=model_dir, # output directory + overwrite_output_dir=True, + num_train_epochs=num_epochs, # total number of training epochs + per_device_train_batch_size=16, # batch size per device during training + per_device_eval_batch_size=64, # batch size for evaluation + warmup_steps=warmup_steps, # number of warmup steps for learning rate scheduler + weight_decay=0.01, # strength of weight decay + logging_dir=f"{model_dir}/hf-logs", # directory for storing logs + logging_first_step=True, + logging_steps=logging_steps, + eval_steps=eval_steps, + logging_strategy="steps", + evaluation_strategy="steps", + gradient_accumulation_steps=accum_steps, + log_level="info", + load_best_model_at_end=True, + metric_for_best_model="eval_overall_f1", + greater_is_better=True, + report_to="none", + do_train=True, + do_eval=True, + save_total_limit=5, + ) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"Training/evaluation parameters {training_args}") + + # Set seed before initializing model. + set_seed(training_args.seed) + + trainer = Trainer( + model=model, # the instantiated 🤗 Transformers model to be trained + args=training_args, # training arguments, defined above + train_dataset=train_dataset, # training dataset + eval_dataset=eval_dataset, # evaluation dataset + compute_metrics=compute_metrics, + ) + + # Detecting last checkpoint. + if ( + os.path.isdir(training_args.output_dir) + and training_args.do_train + and not training_args.overwrite_output_dir + ): + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + elif ( + last_checkpoint is not None and training_args.resume_from_checkpoint is None + ): + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + else: + last_checkpoint = None + + # Training + if training_args.do_train: + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + else: + checkpoint = None + train_result = trainer.train(resume_from_checkpoint=checkpoint) + metrics = train_result.metrics + trainer.save_model() # Saves the tokenizer too for easy upload + metrics["train_samples"] = len(train_dataset) + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + # Evaluation + if training_args.do_eval: + logger.info("*** Evaluate ***") + metrics = trainer.evaluate() + metrics["eval_samples"] = len(eval_dataset) + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + +class Eval: + def __init__( + self, data_dir, model_dir, model_type, label_list, eval_label, eval_asr=False + ): + """ Inference with batch size = 1 """ - self.data_dir = data_dir - self.model_dir = model_dir - best_model_ckpt_dir = os.path.join(self.model_dir, "best-checkpoint") - self.model = DebertaForTokenClassification.from_pretrained(best_model_ckpt_dir, output_loading_info=False) - self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') - self.model.to(self.device) - self.model.eval() - - self.eval_asr = eval_asr - self.label_list = label_list - self.eval_label = eval_label - - def reduce(self, entity_name): - return entity_name.split("-")[-1] - - def update_entity_lst(self, lst, entity_name, score_type, entity_info): - """ + self.data_dir = data_dir + self.model_dir = model_dir + best_model_ckpt_dir = os.path.join(self.model_dir, "best-checkpoint") + self.model = DebertaForTokenClassification.from_pretrained( + best_model_ckpt_dir, output_loading_info=False + ) + self.device = ( + torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + ) + self.model.to(self.device) + self.model.eval() + + self.eval_asr = eval_asr + self.label_list = label_list + self.eval_label = eval_label + + def reduce(self, entity_name): + return entity_name.split("-")[-1] + + def update_entity_lst(self, lst, entity_name, score_type, entity_info): + """ entity_info: word segment when eval_asr is True and word location otherwise """ - if self.eval_asr: - if score_type == "standard": - lst.append((self.reduce(entity_name), " ".join(entity_info))) - elif score_type == "label": - lst.append((self.reduce(entity_name), "word")) - else: - if score_type == "standard": - lst.append((self.reduce(entity_name), entity_info[0], entity_info[-1])) - elif score_type == "label": - lst.append((self.reduce(entity_name), 0, 0)) - - def make_distinct(self, tag_lst): - """ + if self.eval_asr: + if score_type == "standard": + lst.append((self.reduce(entity_name), " ".join(entity_info))) + elif score_type == "label": + lst.append((self.reduce(entity_name), "word")) + else: + if score_type == "standard": + lst.append((self.reduce(entity_name), entity_info[0], entity_info[-1])) + elif score_type == "label": + lst.append((self.reduce(entity_name), 0, 0)) + + def make_distinct(self, tag_lst): + """ Make enities disticnt in a list For instance, when eval_asr == True input: [('PER', 'MARY'), ('LOC', "SAINT PAUL'S"), ('PER', 'KIRKLEATHAM'), ('PER', 'MARY')] output: [('PER', 'MARY', 1), ('LOC', "SAINT PAUL'S", 1), ('PER', 'KIRKLEATHAM', 1), ('PER', 'MARY', 2)] """ - tag2cnt, new_tag_lst = {}, [] - for tag_item in tag_lst[0]: - _ = tag2cnt.setdefault(tag_item, 0) - tag2cnt[tag_item] += 1 - if self.eval_asr: - tag, wrd = tag_item - new_tag_lst.append((tag, wrd, tag2cnt[tag_item])) - else: - tag, _, _ = tag_item - new_tag_lst.append((tag, 0, tag2cnt[tag_item])) - return [new_tag_lst] - - def get_entities(self, tag_lst, score_type, text_lst=None): - """ + tag2cnt, new_tag_lst = {}, [] + for tag_item in tag_lst[0]: + _ = tag2cnt.setdefault(tag_item, 0) + tag2cnt[tag_item] += 1 + if self.eval_asr: + tag, wrd = tag_item + new_tag_lst.append((tag, wrd, tag2cnt[tag_item])) + else: + tag, _, _ = tag_item + new_tag_lst.append((tag, 0, tag2cnt[tag_item])) + return [new_tag_lst] + + def get_entities(self, tag_lst, score_type, text_lst=None): + """ Convert entity tag list to the list of (entity-name, location) tuples Example: >>> seq = ['B-PER', 'I-PER', 'O', 'B-LOC'] @@ -318,138 +349,200 @@ def get_entities(self, tag_lst, score_type, text_lst=None): >>> get_entities(seq, "label") [("tag", ), ("tag", )] """ - if self.eval_asr: - assert text_lst is not None - entity_tag_lst = [] - entity_flag, entity_info, entity_lst, entity_name, prev_tag = False, [], [], None, "O" - for tag_idx, tag_name in enumerate(tag_lst): - if tag_name != "O": - if "B-" in tag_name or ("I-" in tag_name and self.reduce(tag_name)!=self.reduce(prev_tag)): # start of a new entity - if entity_flag: # record the previous entity first - self.update_entity_lst(entity_lst, entity_name, score_type, entity_info) - entity_name = tag_name - if self.eval_asr: - entity_info = [text_lst[tag_idx]] - else: - entity_info = [tag_idx] - entity_flag = True - else: # if "I-" in tag_name and reduce(tag_name) == reduce(prev_tag): # continuation of the entity - assert self.reduce(entity_name) == self.reduce(tag_name) - assert entity_flag - if self.eval_asr: - entity_info.append(text_lst[tag_idx]) - else: - entity_info.append(tag_idx) - else: - if entity_flag: - self.update_entity_lst(entity_lst, entity_name, score_type, entity_info) - entity_loc = [] - entity_flag = False - entity_name = None - if tag_idx == len(tag_lst)-1: - if entity_flag: - self.update_entity_lst(entity_lst, entity_name, score_type, entity_info) - prev_tag = tag_name - entity_tag_lst.append(entity_lst) - if score_type == "label" or self.eval_asr: - return self.make_distinct(entity_tag_lst) - else: - return entity_tag_lst - - def get_tag_map(self, indices=False, tag_names=False): - """ + if self.eval_asr: + assert text_lst is not None + entity_tag_lst = [] + entity_flag, entity_info, entity_lst, entity_name, prev_tag = ( + False, + [], + [], + None, + "O", + ) + for tag_idx, tag_name in enumerate(tag_lst): + if tag_name != "O": + if "B-" in tag_name or ( + "I-" in tag_name and self.reduce(tag_name) != self.reduce(prev_tag) + ): # start of a new entity + if entity_flag: # record the previous entity first + self.update_entity_lst( + entity_lst, entity_name, score_type, entity_info + ) + entity_name = tag_name + if self.eval_asr: + entity_info = [text_lst[tag_idx]] + else: + entity_info = [tag_idx] + entity_flag = True + else: # if "I-" in tag_name and reduce(tag_name) == reduce(prev_tag): # continuation of the entity + assert self.reduce(entity_name) == self.reduce(tag_name) + assert entity_flag + if self.eval_asr: + entity_info.append(text_lst[tag_idx]) + else: + entity_info.append(tag_idx) + else: + if entity_flag: + self.update_entity_lst( + entity_lst, entity_name, score_type, entity_info + ) + entity_loc = [] + entity_flag = False + entity_name = None + if tag_idx == len(tag_lst) - 1: + if entity_flag: + self.update_entity_lst( + entity_lst, entity_name, score_type, entity_info + ) + prev_tag = tag_name + entity_tag_lst.append(entity_lst) + if score_type == "label" or self.eval_asr: + return self.make_distinct(entity_tag_lst) + else: + return entity_tag_lst + + def get_tag_map(self, indices=False, tag_names=False): + """ Mapping raw tag ids to the combined tag ids """ - assert indices or tag_names - assert not (indices and tag_names) - raw_to_combined_tag_map = load_pkl(os.path.join("slue_toolkit/label_map_files", "raw_to_combined_tags.pkl")) - if indices: - id2tag_raw = load_pkl(os.path.join(self.data_dir, "raw_id2tag.pkl")) - tag2id_raw = load_pkl(os.path.join(self.data_dir, "raw_tag2id.pkl")) - id2tag_combined = load_pkl(os.path.join(self.data_dir, "combined_id2tag.pkl")) - tag2id_combined = load_pkl(os.path.join(self.data_dir, "combined_tag2id.pkl")) - raw_to_combined_id = {} - for key, value in raw_to_combined_tag_map.items(): - for pfx in ["B-", "I-"]: - raw_id = tag2id_raw[pfx+tag] - if value != "DISCARD": - combined_id = tag2id_combined[pfx+value] - else: - combined_id = tag2id_combined["O"] - assert raw_id not in raw_to_combined_id - raw_to_combined_id[raw_id] = combined_id - raw_to_combined_id[tag2id_raw["O"]] = tag2id_combined["O"] - raw_to_combined_id[-100] = -100 - return raw_to_combined_id - elif tag_names: - tag_map_dct = {"O": "O"} - for key, value in combined_tag_dct.items(): - for item in value: - for pfx in ["B-", "I-"]: - if key != "DISCARD": - tag_map_dct[pfx+item] = pfx+key - else: - tag_map_dct[pfx+item] = "O" - return tag_map_dct - - def get_entity_tags(self, predictions, labels, score_type, gt_text=None, gt_tags=None, pred_text=None): - if "combined" in self.eval_label: - tag_map_dct = self.get_tag_map(indices=True) - predictions = [[tag_map_dct[item] for item in prediction] for prediction in predictions] - labels = [[tag_map_dct[item] for item in label] for label in labels] - entity_predictions = [ - [self.label_list[p] for (p, l) in zip(prediction, label) if l != -100] - for prediction, label in zip(predictions, labels) - ] - entity_labels = [ - [self.label_list[l] for (p, l) in zip(prediction, label) if l != -100] - for prediction, label in zip(predictions, labels) - ] - - entity_predictions_reformat = self.get_entities(entity_predictions[0], score_type, pred_text) - if self.eval_asr: - entity_labels_reformat = self.get_entities(gt_tags, score_type, gt_text) - else: - entity_labels_reformat = self.get_entities(entity_labels[0], score_type) - assert len(entity_labels_reformat[0]) == len(set(entity_labels_reformat[0])) - assert len(entity_predictions_reformat[0]) == len(set(entity_predictions_reformat[0])) - - return entity_predictions_reformat, entity_labels_reformat - - def run_inference(self, score_type, eval_dataset_pred, eval_texts_gt, eval_tags_gt=None, eval_texts_pred=None): - all_labels = [] - all_predictions = [] - if "combined" in self.eval_label: - tag_map_dct = self.get_tag_map(tag_names=True) - data_loader = DataLoader(eval_dataset_pred, batch_size=1, shuffle=False) - for idx, batch in enumerate(data_loader): - input_ids = batch['input_ids'].to(self.device) - attention_mask = batch['attention_mask'].to(self.device) - labels = batch['labels'].detach().numpy() - outputs = self.model(input_ids, attention_mask=attention_mask) - predictions = np.argmax(outputs.logits.cpu().detach().numpy(), axis=2) - if self.eval_asr: - if "combined" in self.eval_label: - eval_tags_text = [tag_map_dct[item] for item in eval_tags_gt[idx]] - else: - eval_tags_text = eval_tags_gt[idx] - entity_predictions, entity_labels = self.get_entity_tags(predictions, labels, score_type, \ - eval_texts_gt[idx], eval_tags_text, eval_texts_pred[idx]) - else: - entity_predictions, entity_labels = self.get_entity_tags(predictions, labels, score_type) - all_labels.extend(entity_labels) - all_predictions.extend(entity_predictions) - - return all_labels, all_predictions - - def get_scores(self, score_type, eval_dataset_pred, eval_texts_gt, eval_tags_gt=None, eval_texts_pred=None): - all_gt, all_predictions = self.run_inference(score_type, eval_dataset_pred, eval_texts_gt, eval_tags_gt, eval_texts_pred) - - metrics_dct = eval_utils.get_scores(all_gt, all_predictions) - print("[%s, micro-averaged %s] Precision: %.2f, recall: %.2f, fscore = %.2f" % ( - tag_name, res_dct["precision"], res_dct["recall"], res_dct["fscore"])) - - if score_type == "standard": # with standard evaluation only - analysis_examples_dct = eval_utils.error_analysis(all_labels, all_predictions, eval_texts_gt) - - return metrics_dct, analysis_examples_dct + assert indices or tag_names + assert not (indices and tag_names) + raw_to_combined_tag_map = load_pkl( + os.path.join("slue_toolkit/label_map_files", "raw_to_combined_tags.pkl") + ) + if indices: + id2tag_raw = load_pkl(os.path.join(self.data_dir, "raw_id2tag.pkl")) + tag2id_raw = load_pkl(os.path.join(self.data_dir, "raw_tag2id.pkl")) + id2tag_combined = load_pkl( + os.path.join(self.data_dir, "combined_id2tag.pkl") + ) + tag2id_combined = load_pkl( + os.path.join(self.data_dir, "combined_tag2id.pkl") + ) + raw_to_combined_id = {} + for key, value in raw_to_combined_tag_map.items(): + for pfx in ["B-", "I-"]: + raw_id = tag2id_raw[pfx + tag] + if value != "DISCARD": + combined_id = tag2id_combined[pfx + value] + else: + combined_id = tag2id_combined["O"] + assert raw_id not in raw_to_combined_id + raw_to_combined_id[raw_id] = combined_id + raw_to_combined_id[tag2id_raw["O"]] = tag2id_combined["O"] + raw_to_combined_id[-100] = -100 + return raw_to_combined_id + elif tag_names: + tag_map_dct = {"O": "O"} + for key, value in combined_tag_dct.items(): + for item in value: + for pfx in ["B-", "I-"]: + if key != "DISCARD": + tag_map_dct[pfx + item] = pfx + key + else: + tag_map_dct[pfx + item] = "O" + return tag_map_dct + + def get_entity_tags( + self, + predictions, + labels, + score_type, + gt_text=None, + gt_tags=None, + pred_text=None, + ): + if "combined" in self.eval_label: + tag_map_dct = self.get_tag_map(indices=True) + predictions = [ + [tag_map_dct[item] for item in prediction] for prediction in predictions + ] + labels = [[tag_map_dct[item] for item in label] for label in labels] + entity_predictions = [ + [self.label_list[p] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + entity_labels = [ + [self.label_list[l] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + + entity_predictions_reformat = self.get_entities( + entity_predictions[0], score_type, pred_text + ) + if self.eval_asr: + entity_labels_reformat = self.get_entities(gt_tags, score_type, gt_text) + else: + entity_labels_reformat = self.get_entities(entity_labels[0], score_type) + assert len(entity_labels_reformat[0]) == len(set(entity_labels_reformat[0])) + assert len(entity_predictions_reformat[0]) == len( + set(entity_predictions_reformat[0]) + ) + + return entity_predictions_reformat, entity_labels_reformat + + def run_inference( + self, + score_type, + eval_dataset_pred, + eval_texts_gt, + eval_tags_gt=None, + eval_texts_pred=None, + ): + all_labels = [] + all_predictions = [] + if "combined" in self.eval_label: + tag_map_dct = self.get_tag_map(tag_names=True) + data_loader = DataLoader(eval_dataset_pred, batch_size=1, shuffle=False) + for idx, batch in enumerate(data_loader): + input_ids = batch["input_ids"].to(self.device) + attention_mask = batch["attention_mask"].to(self.device) + labels = batch["labels"].detach().numpy() + outputs = self.model(input_ids, attention_mask=attention_mask) + predictions = np.argmax(outputs.logits.cpu().detach().numpy(), axis=2) + if self.eval_asr: + if "combined" in self.eval_label: + eval_tags_text = [tag_map_dct[item] for item in eval_tags_gt[idx]] + else: + eval_tags_text = eval_tags_gt[idx] + entity_predictions, entity_labels = self.get_entity_tags( + predictions, + labels, + score_type, + eval_texts_gt[idx], + eval_tags_text, + eval_texts_pred[idx], + ) + else: + entity_predictions, entity_labels = self.get_entity_tags( + predictions, labels, score_type + ) + all_labels.extend(entity_labels) + all_predictions.extend(entity_predictions) + + return all_labels, all_predictions + + def get_scores( + self, + score_type, + eval_dataset_pred, + eval_texts_gt, + eval_tags_gt=None, + eval_texts_pred=None, + ): + all_gt, all_predictions = self.run_inference( + score_type, eval_dataset_pred, eval_texts_gt, eval_tags_gt, eval_texts_pred + ) + + metrics_dct = eval_utils.get_scores(all_gt, all_predictions) + print( + "[%s, micro-averaged %s] Precision: %.2f, recall: %.2f, fscore = %.2f" + % (tag_name, res_dct["precision"], res_dct["recall"], res_dct["fscore"]) + ) + + if score_type == "standard": # with standard evaluation only + analysis_examples_dct = eval_utils.error_analysis( + all_labels, all_predictions, eval_texts_gt + ) + + return metrics_dct, analysis_examples_dct diff --git a/slue_toolkit/text_ner/reformat_pipeline.py b/slue_toolkit/text_ner/reformat_pipeline.py index 81c1ac5..f8bd60b 100644 --- a/slue_toolkit/text_ner/reformat_pipeline.py +++ b/slue_toolkit/text_ner/reformat_pipeline.py @@ -3,57 +3,65 @@ from slue_toolkit.generic_utils import read_lst, write_to_file -def prep_data(model_type, asr_data_dir, asr_model_dir, out_data_dir, eval_set, lm="nolm"): - """ + +def prep_data( + model_type, asr_data_dir, asr_model_dir, out_data_dir, eval_set, lm="nolm" +): + """ Create tsv files for pipeline evaluation from the decoded ASR transcripts """ - if "nolm" not in lm: - lm = "t3-b500-lw2-ws-1" - manifest_data_fn = os.path.join(asr_data_dir, eval_set+".wrd") - decoded_data_dir = os.path.join(asr_model_dir, "decode", eval_set, lm) - - out_fn = f"{eval_subset}-{model_type}-asr-{lm}" - out_fn = os.path.join(out_data_dir, out_fn) - sent_lst = get_correct_order(decoded_data_dir, manifest_data_fn) - out_str = "" - for sent in sent_lst: - for wrd in sent.split(" "): - out_str += wrd+"\tO\n" - out_str += "\n" - write_to_file(out_str, out_fn) - print("Data prepared for model %s and lm %s" % (model_name, lm)) + if "nolm" not in lm: + lm = "t3-b500-lw2-ws-1" + manifest_data_fn = os.path.join(asr_data_dir, eval_set + ".wrd") + decoded_data_dir = os.path.join(asr_model_dir, "decode", eval_set, lm) + + out_fn = f"{eval_subset}-{model_type}-asr-{lm}" + out_fn = os.path.join(out_data_dir, out_fn) + sent_lst = get_correct_order(decoded_data_dir, manifest_data_fn) + out_str = "" + for sent in sent_lst: + for wrd in sent.split(" "): + out_str += wrd + "\tO\n" + out_str += "\n" + write_to_file(out_str, out_fn) + print("Data prepared for model %s and lm %s" % (model_name, lm)) + def get_correct_order(self, decoded_data_dir, manifest_data_fn): - """ + """ Reorder decoded sentenced to match the original order """ - if not os.path.exists(decoded_data_dir): - print("Decoded data %s not found" % (decoded_data_dir)) - sys.exit() - else: - fname = glob.glob(decoded_data_dir+"/ref.word*") - assert len(fname) == 1 - decoded_sent_lst_gt = read_lst(fname[0]) - - fname = glob.glob(decoded_data_dir+"/hypo.word*") - assert len(fname) == 1 - decoded_sent_lst_hyp = read_lst(fname[0]) - - manifest_sent_lst = read_lst(manifest_data_fn) - - assert len(decoded_sent_lst_gt) == len(manifest_sent_lst) - assert len(decoded_sent_lst_hyp) == len(decoded_sent_lst_gt) - - decoded_sent_lst_hyp_select = [line.split(" (None-")[0] for line in decoded_sent_lst_hyp] - decoded_sent_lst_gt = [line.split(" (None-")[0] for idx, line in enumerate(decoded_sent_lst_gt)] - decoded_sent_lst_reordered = [None]*len(manifest_sent_lst) - for idx, line in enumerate(decoded_sent_lst_gt): - assert line != -1 - idx_new = manifest_sent_lst.index(line) - manifest_sent_lst[idx_new] = -1 # to ensure that it's not chosen again - decoded_sent_lst_reordered[idx_new] = decoded_sent_lst_hyp_select[idx] - return decoded_sent_lst_reordered - - -if __name__ == '__main__': - fire.Fire() + if not os.path.exists(decoded_data_dir): + print("Decoded data %s not found" % (decoded_data_dir)) + sys.exit() + else: + fname = glob.glob(decoded_data_dir + "/ref.word*") + assert len(fname) == 1 + decoded_sent_lst_gt = read_lst(fname[0]) + + fname = glob.glob(decoded_data_dir + "/hypo.word*") + assert len(fname) == 1 + decoded_sent_lst_hyp = read_lst(fname[0]) + + manifest_sent_lst = read_lst(manifest_data_fn) + + assert len(decoded_sent_lst_gt) == len(manifest_sent_lst) + assert len(decoded_sent_lst_hyp) == len(decoded_sent_lst_gt) + + decoded_sent_lst_hyp_select = [ + line.split(" (None-")[0] for line in decoded_sent_lst_hyp + ] + decoded_sent_lst_gt = [ + line.split(" (None-")[0] for idx, line in enumerate(decoded_sent_lst_gt) + ] + decoded_sent_lst_reordered = [None] * len(manifest_sent_lst) + for idx, line in enumerate(decoded_sent_lst_gt): + assert line != -1 + idx_new = manifest_sent_lst.index(line) + manifest_sent_lst[idx_new] = -1 # to ensure that it's not chosen again + decoded_sent_lst_reordered[idx_new] = decoded_sent_lst_hyp_select[idx] + return decoded_sent_lst_reordered + + +if __name__ == "__main__": + fire.Fire() From 66672ec6acd54182888d58eaeda28ff0bcda52da Mon Sep 17 00:00:00 2001 From: Siddharth Dalmia Date: Tue, 8 Feb 2022 17:24:13 -0500 Subject: [PATCH 2/6] fix gitignore --- .gitignore | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 9bea433..95213b8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,7 @@ - +# general .DS_Store +datasets/ +manifest/ +save/ +slue_toolkit.egg-info/ +__pycache__/ From 4f95dbf27d9bf8d875a9a6c91f247eb1c9708ad1 Mon Sep 17 00:00:00 2001 From: Siddharth Dalmia Date: Tue, 8 Feb 2022 21:57:26 -0500 Subject: [PATCH 3/6] fix text ner pipeline --- .gitignore | 2 +- baselines/sentiment/README.md | 2 +- baselines/sentiment/pipeline_scripts/eval.sh | 4 ++-- scripts/download_datasets.sh | 8 ++++---- setup.py | 3 +++ slue_toolkit/prepare/prepare_voxceleb.py | 2 +- slue_toolkit/prepare/prepare_voxceleb_asr_pred.py | 2 +- slue_toolkit/prepare/prepare_voxpopuli.py | 2 +- slue_toolkit/text_ner/ner_deberta_modules.py | 11 ++++++----- 9 files changed, 20 insertions(+), 16 deletions(-) diff --git a/.gitignore b/.gitignore index 95213b8..ab1e0f2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ # general .DS_Store -datasets/ +dataset/ manifest/ save/ slue_toolkit.egg-info/ diff --git a/baselines/sentiment/README.md b/baselines/sentiment/README.md index 89eb06a..b1b3e9d 100644 --- a/baselines/sentiment/README.md +++ b/baselines/sentiment/README.md @@ -31,6 +31,6 @@ To evaluate the fine-tuned nlp model, run following command or run `baselines/se First, ASR transcription need to be prepared in manifest dir, and then evalution can be done using the same evaluation script with nlp topline. ```sh -python slue_toolkit/prepare/prepare_voxceleb_asr_pred.py --data manifest/slue-voxceleb --pred-data datasets/slue-voxceleb/preds/vc1/w2v2-large-lv60k-ft-slue-vc1-12h-lr1e-5-s1-mt800000-8gpu-update280000 +python slue_toolkit/prepare/prepare_voxceleb_asr_pred.py --data manifest/slue-voxceleb --pred-data dataset/slue-voxceleb/preds/vc1/w2v2-large-lv60k-ft-slue-vc1-12h-lr1e-5-s1-mt800000-8gpu-update280000 python slue_toolkit/eval/eval_nlp_sentiment.py --save-dir save/sentiment/nlp_topline_bert-base-cased --data manifest/slue-voxceleb --subset test.asr-pred ``` diff --git a/baselines/sentiment/pipeline_scripts/eval.sh b/baselines/sentiment/pipeline_scripts/eval.sh index a77cbdb..60eccc8 100644 --- a/baselines/sentiment/pipeline_scripts/eval.sh +++ b/baselines/sentiment/pipeline_scripts/eval.sh @@ -1,6 +1,6 @@ #!/bin/bash -python3 slue_toolkit/prepare/prepare_voxceleb_asr_pred.py --data manifest/slue-voxceleb --pred-data datasets/slue-voxceleb/preds/vc1/w2v2-large-lv60k-ft-slue-vc1-12h-lr1e-5-s1-mt800000-8gpu-update280000 +python3 slue_toolkit/prepare/prepare_voxceleb_asr_pred.py --data manifest/slue-voxceleb --pred-data dataset/slue-voxceleb/preds/vc1/w2v2-large-lv60k-ft-slue-vc1-12h-lr1e-5-s1-mt800000-8gpu-update280000 python3 slue_toolkit/eval/eval_nlp_sentiment.py \ --data manifest/slue-voxceleb \ @@ -8,4 +8,4 @@ python3 slue_toolkit/eval/eval_nlp_sentiment.py \ --save-dir save/sentiment/nlp_topline_bert-base-cased \ --use-gpu \ --eval \ - \ No newline at end of file + diff --git a/scripts/download_datasets.sh b/scripts/download_datasets.sh index c25e2b4..dae9d4b 100644 --- a/scripts/download_datasets.sh +++ b/scripts/download_datasets.sh @@ -1,12 +1,12 @@ #!/bin/bash #1. Download -wget https://papers-slue.awsdev.asapp.com/slue-voxceleb_blind.tar.gz -P datasets/ -wget https://papers-slue.awsdev.asapp.com/slue-voxpopuli_blind.tar.gz -P datasets/ +wget https://papers-slue.awsdev.asapp.com/slue-voxceleb_blind.tar.gz -P dataset/ +wget https://papers-slue.awsdev.asapp.com/slue-voxpopuli_blind.tar.gz -P dataset/ #2. Extract -tar -xzvf datasets/slue-voxceleb_blind.tar.gz -C datasets/ -tar -xzvf datasets/slue-voxpopuli_blind.tar.gz -C datasets/ +tar -xzvf dataset/slue-voxceleb_blind.tar.gz -C dataset/ +tar -xzvf dataset/slue-voxpopuli_blind.tar.gz -C dataset/ #3. preprocess diff --git a/setup.py b/setup.py index ca20db4..a2d9b59 100644 --- a/setup.py +++ b/setup.py @@ -23,6 +23,9 @@ "fire", "editdistance", "soundfile", + "transformers", + "datasets", + "seqeval", ], entry_points={}, include_package_data=True, diff --git a/slue_toolkit/prepare/prepare_voxceleb.py b/slue_toolkit/prepare/prepare_voxceleb.py index 62d57f4..c3795ba 100644 --- a/slue_toolkit/prepare/prepare_voxceleb.py +++ b/slue_toolkit/prepare/prepare_voxceleb.py @@ -106,7 +106,7 @@ def create_split( def create_manifest( - data_dir="datasets/slue-voxceleb", + data_dir="dataset/slue-voxceleb", manifest_dir="manifest/slue-voxceleb", is_blind=True, ): diff --git a/slue_toolkit/prepare/prepare_voxceleb_asr_pred.py b/slue_toolkit/prepare/prepare_voxceleb_asr_pred.py index 79e38a3..cf2cd1d 100644 --- a/slue_toolkit/prepare/prepare_voxceleb_asr_pred.py +++ b/slue_toolkit/prepare/prepare_voxceleb_asr_pred.py @@ -18,7 +18,7 @@ def main(): "--pred-data", type=str, required=True, - default="datasets/slue-voxceleb/preds/vc1/w2v2-large-lv60k-ft-slue-vc1-12h-lr1e-5-s1-mt800000-8gpu-update280000", + default="dataset/slue-voxceleb/preds/vc1/w2v2-large-lv60k-ft-slue-vc1-12h-lr1e-5-s1-mt800000-8gpu-update280000", help="Root directory containing voxceleb1_slue data files," "This dir should contain audio/ voxceleb1_slue_{finetune,dev,test} folders ", ) diff --git a/slue_toolkit/prepare/prepare_voxpopuli.py b/slue_toolkit/prepare/prepare_voxpopuli.py index ff73493..7658828 100644 --- a/slue_toolkit/prepare/prepare_voxpopuli.py +++ b/slue_toolkit/prepare/prepare_voxpopuli.py @@ -30,7 +30,7 @@ def create_split( def create_manifest( - data_dir="datasets/slue-voxpopuli", + data_dir="dataset/slue-voxpopuli", manifest_dir="manifest/slue-voxpopuli", is_blind=True, ): diff --git a/slue_toolkit/text_ner/ner_deberta_modules.py b/slue_toolkit/text_ner/ner_deberta_modules.py index aec6fb7..66e45d5 100644 --- a/slue_toolkit/text_ner/ner_deberta_modules.py +++ b/slue_toolkit/text_ner/ner_deberta_modules.py @@ -1,4 +1,4 @@ -import logging, os, re +import logging, os, re, sys logger = logging.getLogger(__name__) import numpy as np @@ -65,9 +65,10 @@ def align_labels(self, tag2id, tags, encodings, label_all_tokens=False): Align labels with appropriate padding labels for sub-tokens label_all_tokens: Whether to put the label for one word on all tokens of generated by that word or just on the - one (in which case the other tokens will have a padding index). + one (in which case the other tokens will have a padding index). """ - labels = [[tag2id[tag] for tag in doc] for doc in tags] + #[ x if x%2 else x*100 for x in range(1, 10) ] + labels = [[tag2id[tag] if tag in tag2id else tag2id['O'] for tag in doc] for doc in tags] encoded_labels = [] for idx, doc_labels in enumerate(labels): word_ids = encodings.word_ids(batch_index=idx) @@ -228,7 +229,7 @@ def compute_metrics(p, return_entity_level_metrics=True): model=model, # the instantiated 🤗 Transformers model to be trained args=training_args, # training arguments, defined above train_dataset=train_dataset, # training dataset - eval_dataset=eval_dataset, # evaluation dataset + eval_dataset=val_dataset, # evaluation dataset compute_metrics=compute_metrics, ) @@ -274,7 +275,7 @@ def compute_metrics(p, return_entity_level_metrics=True): if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate() - metrics["eval_samples"] = len(eval_dataset) + metrics["eval_samples"] = len(val_dataset) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) From 021938422bc9e17f36dfc61345c343c31cdb118d Mon Sep 17 00:00:00 2001 From: Siddharth Dalmia Date: Tue, 8 Feb 2022 22:05:11 -0500 Subject: [PATCH 4/6] fix formatting --- slue_toolkit/text_ner/ner_deberta_modules.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/slue_toolkit/text_ner/ner_deberta_modules.py b/slue_toolkit/text_ner/ner_deberta_modules.py index 66e45d5..c06f50c 100644 --- a/slue_toolkit/text_ner/ner_deberta_modules.py +++ b/slue_toolkit/text_ner/ner_deberta_modules.py @@ -65,9 +65,8 @@ def align_labels(self, tag2id, tags, encodings, label_all_tokens=False): Align labels with appropriate padding labels for sub-tokens label_all_tokens: Whether to put the label for one word on all tokens of generated by that word or just on the - one (in which case the other tokens will have a padding index). + one (in which case the other tokens will have a padding index). """ - #[ x if x%2 else x*100 for x in range(1, 10) ] labels = [[tag2id[tag] if tag in tag2id else tag2id['O'] for tag in doc] for doc in tags] encoded_labels = [] for idx, doc_labels in enumerate(labels): From 4a42759ae3b20debadf8a7bd2a2c0f60a4a5c6a1 Mon Sep 17 00:00:00 2001 From: Siddharth Dalmia Date: Tue, 8 Feb 2022 23:18:58 -0500 Subject: [PATCH 5/6] fix text ner evaluation --- baselines/ner/nlp_scripts/eval-deberta.sh | 3 ++- slue_toolkit/text_ner/ner_deberta.py | 8 +++++--- slue_toolkit/text_ner/ner_deberta_modules.py | 4 ++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/baselines/ner/nlp_scripts/eval-deberta.sh b/baselines/ner/nlp_scripts/eval-deberta.sh index d278fb7..4fdf950 100644 --- a/baselines/ner/nlp_scripts/eval-deberta.sh +++ b/baselines/ner/nlp_scripts/eval-deberta.sh @@ -8,4 +8,5 @@ python slue_toolkit/text_ner/ner_deberta.py eval \ --model_type $model_type \ --eval_asr False \ --eval_subset $eval_set \ ---eval_label $eval_label \ No newline at end of file +--eval_label $eval_label \ +--save_results True diff --git a/slue_toolkit/text_ner/ner_deberta.py b/slue_toolkit/text_ner/ner_deberta.py index 47420eb..aa30b45 100644 --- a/slue_toolkit/text_ner/ner_deberta.py +++ b/slue_toolkit/text_ner/ner_deberta.py @@ -35,8 +35,8 @@ def eval( log_dir = os.path.join(model_dir, "metrics") if save_results: ner_results_dir = os.path.join(log_dir, "error_analysis") + os.makedirs(ner_results_dir, exist_ok=True) os.makedirs(log_dir, exist_ok=True) - os.makedirs(ner_results_dir, exist_ok=True) data_obj = NDM.DataSetup(data_dir, model_type) _ = data_obj.prep_data( @@ -50,14 +50,16 @@ def eval( if "combined" in eval_label: tag_lst = read_lst(os.path.join(data_dir, "combined_tag_lst_ordered")) - val_texts, val_tags, _, _, _, _ = data_obj.prep_data(eval_subset, tag2id=tag2id) + val_texts, val_tags, _, _, _ = data_obj.prep_data(eval_subset) if eval_asr: asr_val_texts, _, _, _, asr_val_dataset = data_obj.prep_data( f"{eval_subset}-{asr_model_type}-asr-{lm}", tag2id ) else: asr_val_texts, asr_val_dataset = None, None - eval_obj = NDM.Eval(model_dir, model_type, label_list, eval_label, eval_asr) + + label_list = read_lst(os.path.join(data_dir, f"{eval_label}_tag_lst_ordered")) + eval_obj = NDM.Eval(data_dir, model_dir, model_type, label_list, eval_label, eval_asr) for score_type in ["standard", "label"]: if eval_asr: res_fn = "-".join( diff --git a/slue_toolkit/text_ner/ner_deberta_modules.py b/slue_toolkit/text_ner/ner_deberta_modules.py index c06f50c..5e2ae5b 100644 --- a/slue_toolkit/text_ner/ner_deberta_modules.py +++ b/slue_toolkit/text_ner/ner_deberta_modules.py @@ -288,7 +288,7 @@ def __init__( """ self.data_dir = data_dir self.model_dir = model_dir - best_model_ckpt_dir = os.path.join(self.model_dir, "best-checkpoint") + best_model_ckpt_dir = os.path.join(self.model_dir) self.model = DebertaForTokenClassification.from_pretrained( best_model_ckpt_dir, output_loading_info=False ) @@ -434,7 +434,7 @@ def get_tag_map(self, indices=False, tag_names=False): return raw_to_combined_id elif tag_names: tag_map_dct = {"O": "O"} - for key, value in combined_tag_dct.items(): + for key, value in raw_to_combined_tag_map.items(): for item in value: for pfx in ["B-", "I-"]: if key != "DISCARD": From 52b5745f0df4bd6c9ce0c9d7cfd471955fbd1771 Mon Sep 17 00:00:00 2001 From: Siddharth Dalmia Date: Wed, 16 Feb 2022 11:16:10 -0500 Subject: [PATCH 6/6] fix merge conflicts --- slue_toolkit/text_ner/ner_deberta_modules.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/slue_toolkit/text_ner/ner_deberta_modules.py b/slue_toolkit/text_ner/ner_deberta_modules.py index bd32b83..4fdf41f 100644 --- a/slue_toolkit/text_ner/ner_deberta_modules.py +++ b/slue_toolkit/text_ner/ner_deberta_modules.py @@ -1,9 +1,5 @@ -<<<<<<< HEAD -import logging, os, re, sys -======= from curses import raw import logging, os, re ->>>>>>> upstream/main logger = logging.getLogger(__name__) import numpy as np