From d1f032d82bb3a3cde21db1d9efe19a3367024f38 Mon Sep 17 00:00:00 2001
From: Akshita Bhagia
Date: Tue, 22 Dec 2020 14:00:00 -0800
Subject: [PATCH] Moving modelcard and taskcard abstractions to main repo
 (#4881)

* moving modelcard and taskcard abstractions to main repo

* CHANGELOG.md

* fix changelog
---
 CHANGELOG.md                    |   1 +
 allennlp/common/model_card.py   | 480 ++++++++++++++++++++++++++++++++
 allennlp/common/task_card.py    |  61 ++++
 tests/common/model_card_test.py | 120 ++++++++
 tests/common/task_card_test.py  |  23 ++
 5 files changed, 685 insertions(+)
 create mode 100644 allennlp/common/model_card.py
 create mode 100644 allennlp/common/task_card.py
 create mode 100644 tests/common/model_card_test.py
 create mode 100644 tests/common/task_card_test.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 06512f99dc1..022d75be5f7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 
 - Added a new learning rate scheduler: `CombinedLearningRateScheduler`. This can be used to combine different LR schedulers, using one after the other.
+- Moved `ModelCard` and `TaskCard` abstractions into the main repository.
 
 ### Changed
diff --git a/allennlp/common/model_card.py b/allennlp/common/model_card.py
new file mode 100644
index 00000000000..e8147c63f52
--- /dev/null
+++ b/allennlp/common/model_card.py
@@ -0,0 +1,480 @@
+"""
+A specification for defining model cards as described in
+[Model Cards for Model Reporting (Mitchell et al., 2019)]
+(https://api.semanticscholar.org/CorpusID:52946140)
+
+The descriptions of the fields and some examples
+are taken from the paper.
+
+The specification is provided to prompt model developers
+to think about the various aspects that should ideally
+be reported. The information provided should adhere to
+the spirit of transparency rather than the letter; i.e.,
+fields should not be filled in merely for the sake of being
+filled. If the information cannot be inferred, it should be left empty.
+"""
+
+import os
+import logging
+from dataclasses import dataclass
+from typing import Optional, Union, Dict, Any, Callable
+from allennlp.common.from_params import FromParams
+
+from allennlp.models import Model
+from allennlp.common.checks import ConfigurationError
+
+logger = logging.getLogger(__name__)
+
+
+def get_description(model_class):
+    """
+    Returns the model's description from the docstring.
+    """
+    return (model_class.__doc__ or "").split("# Parameters")[0].strip()
+
+
+class ModelCardInfo(FromParams):
+    def to_dict(self):
+        """
+        Only the non-empty attributes are returned, to minimize empty values.
+        """
+        info = {}
+        for key, val in self.__dict__.items():
+            if val:
+                info[key] = val
+        return info
+
+    def __str__(self):
+        display = ""
+        for key, val in self.to_dict().items():
+            display += "\n" + key.replace("_", " ").capitalize() + ": "
+            display += "\n\t" + val.replace("\n", "\n\t") + "\n"
+        if not display:
+            display = super(ModelCardInfo, self).__str__()
+        return display.strip()
+
+
+@dataclass(frozen=True)
+class ModelDetails(ModelCardInfo):
+    """
+    This provides the basic information about the model.
+
+    # Parameters
+
+    description : `str`
+        A high-level overview of the model.
+        Eg. The model implements a reading comprehension model patterned
+        after the proposed model in [Devlin et al., 2018]
+        (https://api.semanticscholar.org/CorpusID:52967399), with improvements
+        borrowed from the SQuAD model in the transformers project.
+        It predicts start tokens and end tokens with a linear layer on top of
+        word piece embeddings.
+
+    short_description : `str`
+        A one-line description of the model.
+        Eg. A reading comprehension model patterned after RoBERTa,
+        with improvements borrowed from the SQuAD model in the transformers project.
+
+    developed_by : `str`
+        Person/organization that developed the model. This can be used by all
+        stakeholders to infer details pertaining to model development and
+        potential conflicts of interest.
+
+    contributed_by : `str`
+        Person that contributed the model to the repository.
+
+    date : `str`
+        The date on which the model was contributed. This is useful for all
+        stakeholders to become further informed on what techniques and
+        data sources were likely to be available during model development.
+        Format example: 2020-09-23
+
+    version : `str`
+        The version of the model, and how it differs from previous versions.
+        This is useful for all stakeholders to track whether the model is the
+        latest version, associate known bugs with the correct model versions,
+        and aid in model comparisons.
+
+    model_type : `str`
+        The type of the model; the basic architecture. This is likely to be
+        particularly relevant for software and model developers, as well as
+        individuals knowledgeable about machine learning, to highlight what
+        kinds of assumptions are encoded in the system.
+        Eg. Naive Bayes Classifier.
+
+    paper : `str`
+        The paper on which the model is based.
+        Format example:
+        [Model Cards for Model Reporting (Mitchell et al., 2019)]
+        (https://api.semanticscholar.org/CorpusID:52946140)
+
+    citation : `str`
+        The BibTeX entry for the paper.
+
+    license : `str`
+        License information for the model.
+
+    contact : `str`
+        The email address to reach out to the relevant developers/contributors
+        for questions/feedback about the model.
+
+    training_config : `str`
+        A link to the training configuration.
+    """
+
+    description: Optional[str] = None
+    short_description: Optional[str] = None
+    developed_by: Optional[str] = None
+    contributed_by: Optional[str] = None
+    date: Optional[str] = None
+    version: Optional[str] = None
+    model_type: Optional[str] = None
+    paper: Optional[str] = None
+    citation: Optional[str] = None
+    license: Optional[str] = None
+    contact: Optional[str] = None
+    training_config: Optional[str] = None
+
+
+@dataclass(frozen=True)
+class IntendedUse(ModelCardInfo):
+    """
+    This describes what the model should and should not be used for.
+
+    # Parameters
+
+    primary_uses : `str`
+        Details the primary intended uses of the model; whether it was developed
+        for general or specific tasks.
+        Eg. The toxic text identifier model was developed to identify
+        toxic comments on online platforms. An example use case is
+        to provide feedback to comment authors.
+
+    primary_users : `str`
+        The primary intended users. For example, was the model developed
+        for entertainment purposes, for hobbyists, or enterprise solutions?
+        This helps users gain insight into how robust the model may be to
+        different kinds of inputs.
+
+    out_of_scope_use_cases : `str`
+        Highlights the technology that the model might easily be confused with,
+        or related contexts that users could try to apply the model to.
+        Eg. the toxic text identifier model is not intended for fully automated
+        moderation, or to make judgments about specific individuals.
+
+        Also recommends a related or similar model that was designed to better
+        meet a particular need, where possible.
+        Eg. not for use on text examples longer than 100 tokens; please use
+        the bigger-toxic-text-identifier instead.
+    """
+
+    primary_uses: Optional[str] = None
+    primary_users: Optional[str] = None
+    out_of_scope_use_cases: Optional[str] = None
+
+
+@dataclass(frozen=True)
+class Factors(ModelCardInfo):
+    """
+    This provides a summary of relevant factors, such as
+    demographics or the instrumentation used, for which
+    model performance may vary.
+
+    # Parameters
+
+    relevant_factors : `str`
+        The foreseeable salient factors for which model performance may vary,
+        and how these were determined.
+        Eg. the model performance may vary for variations in dialects of English.
+
+    evaluation_factors : `str`
+        Mentions the factors that are being reported, and the reasons for why
+        they were chosen. Also includes the reasons for choosing different
+        evaluation factors than relevant factors.
+
+        Eg. While dialect variation is a relevant factor,
+        dialect-specific annotations were not available, and hence, the
+        performance was not evaluated on different dialects.
+    """
+
+    relevant_factors: Optional[str] = None
+    evaluation_factors: Optional[str] = None
+
+
+@dataclass(frozen=True)
+class Metrics(ModelCardInfo):
+    """
+    This lists the reported metrics and the reasons
+    for choosing them.
+
+    # Parameters
+
+    model_performance_measures : `str`
+        Which model performance measures were selected, and the reasons for
+        selecting them.
+    decision_thresholds : `str`
+        If decision thresholds are used, what they are, and the reasons for
+        choosing them.
+    variation_approaches : `str`
+        How are the measurements and estimations of these metrics calculated?
+        Eg. standard deviation, variance, confidence intervals, KL divergence.
+        Details of how these values are approximated should also be included.
+        Eg. average of 5 runs, 10-fold cross-validation, etc.
+    """
+
+    model_performance_measures: Optional[str] = None
+    decision_thresholds: Optional[str] = None
+    variation_approaches: Optional[str] = None
+
+
+@dataclass(frozen=True)
+class EvaluationData(ModelCardInfo):
+    """
+    This provides information about the evaluation data.
+
+    # Parameters
+
+    dataset : `str`
+        The name(s) (and link(s), if available) of the dataset(s) used to evaluate
+        the model. Optionally, provide a link to the relevant datasheet(s) as well.
+    motivation : `str`
+        The reasons for selecting the dataset(s).
+        Eg. For the BERT model, document-level corpora were used rather than a
+        shuffled sentence-level corpus in order to extract long contiguous sequences.
+    preprocessing : `str`
+        How was the data preprocessed for evaluation?
+        Eg. tokenization of sentences, filtering of paragraphs by length, etc.
+    """
+
+    dataset: Optional[str] = None
+    motivation: Optional[str] = None
+    preprocessing: Optional[str] = None
+
+    def to_dict(self):
+        info = {}
+        for key, val in self.__dict__.items():
+            if val:
+                info["evaluation_" + key] = val
+        return info
+
+
+@dataclass(frozen=True)
+class TrainingData(ModelCardInfo):
+    """
+    This provides information about the training data. If the model was initialized
+    from pretrained weights, a link to the pretrained model's model card/training
+    data can additionally be provided, if available. Any relevant definitions should
+    also be included.
+
+    # Parameters
+
+    dataset : `str`
+        The name(s) (and link(s), if available) of the dataset(s) used to train
+        the model. Optionally, provide a link to the relevant datasheet(s) as well.
+        Eg. * Proprietary data from Perspective API; includes comments from online
+        forums such as Wikipedia and New York Times, with crowdsourced labels of
+        whether the comment is "toxic".
+        * "Toxic" is defined as "a rude, disrespectful, or unreasonable comment
+        that is likely to make you leave a discussion."
+    motivation : `str`
+        The reasons for selecting the dataset(s).
+        Eg. For the BERT model, document-level corpora were used rather than a
+        shuffled sentence-level corpus in order to extract long contiguous sequences.
+    preprocessing : `str`
+        How was the data preprocessed for training?
+        Eg. Only the text passages were extracted from English Wikipedia; lists, tables,
+        and headers were ignored.
+    """
+
+    dataset: Optional[str] = None
+    motivation: Optional[str] = None
+    preprocessing: Optional[str] = None
+
+    def to_dict(self):
+        info = {}
+        for key, val in self.__dict__.items():
+            if val:
+                info["training_" + key] = val
+        return info
+
+
+@dataclass(frozen=True)
+class QuantitativeAnalyses(ModelCardInfo):
+    """
+    This provides a disaggregated evaluation of how the
+    model performed based on the chosen metrics, with confidence
+    intervals, if possible. Links to plots/figures showing
+    the metrics can also be provided.
+
+    # Parameters
+
+    unitary_results : `str`
+        The performance of the model with respect to each chosen
+        factor.
+    intersectional_results : `str`
+        The performance of the model with respect to the intersection
+        of the evaluated factors.
+    """
+
+    unitary_results: Optional[str] = None
+    intersectional_results: Optional[str] = None
+
+
+@dataclass(frozen=True)
+class EthicalConsiderations(ModelCardInfo):
+    """
+    This highlights any ethical considerations to keep
+    in mind when using the model.
+    Eg. Is the model intended to be used for informing
+    decisions on human life? Does it use sensitive data?
+    What kinds of risks are possible, and what mitigation
+    strategies were used to address them?
+    Eg. The model does not take into account user history
+    when making judgments about toxicity, due to privacy
+    concerns.
+    """
+
+    ethical_considerations: Optional[str] = None
+
+
+@dataclass(frozen=True)
+class CaveatsAndRecommendations(ModelCardInfo):
+    """
+    This lists any additional concerns. For instance, were any
+    relevant groups not present in the evaluation data?
+    Eg. The evaluation data is synthetically designed to be
+    representative of common use cases and concerns, but
+    may not be comprehensive.
+    """
+
+    caveats_and_recommendations: Optional[str] = None
+
+
+class ModelCard(ModelCardInfo):
+    """
+    The model card stores the recommended attributes for model reporting.
+
+    # Parameters
+
+    id : `str`
+        The model's id, following the convention of task-model-relevant-details.
+        Example: rc-bidaf-elmo for a reading comprehension BiDAF model using ELMo embeddings.
+    registered_model_name : `str`, optional
+        The model's registered name. If `model_class` is not given, this will be used
+        to find any available `Model` registered with this name.
+    model_class : `type`, optional
+        If given, the `ModelCard` will pull some default information from the class.
+    registered_predictor_name : `str`, optional
+        The registered name of the corresponding predictor.
+    display_name : `str`, optional
+        The pretrained model's display name.
+    task_id : `str`, optional
+        The id of the task for which the model was built.
+    archive_file : `str`, optional
+        The location of the model's pretrained weights.
+    overrides : `Dict`, optional
+        Optional overrides for the model's architecture.
+    model_details : `Union[ModelDetails, str]`, optional
+    intended_use : `Union[IntendedUse, str]`, optional
+    factors : `Union[Factors, str]`, optional
+    metrics : `Union[Metrics, str]`, optional
+    evaluation_data : `Union[EvaluationData, str]`, optional
+    training_data : `Union[TrainingData, str]`, optional
+    quantitative_analyses : `Union[QuantitativeAnalyses, str]`, optional
+    ethical_considerations : `Union[EthicalConsiderations, str]`, optional
+    caveats_and_recommendations : `Union[CaveatsAndRecommendations, str]`, optional
+
+    !!! Note
+        For all the fields that are `Union[ModelCardInfo, str]`, a `str` input will be
+        treated as the first argument of the relevant constructor.
+
+    """
+
+    _storage_location = "https://storage.googleapis.com/allennlp-public-models/"
+
+    def __init__(
+        self,
+        id: str,
+        registered_model_name: Optional[str] = None,
+        model_class: Optional[Callable[..., Model]] = None,
+        registered_predictor_name: Optional[str] = None,
+        display_name: Optional[str] = None,
+        task_id: Optional[str] = None,
+        archive_file: Optional[str] = None,
+        overrides: Optional[Dict] = None,
+        model_details: Optional[Union[str, ModelDetails]] = None,
+        intended_use: Optional[Union[str, IntendedUse]] = None,
+        factors: Optional[Union[str, Factors]] = None,
+        metrics: Optional[Union[str, Metrics]] = None,
+        evaluation_data: Optional[Union[str, EvaluationData]] = None,
+        training_data: Optional[Union[str, TrainingData]] = None,
+        quantitative_analyses: Optional[Union[str, QuantitativeAnalyses]] = None,
+        ethical_considerations: Optional[Union[str, EthicalConsiderations]] = None,
+        caveats_and_recommendations: Optional[Union[str, CaveatsAndRecommendations]] = None,
+    ):
+
+        assert id
+        if not model_class and registered_model_name:
+            try:
+                model_class = Model.by_name(registered_model_name)
+            except ConfigurationError:
+                logger.warning("{} is not a registered model.".format(registered_model_name))
+
+        if model_class:
+            display_name = display_name or model_class.__name__
+            model_details = model_details or get_description(model_class)
+            if not registered_predictor_name:
+                registered_predictor_name = model_class.default_predictor  # type: ignore
+
+        if archive_file and not archive_file.startswith("https:"):
+            archive_file = os.path.join(self._storage_location, archive_file)
+
+        if isinstance(model_details, str):
+            model_details = ModelDetails(description=model_details)
+        if isinstance(intended_use, str):
+            intended_use = IntendedUse(primary_uses=intended_use)
+        if isinstance(factors, str):
+            factors = Factors(relevant_factors=factors)
+        if isinstance(metrics, str):
+            metrics = Metrics(model_performance_measures=metrics)
+        if isinstance(evaluation_data, str):
+            evaluation_data = EvaluationData(dataset=evaluation_data)
+        if isinstance(training_data, str):
+            training_data = TrainingData(dataset=training_data)
+        if isinstance(quantitative_analyses, str):
+            quantitative_analyses = QuantitativeAnalyses(unitary_results=quantitative_analyses)
+        if isinstance(ethical_considerations, str):
+            ethical_considerations = EthicalConsiderations(ethical_considerations)
+        if isinstance(caveats_and_recommendations, str):
+            caveats_and_recommendations = CaveatsAndRecommendations(caveats_and_recommendations)
+
+        self.id = id
+        self.registered_model_name = registered_model_name
+        self.registered_predictor_name = registered_predictor_name
+        self.display_name = display_name
+        self.task_id = task_id
+        self.archive_file = archive_file
+        self.model_details = model_details
+        self.intended_use = intended_use
+        self.factors = factors
+        self.metrics = metrics
+        self.evaluation_data = evaluation_data
+        self.training_data = training_data
+        self.quantitative_analyses = quantitative_analyses
+        self.ethical_considerations = ethical_considerations
+        self.caveats_and_recommendations = caveats_and_recommendations
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Converts the `ModelCard` to a flat dictionary object. This can be converted to
+        JSON and passed to any front-end.
+        """
+        info = {}
+        for key, val in self.__dict__.items():
+            if key != "id":
+                if isinstance(val, ModelCardInfo):
+                    info.update(val.to_dict())
+                else:
+                    if val is not None:
+                        info[key] = val
+        return info
diff --git a/allennlp/common/task_card.py b/allennlp/common/task_card.py
new file mode 100644
index 00000000000..aa85f7bce87
--- /dev/null
+++ b/allennlp/common/task_card.py
@@ -0,0 +1,61 @@
+"""
+A specification for defining task cards (derived from model cards).
+Motivation: A model's capabilities and limitations depend on
+the task definition. Thus, it is helpful to separate out the information
+in the model card that comes specifically from the task itself.
+"""
+
+from typing import Dict, List, Optional, Union
+from dataclasses import dataclass
+
+from allennlp.common.from_params import FromParams
+
+
+@dataclass(frozen=True)
+class TaskCard(FromParams):
+    """
+    The `TaskCard` stores information about the task. It is modeled after the
+    `ModelCard`.
+
+    # Parameters
+
+    id : `str`
+        The task id.
+        Example: `"rc"` for reading comprehension.
+    name : `str`, optional
+        The (display) name of the task.
+    description : `str`, optional
+        Description of the task.
+        Example: "Textual Entailment (TE) is the task of predicting whether,
+        for a pair of sentences, the facts in the first sentence necessarily
+        imply the facts in the second."
+    expected_inputs : `str`, optional
+        All expected inputs and their format.
+        Example: (For a reading comprehension task)
+        Passage (text string), Question (text string)
+    expected_outputs : `str`, optional
+        All expected outputs and their format.
+        Example: (For a reading comprehension task)
+        Answer span (start token position and end token position).
+    examples : `Union[List[Dict[str, str]], Dict[str, List[Dict[str, str]]]]`, optional
+        List of examples for the task. Each example dict should have the
+        `expected_inputs` as its keys.
+        Example: (For textual entailment)
+        [{"premise": "A handmade djembe was on display at the Smithsonian.",
+        "hypothesis": "Visitors could see the djembe."}]
+    scope_and_limitations : `str`, optional
+        This discusses the scope of the task based on how it is defined, and any limitations.
+        Example: "The Textual Entailment task is in some sense "NLP-complete", and you
+        should not expect any current model to cover every possible aspect of
+        entailment. Instead, you should think about what the model was trained
+        on to see whether it could reasonably capture the phenomena that you
+        are querying it with."
+    """
+
+    id: str
+    name: Optional[str] = None
+    description: Optional[str] = None
+    expected_inputs: Optional[str] = None
+    expected_outputs: Optional[str] = None
+    scope_and_limitations: Optional[str] = None
+    examples: Optional[Union[List[Dict[str, str]], Dict[str, List[Dict[str, str]]]]] = None
diff --git a/tests/common/model_card_test.py b/tests/common/model_card_test.py
new file mode 100644
index 00000000000..0f71e1e4ac5
--- /dev/null
+++ b/tests/common/model_card_test.py
@@ -0,0 +1,120 @@
+from allennlp.common.testing import AllenNlpTestCase
+from allennlp.common.model_card import ModelCard, IntendedUse
+from allennlp.models import Model
+
+
+class TestPretrainedModelConfiguration(AllenNlpTestCase):
+    def test_init(self):
+        model_card = ModelCard(
+            id="fake_name",
+            display_name="Fake Name",
+            model_details="Model's description",
+            archive_file="fake.tar.gz",
+            overrides={},
+        )
+
+        assert model_card.id == "fake_name"
+        assert model_card.display_name == "Fake Name"
+        assert model_card.archive_file == ModelCard._storage_location + "fake.tar.gz"
+        assert model_card.model_details.description == "Model's description"
+
+    def test_init_registered_model(self):
+        @Model.register("fake-model")
+        class FakeModel(Model):
+            """
+            This is a fake model with a docstring.
+
+            # Parameters
+
+            fake_param1: str
+            fake_param2: int
+            """
+
+            def forward(self, **kwargs):
+                return {}
+
+        model_card = ModelCard(**{"id": "this-fake-model", "registered_model_name": "fake-model"})
+
+        assert model_card.display_name == "FakeModel"
+        assert model_card.model_details.description == "This is a fake model with a docstring."
+
+    def test_init_dict_model(self):
+        class FakeModel(Model):
+            """
+            This is a fake model with a docstring.
+
+            # Parameters
+
+            fake_param1: str
+            fake_param2: int
+            """
+
+            def forward(self, **kwargs):
+                return {}
+
+        model_card = ModelCard(**{"id": "this-fake-model", "model_class": FakeModel})
+
+        assert model_card.display_name == "FakeModel"
+        assert model_card.model_details.description == "This is a fake model with a docstring."
+
+    def test_init_registered_model_override(self):
+        @Model.register("fake-model-2")
+        class FakeModel(Model):
+            """
+            This is a fake model with a docstring.
+
+            # Parameters
+
+            fake_param1: str
+            fake_param2: int
+            """
+
+            def forward(self, **kwargs):
+                return {}
+
+        model_card = ModelCard(
+            **{
+                "id": "this-fake-model",
+                "registered_model_name": "fake-model-2",
+                "model_details": "This is the fake model trained on a dataset.",
+                "model_class": FakeModel,
+            }
+        )
+
+        assert (
+            model_card.model_details.description == "This is the fake model trained on a dataset."
+        )
+
+    def test_init_model_card_info_obj(self):
+        @Model.register("fake-model-3")
+        class FakeModel(Model):
+            """
+            This is a fake model with a docstring.
+ + # Parameters + + fake_param1: str + fake_param2: int + """ + + def forward(self, **kwargs): + return {} + + intended_use = IntendedUse("Use 1", "User 1") + + model_card = ModelCard( + **{ + "id": "this-fake-model", + "registered_model_name": "fake-model-3", + "intended_use": intended_use, + } + ) + + model_card_dict = model_card.to_dict() + assert model_card.display_name == "FakeModel" + + for key, val in intended_use.__dict__.items(): + if val: + assert key in model_card_dict + else: + assert key not in model_card_dict diff --git a/tests/common/task_card_test.py b/tests/common/task_card_test.py new file mode 100644 index 00000000000..7a9b4a473aa --- /dev/null +++ b/tests/common/task_card_test.py @@ -0,0 +1,23 @@ +from allennlp.common.testing import AllenNlpTestCase +from allennlp.common.task_card import TaskCard + + +class TestTaskCard(AllenNlpTestCase): + def test_init(self): + task_card = TaskCard( + id="fake_name", + name="Fake Name", + description="Task's description", + expected_inputs="Passage (text string), Question (text string)", + expected_outputs="Answer span (start token position and end token position).", + examples=[ + { + "premise": "A handmade djembe was on display at the Smithsonian.", + "hypothesis": "Visitors could see the djembe.", + } + ], + ) + + assert task_card.id == "fake_name" + assert task_card.name == "Fake Name" + assert task_card.expected_inputs == "Passage (text string), Question (text string)"
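
A quick usage sketch of the abstractions added above. The `rc-bidaf` id, the
archive file name, and the description strings below are hypothetical
placeholders, not entries that ship with this patch:

# A minimal sketch, assuming only the ModelCard/TaskCard API introduced above.
from allennlp.common.model_card import IntendedUse, ModelCard
from allennlp.common.task_card import TaskCard

model_card = ModelCard(
    id="rc-bidaf",  # hypothetical id, following task-model-relevant-details
    display_name="BiDAF",
    task_id="rc",
    # A plain string is treated as the first argument of the relevant
    # constructor, i.e. ModelDetails(description=...).
    model_details="A reading comprehension model patterned after BiDAF.",
    intended_use=IntendedUse(primary_uses="Extractive question answering."),
    # A bare file name is joined onto ModelCard._storage_location.
    archive_file="bidaf-model.tar.gz",
)

task_card = TaskCard(
    id="rc",
    name="Reading Comprehension",
    expected_inputs="Passage (text string), Question (text string)",
    expected_outputs="Answer span (start token position and end token position).",
)

# to_dict() flattens the nested ModelCardInfo objects into a single dict and
# drops empty fields, so the card can be serialized to JSON for a front-end.
flat = model_card.to_dict()
assert flat["description"] == "A reading comprehension model patterned after BiDAF."
assert flat["primary_uses"] == "Extractive question answering."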