New prompts for Shades FR/EN #742

Closed · wants to merge 7 commits
1 change: 1 addition & 0 deletions .gitattributes
@@ -0,0 +1 @@
assets/PromptSource[[:space:]]ACL[[:space:]]Demo[[:space:]]Figure.png filter=lfs diff=lfs merge=lfs -text
3 changes: 2 additions & 1 deletion CONTRIBUTING.md
@@ -29,7 +29,7 @@ You can always update the name later. If you want to cancel the prompt, select
1. **Write the prompt**. In the box labeled "Template," enter a Jinja expression.
See the [getting started guide](#getting-started-using-jinja-to-write-prompts)
and [cookbook](#jinja-cookbook) for details on how to write templates.
1. **Fill in metadata**. Fill in the metadata for the current prompt: reference, original task, choices in templates, and answer choices.
1. **Fill in metadata**. Fill in the metadata for the current prompt: reference, original task, choices in templates, metrics, languages, and answer choices.
See [Metadata](#metadata) for more details about these fields.
1. **Save the prompt**. Hit the "Save" button. The output of the prompt
applied to the current example will appear in the right sidebar.
@@ -124,6 +124,7 @@ to generate a question for a given answer would not.
the options for the possible outputs (regardless of whether `answer_choices` is used).
* **Metrics.** Use the multiselect widget to select all metrics commonly used to evaluate
this task. Choose “Other” if there is one that is not included in the list.
* **Languages.** Use the multiselect widget to select all languages used in the prompt. This is independent of what languages are used in the underlying dataset. For example, you could have an English prompt for a Spanish dataset.
* **Answer Choices.** If the prompt has a small set of possible outputs (e.g., Yes/No,
class labels, entailment judgements, etc.), then the prompt should define and use answer
choices as follows. This allows evaluation to consider just the possible targets for
2 changes: 1 addition & 1 deletion README.md
@@ -16,7 +16,7 @@ PromptSource provides the tools to create, and share natural language prompts (s
Question: Does this imply that "{{hypothesis}}"? Yes, no, or maybe? ||| {{answer_choices[label]}}
```
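
For readers new to the syntax: the `|||` separates the prompt's input from its target, and the rest is plain Jinja. A rough standalone rendering with jinja2 directly, rather than PromptSource's own Template class (the field values below are made up for illustration):

```python
from jinja2 import Template

prompt = (
    'Question: Does this imply that "{{hypothesis}}"? '
    "Yes, no, or maybe? ||| {{answer_choices[label]}}"
)
rendered = Template(prompt).render(
    hypothesis="The cat sat on the mat.",
    answer_choices=["Yes", "Maybe", "No"],
    label=0,
)
# PromptSource splits on ||| for you; done by hand here for illustration.
input_text, target = (part.strip() for part in rendered.split("|||"))
print(input_text)  # Question: Does this imply that "The cat sat on the mat."? Yes, no, or maybe?
print(target)      # Yes
```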

**You can browse through existing prompts on the [hosted version of PromptSource](https://bigscience.huggingface.co/promptsource).**
**You can browse through existing prompts on the [hosted version of PromptSource](https://huggingface.co/spaces/bigscience/promptsource).**

## Setup
If you do not intend to modify prompts, you can simply run:
Binary file modified assets/PromptSource ACL Demo Figure.png
Binary file modified assets/promptsource_app.png
5 changes: 4 additions & 1 deletion promptsource/__init__.py
@@ -1 +1,4 @@
DEFAULT_PROMPTSOURCE_CACHE_HOME = "~/.cache/promptsource"
from pathlib import Path


DEFAULT_PROMPTSOURCE_CACHE_HOME = str(Path("~/.cache/promptsource").expanduser())
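
The change matters because `~` is not expanded automatically by filesystem calls; a quick illustration:

```python
from pathlib import Path

raw = "~/.cache/promptsource"
print(Path(raw).expanduser())  # e.g. /home/alice/.cache/promptsource
# os.makedirs(raw) would instead create a literal "~" directory under the
# current working directory.
```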
90 changes: 59 additions & 31 deletions promptsource/app.py
@@ -1,20 +1,23 @@
import argparse
import functools
import multiprocessing
import os
import textwrap
from hashlib import sha256
from multiprocessing import Manager, Pool

import pandas as pd
import plotly.express as px
import streamlit as st
from datasets import get_dataset_infos
from datasets.info import DatasetInfosDict
from pygments import highlight
from pygments.formatters import HtmlFormatter
from pygments.lexers import DjangoLexer
from templates import INCLUDED_USERS

from promptsource import DEFAULT_PROMPTSOURCE_CACHE_HOME
from promptsource.session import _get_state
from promptsource.templates import DatasetTemplates, Template, TemplateCollection
from promptsource.templates import INCLUDED_USERS, LANGUAGES, METRICS, DatasetTemplates, Template, TemplateCollection
from promptsource.utils import (
get_dataset,
get_dataset_confs,
@@ -25,6 +28,9 @@
)


DATASET_INFOS_CACHE_DIR = os.path.join(DEFAULT_PROMPTSOURCE_CACHE_HOME, "DATASET_INFOS")
os.makedirs(DATASET_INFOS_CACHE_DIR, exist_ok=True)

# Python 3.8 switched the default start method from fork to spawn. OS X also has
# some issues related to fork, see, e.g., https://github.com/bigscience-workshop/promptsource/issues/572
# so we make sure we always use spawn for consistency
@@ -38,7 +44,28 @@ def get_infos(all_infos, d_name):
:param all_infos: multiprocess-safe dictionary
:param d_name: dataset name
"""
all_infos[d_name] = get_dataset_infos(d_name)
d_name_bytes = d_name.encode("utf-8")
d_name_hash = sha256(d_name_bytes)
foldername = os.path.join(DATASET_INFOS_CACHE_DIR, d_name_hash.hexdigest())
if os.path.isdir(foldername):
infos_dict = DatasetInfosDict.from_directory(foldername)
else:
infos = get_dataset_infos(d_name)
infos_dict = DatasetInfosDict(infos)
os.makedirs(foldername)
infos_dict.write_to_directory(foldername)
all_infos[d_name] = infos_dict
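
For reference, a minimal sketch of the cache layout this function builds, assuming the paths and hashing scheme shown in the diff ("squad" is an arbitrary example dataset name):

```python
import os
from hashlib import sha256

# Mirrors DATASET_INFOS_CACHE_DIR above: one folder per dataset, named by
# the SHA-256 hex digest of the dataset name.
cache_dir = os.path.expanduser("~/.cache/promptsource/DATASET_INFOS")
folder = os.path.join(cache_dir, sha256("squad".encode("utf-8")).hexdigest())
print(folder)  # .../DATASET_INFOS/<64-char hex digest>
```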


def format_language(tag):
"""
Formats a language tag for display in the UI.

For example, if the tag is "en", then the function returns "en (English)"
:param tag: language tag
:return: formatted language name
"""
return tag + " (" + LANGUAGES[tag] + ")"


# add an argument for read-only
@@ -181,11 +208,13 @@ def show_text(t, width=WIDTH, with_markdown=False):
else:
subset_infos = infos[subset_name]

split_sizes = {k: v.num_examples for k, v in subset_infos.splits.items()}
try:
split_sizes = {k: v.num_examples for k, v in subset_infos.splits.items()}
except Exception:
# Fixing bug in some community datasets.
# For simplicity, just filling `split_sizes` with nothing, so the displayed split sizes will be 0.
split_sizes = {}
else:
# Zaid/coqa_expanded and Zaid/quac_expanded don't have dataset_infos.json
# so infos is an empty dic, and `infos[list(infos.keys())[0]]` raises an error
# For simplicity, just filling `split_sizes` with nothing, so the displayed split sizes will be 0.
split_sizes = {}
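
To make the shape of `split_sizes` concrete, a small sketch with stand-in split objects (real entries come from `datasets` SplitInfo values; numbers are made up):

```python
from types import SimpleNamespace

# Stand-ins for datasets.SplitInfo; only num_examples matters here.
splits = {
    "train": SimpleNamespace(num_examples=87599),
    "validation": SimpleNamespace(num_examples=10570),
}
split_sizes = {k: v.num_examples for k, v in splits.items()}
print(split_sizes)  # {'train': 87599, 'validation': 10570}
```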

# Collect template counts, original task counts and names
@@ -284,13 +313,18 @@ def show_text(t, width=WIDTH, with_markdown=False):
except OSError as e:
st.error(
f"Some datasets are not handled automatically by `datasets` and require users to download the "
f"dataset manually. This applies to {dataset_key}{f'/{subset_name}' if subset_name is not None else ''}. "
f"\n\nPlease download the raw dataset to `~/.cache/promptsource/{dataset_key}{f'/{subset_name}' if subset_name is not None else ''}`. "
f"dataset manually. It is possibly the case for {dataset_key}{f'/{subset_name}' if subset_name is not None else ''}. "
f"\n\nIf so, please download the raw dataset to `~/.cache/promptsource/{dataset_key}{f'/{subset_name}' if subset_name is not None else ''}`. "
f"\n\nYou can choose another cache directory by overriding `PROMPTSOURCE_MANUAL_DATASET_DIR` environment "
f"variable and downloading raw dataset to `$PROMPTSOURCE_MANUAL_DATASET_DIR/{dataset_key}{f'/{subset_name}' if subset_name is not None else ''}`"
f"\n\nOriginal error:\n{str(e)}"
)
st.stop()
except Exception as e:
st.error(
f"An error occured while loading the dataset {dataset_key}{f'/{subset_name}' if subset_name is not None else ''}. "
f"\\n\nOriginal error:\n{str(e)}"
)

splits = list(dataset.keys())
index = 0
@@ -403,6 +437,11 @@ def show_text(t, width=WIDTH, with_markdown=False):
st.text(template.metadata.choices_in_prompt)
st.markdown("##### Metrics")
st.text(", ".join(template.metadata.metrics) if template.metadata.metrics else None)
st.markdown("##### Prompt Languages")
if template.metadata.languages:
st.text(", ".join([format_language(tag) for tag in template.metadata.languages]))
else:
st.text(None)
st.markdown("##### Answer Choices")
if template.get_answer_choices_expr() is not None:
show_jinja(template.get_answer_choices_expr())
@@ -539,35 +578,24 @@ def show_text(t, width=WIDTH, with_markdown=False):
help="Prompt explicitly lists choices in the template for the output.",
)

# Metrics from here:
# https://github.com/google-research/text-to-text-transfer-transformer/blob/4b580f23968c2139be7fb1cd53b22c7a7f686cdf/t5/evaluation/metrics.py
metrics_choices = [
"BLEU",
"ROUGE",
"Squad",
"Trivia QA",
"Accuracy",
"Pearson Correlation",
"Spearman Correlation",
"MultiRC",
"AUC",
"COQA F1",
"Edit Distance",
]
# Add mean reciprocal rank
metrics_choices.append("Mean Reciprocal Rank")
# Add generic other
metrics_choices.append("Other")
# Sort alphabetically
metrics_choices = sorted(metrics_choices)
state.metadata.metrics = st.multiselect(
"Metrics",
metrics_choices,
sorted(METRICS),
default=template.metadata.metrics,
help="Select all metrics that are commonly used (or should "
"be used if a new task) to evaluate this prompt.",
)

state.metadata.languages = st.multiselect(
"Prompt Languages",
sorted(LANGUAGES.keys()),
default=template.metadata.languages,
format_func=format_language,
help="Select all languages used in this prompt. "
"This annotation is independent from the language(s) "
"of the dataset.",
)
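
A self-contained sketch of how `format_func` behaves in the new multiselect (illustrative `LANGUAGES` subset; run with `streamlit run`):

```python
import streamlit as st

LANGUAGES = {"en": "English", "fr": "French"}  # illustrative subset

selected = st.multiselect(
    "Prompt Languages",
    sorted(LANGUAGES.keys()),
    format_func=lambda tag: f"{tag} ({LANGUAGES[tag]})",
)
# The widget displays "fr (French)", but `selected` holds the raw tags,
# e.g. ["fr"], which is the form stored in the template metadata.
```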

# Answer choices
if template.get_answer_choices_expr() is not None:
answer_choices = template.get_answer_choices_expr()