bigscience-workshop · VictorSanh · Sep 21, 2021 · Sep 20, 2021 · Sep 20, 2021 · Sep 20, 2021
diff --git a/.gitignore b/.gitignore
@@ -133,3 +133,6 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# Locked files
+*.lock
diff --git a/promptsource/custom_datasets/story_cloze/dataset_infos.json b/promptsource/custom_datasets/story_cloze/dataset_infos.json
@@ -0,0 +1 @@
+{"2016": {"description": "", "citation": "", "homepage": "", "license": "", "features": {"story_id": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_1": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_2": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_3": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_4": {"dtype": "string", "id": null, "_type": "Value"}, "sentence_quiz1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence_quiz2": {"dtype": "string", "id": null, "_type": "Value"}, "answer_right_ending": {"dtype": "int32", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "story_cloze", "config_name": "2016", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 614084, "num_examples": 1871, "dataset_name": "story_cloze"}, "test": {"name": "test", "num_bytes": 613184, "num_examples": 1871, "dataset_name": "story_cloze"}}, "download_checksums": {}, "download_size": 0, "post_processing_size": null, "dataset_size": 1227268, "size_in_bytes": 1227268}, "2018": {"description": "", "citation": "", "homepage": "", "license": "", "features": {"story_id": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_1": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_2": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_3": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_4": {"dtype": "string", "id": null, "_type": "Value"}, "sentence_quiz1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence_quiz2": {"dtype": "string", "id": null, "_type": "Value"}, "answer_right_ending": {"dtype": "int32", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "story_cloze", "config_name": "2018", 
"version": "0.0.0", "splits": {"validation": {"name": "validation", "num_bytes": 515439, "num_examples": 1571, "dataset_name": "story_cloze"}}, "download_checksums": {}, "download_size": 0, "post_processing_size": null, "dataset_size": 515439, "size_in_bytes": 515439}}
diff --git a/promptsource/custom_datasets/story_cloze/story_cloze.py b/promptsource/custom_datasets/story_cloze/story_cloze.py
@@ -0,0 +1,127 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Story Cloze datasets."""
+
+
+import csv
+import os
+
+import datasets
+
+
+# Free-text summary of the task; passed as `description` to the
+# `datasets.DatasetInfo` built in `StoryCloze._info`.
+_DESCRIPTION = """
+Story Cloze Test' is a commonsense reasoning framework for evaluating story understanding,
+story generation, and script learning.This test requires a system to choose the correct ending
+to a four-sentence story.
+"""
+
+# BibTeX entry for the shared-task paper; passed as `citation` to the
+# `datasets.DatasetInfo` built in `StoryCloze._info`.
+_CITATION = """\
+@inproceedings{mostafazadeh2017lsdsem,
+  title={Lsdsem 2017 shared task: The story cloze test},
+  author={Mostafazadeh, Nasrin and Roth, Michael and Louis, Annie and Chambers, Nathanael and Allen, James},
+  booktitle={Proceedings of the 2nd Workshop on Linking Models of Lexical, Sentential and Discourse-level Semantics},
+  pages={46--51},
+  year={2017}
+}
+"""
+
+
+class StoryCloze(datasets.GeneratorBasedBuilder):
+    """."""
+
+    BUILDER_CONFIGS = [
+        datasets.BuilderConfig(name="2016", description="Story Cloze Test Spring 2016 set"),
+        datasets.BuilderConfig(name="2018", description="Story Cloze Test Winter 2018 set"),
+    ]
+
+    @property
+    def manual_download_instructions(self):
+        return (
+            "To use Sotry Cloze you have to download it manually. Please fill this "
+            "google form (http://goo.gl/forms/aQz39sdDrO). complete the form. "
+            "Then you will recieve a a download link for the dataset. Load it using : "
+            "`datasets.load_dataset('story_cloze', data_dir='path/to/folder/folder_name')`"
+        )
+
+    def _info(self):
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=datasets.Features(
+                {
+                    "story_id": datasets.Value("string"),
+                    "input_sentence_1": datasets.Value("string"),
+                    "input_sentence_2": datasets.Value("string"),
+                    "input_sentence_3": datasets.Value("string"),
+                    "input_sentence_4": datasets.Value("string"),
+                    "sentence_quiz1": datasets.Value("string"),
+                    "sentence_quiz2": datasets.Value("string"),
+                    "answer_right_ending": datasets.Value("int32"),
+                }
+            ),
+            homepage="https://cs.rochester.edu/nlp/rocstories/",
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        path_to_manual_folder = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))
+        if self.config.name == "2016":
+            test_file = os.path.join(path_to_manual_folder, "cloze_test_test__spring2016 - cloze_test_ALL_test.csv")
+            val_file = os.path.join(path_to_manual_folder, "cloze_test_val__spring2016 - cloze_test_ALL_val.csv")
+            return [
+                datasets.SplitGenerator(
+                    name=datasets.Split.VALIDATION,
+                    gen_kwargs={
+                        "filepath": val_file,
+                    },
+                ),
+                datasets.SplitGenerator(
+                    name=datasets.Split.TEST,
+                    gen_kwargs={
+                        "filepath": test_file,
+                    },
+                ),
+            ]
+
+        else:
+            val_file = os.path.join(path_to_manual_folder, "cloze_test_val__winter2018-cloze_test_ALL_val - 1 - 1.csv")
+
+            return [
+                datasets.SplitGenerator(
+                    name=datasets.Split.VALIDATION,
+                    gen_kwargs={
+                        "filepath": val_file,
+                    },
+                ),
+            ]
+
+    def _generate_examples(self, filepath):
+        """Generate Eduge news examples."""
+        with open(filepath, encoding="utf-8") as csv_file:
+            csv_reader = csv.reader(
+                csv_file, quotechar='"', delimiter=",", quoting=csv.QUOTE_ALL, skipinitialspace=True
+            )
+            _ = next(csv_reader)
+            for id_, row in enumerate(csv_reader):
+                if row and len(row) == 8:
+                    yield row[0], {
+                        "story_id": row[0],
+                        "input_sentence_1": row[1],
+                        "input_sentence_2": row[2],
+                        "input_sentence_3": row[3],
+                        "input_sentence_4": row[4],
+                        "sentence_quiz1": row[5],
+                        "sentence_quiz2": row[6],
+                        "answer_right_ending": int(row[7]),
+                    }
diff --git a/promptsource/seqio_tasks/utils.py b/promptsource/seqio_tasks/utils.py
@@ -1,6 +1,7 @@
 import re
 
 import datasets
+import pkg_resources
 import tensorflow as tf
 
 import promptsource.utils
@@ -60,6 +61,13 @@ def filter_fn(ex):
 
 
 def get_dataset_splits(dataset_name, subset_name=None):
+    """Return the `splits` entry of a dataset subset's info.
+
+    The infos are fetched with `datasets.get_dataset_infos(dataset_name)`.
+    When `subset_name` is falsy, the first subset listed in the infos is used.
+    """
+    # `datasets.get_dataset_infos` pulls infos from hf/datasets's master.
+    # story_cloze hasn't been merged yet (https://github.com/huggingface/datasets/pull/2907)
+    # This is a temporary fix to be able to do `import promptsource.seqio_tasks`
+    # Once PR 2907 is merged, we can remove this if condition (along with the `custom_datasets` folder)
+    # Also see `promptsource.utils.get_dataset_builder`
+    if dataset_name == "story_cloze":
+        dataset_name = pkg_resources.resource_filename("promptsource", "custom_datasets/story_cloze")
     info = datasets.get_dataset_infos(dataset_name)
     subset_name = subset_name or list(info.keys())[0]
     return info[subset_name].splits

diff --git a/promptsource/templates/story_cloze/2016/templates.yaml b/promptsource/templates/story_cloze/2016/templates.yaml
@@ -0,0 +1,112 @@
+dataset: story_cloze
+subset: '2016'
+templates:
+  1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8: !Template
+    answer_choices: null
+    answer_choices_key: '{{sentence_quiz1}} ||| {{sentence_quiz2}}'
+    id: 1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8
+    jinja: '{{input_sentence_1}}
+      {{input_sentence_2}}
+      {{input_sentence_3}}
+      {{input_sentence_4}}
+      What is a possible continuation for the story given the following options ?
+      - {{answer_choices | join("\n- ")}}
+      |||
+      {{answer_choices[answer_right_ending -1]}}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      metrics:
+      - Accuracy
+      original_task: true
+    name: Answer Given options
+    reference: ''
+  1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1: !Template
+    answer_choices: null
+    answer_choices_key: '{{sentence_quiz1}} ||| {{sentence_quiz2}}'
+    id: 1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1
+    jinja: "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n\
+      {{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the\
+      \ previous story from the following options: \n- {{answer_choices | join(\"\\\
+      n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}"
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      metrics:
+      - Accuracy
+      original_task: true
+    name: Choose Story Ending
+    reference: ''
+  9dab69d1-cad0-4d2f-a7cc-120df233571c: !Template
+    answer_choices: null
+    answer_choices_key: '{{sentence_quiz1}} ||| {{sentence_quiz2}}'
+    id: 9dab69d1-cad0-4d2f-a7cc-120df233571c
+    jinja: 'Yesterday, I watched a movie. Here''s what happened:
+      {{input_sentence_1}}
+      {{input_sentence_2}}
+      {{input_sentence_3}}
+      {{input_sentence_4}}
+      What happens next?
+      - {{answer_choices | join("\n- ")}}
+      |||
+      {{answer_choices[answer_right_ending -1]}}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      metrics:
+      - Accuracy
+      original_task: true
+    name: Movie What Happens Next
+    reference: ''
+  b5c8445f-2d3a-4691-bdd5-58956816702f: !Template
+    answer_choices: null
+    answer_choices_key: '{{sentence_quiz1}} ||| {{sentence_quiz2}}'
+    id: b5c8445f-2d3a-4691-bdd5-58956816702f
+    jinja: "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n\
+      {{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from\
+      \ the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending\
+      \ -1]}}"
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      metrics:
+      - Accuracy
+      original_task: true
+    name: Story Continuation and Options
+    reference: ''
+  baffa716-43cf-4954-a35c-655d775321e6: !Template
+    answer_choices: null
+    answer_choices_key: '{{sentence_quiz1}} ||| {{sentence_quiz2}}'
+    id: baffa716-43cf-4954-a35c-655d775321e6
+    jinja: 'Generate a possible ending for the following story:
+      {{input_sentence_1}}
+      {{input_sentence_2}}
+      {{input_sentence_3}}
+      {{input_sentence_4}}
+      |||
+      {{answer_choices[answer_right_ending -1]}}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: false
+      metrics:
+      - BLEU
+      - ROUGE
+      original_task: false
+    name: Generate Ending
+    reference: ''
+  c6f3d802-4f97-449f-a911-03470d418f7d: !Template
+    answer_choices: null
+    answer_choices_key: '{{sentence_quiz1}} ||| {{sentence_quiz2}}'
+    id: c6f3d802-4f97-449f-a911-03470d418f7d
+    jinja: 'I read the following novel:
+      {{input_sentence_1}}
+      {{input_sentence_2}}
+      {{input_sentence_3}}
+      {{input_sentence_4}}
+      What do you think is the most probable ending? You can choose from the following
+      options:
+      - {{answer_choices | join("\n- ")}}
+      |||
+      {{answer_choices[answer_right_ending -1]}}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      metrics:
+      - Accuracy
+      original_task: true
+    name: Novel Correct Ending
+    reference: ''
diff --git a/promptsource/utils.py b/promptsource/utils.py
@@ -1,6 +1,7 @@
 # coding=utf-8
 
 import datasets
+import pkg_resources
 import requests
 
 
@@ -31,6 +32,13 @@ def renameDatasetColumn(dataset):
 
 def get_dataset_builder(path, conf=None):
     "Get a dataset builder from name and conf."
+    # `datasets.load.prepare_module` pulls infos from hf/datasets's master.
+    # story_cloze hasn't been merged yet (https://github.com/huggingface/datasets/pull/2907)
+    # This is a temporary fix for the tests (more specifically test_templates.py)
+    # Once PR 2907 is merged, we can remove this if condition (along with the `custom_datasets` folder)
+    # Also see `promptsource.seqio_tasks.utils.get_dataset_splits`
+    if path == "story_cloze":
+        path = pkg_resources.resource_filename("promptsource", "custom_datasets/story_cloze")
     module_path = datasets.load.prepare_module(path, dataset=True)
     builder_cls = datasets.load.import_main_class(module_path[0], dataset=True)
     if conf:

diff --git a/setup.py b/setup.py
@@ -27,5 +27,6 @@
         "templates/*/*/*.yaml",
         "seqio_tasks/experiment_D3.csv",  # Experiment D3
         "seqio_tasks/experiment_D4.csv",
+        "custom_datasets/*/*"
     ]}
 )
-Original file line number
+Diff line change
@@ Expand Up / @@ -133,3 +133,6 @@ dmypy.json @@
     # Pyre type checker
     .pyre/
+    # Locked files
+    *.lock
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"2016": {"description": "", "citation": "", "homepage": "", "license": "", "features": {"story_id": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_1": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_2": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_3": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_4": {"dtype": "string", "id": null, "_type": "Value"}, "sentence_quiz1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence_quiz2": {"dtype": "string", "id": null, "_type": "Value"}, "answer_right_ending": {"dtype": "int32", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "story_cloze", "config_name": "2016", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 614084, "num_examples": 1871, "dataset_name": "story_cloze"}, "test": {"name": "test", "num_bytes": 613184, "num_examples": 1871, "dataset_name": "story_cloze"}}, "download_checksums": {}, "download_size": 0, "post_processing_size": null, "dataset_size": 1227268, "size_in_bytes": 1227268}, "2018": {"description": "", "citation": "", "homepage": "", "license": "", "features": {"story_id": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_1": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_2": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_3": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_4": {"dtype": "string", "id": null, "_type": "Value"}, "sentence_quiz1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence_quiz2": {"dtype": "string", "id": null, "_type": "Value"}, "answer_right_ending": {"dtype": "int32", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "story_cloze", "config_name": "2018", 
"version": "0.0.0", "splits": {"validation": {"name": "validation", "num_bytes": 515439, "num_examples": 1571, "dataset_name": "story_cloze"}}, "download_checksums": {}, "download_size": 0, "post_processing_size": null, "dataset_size": 515439, "size_in_bytes": 515439}}