Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -133,3 +133,6 @@ dmypy.json

# Pyre type checker
.pyre/

# Locked files
*.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"2016": {"description": "", "citation": "", "homepage": "", "license": "", "features": {"story_id": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_1": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_2": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_3": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_4": {"dtype": "string", "id": null, "_type": "Value"}, "sentence_quiz1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence_quiz2": {"dtype": "string", "id": null, "_type": "Value"}, "answer_right_ending": {"dtype": "int32", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "story_cloze", "config_name": "2016", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 614084, "num_examples": 1871, "dataset_name": "story_cloze"}, "test": {"name": "test", "num_bytes": 613184, "num_examples": 1871, "dataset_name": "story_cloze"}}, "download_checksums": {}, "download_size": 0, "post_processing_size": null, "dataset_size": 1227268, "size_in_bytes": 1227268}, "2018": {"description": "", "citation": "", "homepage": "", "license": "", "features": {"story_id": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_1": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_2": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_3": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_4": {"dtype": "string", "id": null, "_type": "Value"}, "sentence_quiz1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence_quiz2": {"dtype": "string", "id": null, "_type": "Value"}, "answer_right_ending": {"dtype": "int32", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "story_cloze", "config_name": "2018", 
"version": "0.0.0", "splits": {"validation": {"name": "validation", "num_bytes": 515439, "num_examples": 1571, "dataset_name": "story_cloze"}}, "download_checksums": {}, "download_size": 0, "post_processing_size": null, "dataset_size": 515439, "size_in_bytes": 515439}}
127 changes: 127 additions & 0 deletions promptsource/custom_datasets/story_cloze/story_cloze.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
# coding=utf-8
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Story Cloze datasets."""


import csv
import os

import datasets


# Human-readable summary of the dataset; passed as `description` to
# `datasets.DatasetInfo` in `StoryCloze._info`.
_DESCRIPTION = """
'Story Cloze Test' is a commonsense reasoning framework for evaluating story understanding,
story generation, and script learning. This test requires a system to choose the correct ending
to a four-sentence story.
"""

# BibTeX entry for the dataset; passed as `citation` to `datasets.DatasetInfo`
# in `StoryCloze._info`.
_CITATION = """\
@inproceedings{mostafazadeh2017lsdsem,
title={Lsdsem 2017 shared task: The story cloze test},
author={Mostafazadeh, Nasrin and Roth, Michael and Louis, Annie and Chambers, Nathanael and Allen, James},
booktitle={Proceedings of the 2nd Workshop on Linking Models of Lexical, Sentential and Discourse-level Semantics},
pages={46--51},
year={2017}
}
"""


class StoryCloze(datasets.GeneratorBasedBuilder):
    """Dataset builder for the Story Cloze Test.

    Two configurations are defined: ``"2016"`` (validation and test splits) and
    ``"2018"`` (validation split only). The CSV files are read from the manual
    directory supplied by the download manager; see
    ``manual_download_instructions`` for how to obtain them.
    """

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="2016", description="Story Cloze Test Spring 2016 set"),
        datasets.BuilderConfig(name="2018", description="Story Cloze Test Winter 2018 set"),
    ]

    @property
    def manual_download_instructions(self):
        """Return the user-facing text describing how to obtain the data files."""
        return (
            "To use Story Cloze you have to download it manually. Please fill this "
            "google form (http://goo.gl/forms/aQz39sdDrO). Complete the form. "
            "Then you will receive a download link for the dataset. Load it using : "
            "`datasets.load_dataset('story_cloze', data_dir='path/to/folder/folder_name')`"
        )

    def _info(self):
        """Return the ``DatasetInfo`` (description, feature schema, homepage, citation)."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "story_id": datasets.Value("string"),
                    "input_sentence_1": datasets.Value("string"),
                    "input_sentence_2": datasets.Value("string"),
                    "input_sentence_3": datasets.Value("string"),
                    "input_sentence_4": datasets.Value("string"),
                    "sentence_quiz1": datasets.Value("string"),
                    "sentence_quiz2": datasets.Value("string"),
                    "answer_right_ending": datasets.Value("int32"),
                }
            ),
            homepage="https://cs.rochester.edu/nlp/rocstories/",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Return the split generators for the selected configuration.

        Args:
            dl_manager: download manager; only its ``manual_dir`` attribute is
                used, as the folder that holds the manually downloaded CSVs.

        Returns:
            A list of ``datasets.SplitGenerator``: validation and test for the
            ``"2016"`` config, validation only for any other config.
        """
        path_to_manual_folder = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))
        if self.config.name == "2016":
            test_file = os.path.join(path_to_manual_folder, "cloze_test_test__spring2016 - cloze_test_ALL_test.csv")
            val_file = os.path.join(path_to_manual_folder, "cloze_test_val__spring2016 - cloze_test_ALL_val.csv")
            return [
                datasets.SplitGenerator(
                    name=datasets.Split.VALIDATION,
                    gen_kwargs={
                        "filepath": val_file,
                    },
                ),
                datasets.SplitGenerator(
                    name=datasets.Split.TEST,
                    gen_kwargs={
                        "filepath": test_file,
                    },
                ),
            ]

        # Any other config (currently only "2018") ships a validation file only.
        val_file = os.path.join(path_to_manual_folder, "cloze_test_val__winter2018-cloze_test_ALL_val - 1 - 1.csv")
        return [
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "filepath": val_file,
                },
            ),
        ]

    def _generate_examples(self, filepath):
        """Yield ``(key, example)`` pairs read from one Story Cloze CSV file.

        The first row is treated as a header and discarded. Rows that do not
        have exactly 8 columns are skipped. The first column (the story id) is
        used as the example key.

        Args:
            filepath: path of the CSV file to read.

        Yields:
            Tuples ``(story_id, example_dict)`` where ``example_dict`` has the
            keys declared in ``_info``.

        Raises:
            ValueError: if the last column of a row is not an integer literal.
        """
        # newline="" is what the csv module expects, so that line endings
        # inside quoted fields are interpreted by the reader itself.
        with open(filepath, encoding="utf-8", newline="") as csv_file:
            csv_reader = csv.reader(
                csv_file, quotechar='"', delimiter=",", quoting=csv.QUOTE_ALL, skipinitialspace=True
            )
            # Discard the header. The default value keeps an empty file from
            # raising StopIteration, which inside a generator would surface as
            # a RuntimeError instead of simply producing no examples.
            next(csv_reader, None)
            for row in csv_reader:
                if len(row) != 8:
                    continue
                yield row[0], {
                    "story_id": row[0],
                    "input_sentence_1": row[1],
                    "input_sentence_2": row[2],
                    "input_sentence_3": row[3],
                    "input_sentence_4": row[4],
                    "sentence_quiz1": row[5],
                    "sentence_quiz2": row[6],
                    "answer_right_ending": int(row[7]),
                }
8 changes: 8 additions & 0 deletions promptsource/seqio_tasks/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re

import datasets
import pkg_resources
import tensorflow as tf

import promptsource.utils
Expand Down Expand Up @@ -60,6 +61,13 @@ def filter_fn(ex):


def get_dataset_splits(dataset_name, subset_name=None):
    """Return the ``splits`` entry of the dataset info for a dataset/subset.

    Args:
        dataset_name: name (or path) handed to ``datasets.get_dataset_infos``.
        subset_name: key of the wanted config; when falsy, the first key of
            the returned infos is used.

    Returns:
        The ``splits`` attribute of the selected info object.
    """
    # Temporary workaround: `datasets.get_dataset_infos` pulls infos from
    # hf/datasets's master, and story_cloze hasn't been merged there yet
    # (https://github.com/huggingface/datasets/pull/2907). Until it is, resolve
    # story_cloze to the copy bundled under `custom_datasets` so that
    # `import promptsource.seqio_tasks` works. Once PR 2907 is merged, this
    # special case (along with the `custom_datasets` folder) can be removed.
    # Also see `promptsource.utils.get_dataset_builder`.
    uses_bundled_copy = dataset_name == "story_cloze"
    source = (
        pkg_resources.resource_filename("promptsource", "custom_datasets/story_cloze")
        if uses_bundled_copy
        else dataset_name
    )

    infos = datasets.get_dataset_infos(source)
    if not subset_name:
        subset_name = list(infos.keys())[0]
    return infos[subset_name].splits
Expand Down
112 changes: 112 additions & 0 deletions promptsource/templates/story_cloze/2016/templates.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
dataset: story_cloze
subset: '2016'
templates:
1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8: !Template
answer_choices: null
answer_choices_key: '{{sentence_quiz1}} ||| {{sentence_quiz2}}'
id: 1a4946f9-a0e2-4fbb-aee8-b26ead2cf6b8
jinja: '{{input_sentence_1}}
{{input_sentence_2}}
{{input_sentence_3}}
{{input_sentence_4}}
What is a possible continuation for the story given the following options ?
- {{answer_choices | join("\n- ")}}
|||
{{answer_choices[answer_right_ending -1]}}'
metadata: !TemplateMetadata
choices_in_prompt: true
metrics:
- Accuracy
original_task: true
name: Answer Given options
reference: ''
1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1: !Template
answer_choices: null
answer_choices_key: '{{sentence_quiz1}} ||| {{sentence_quiz2}}'
id: 1a9d53bc-eb77-4e7c-af6e-3d15b79d6cf1
jinja: "Read the following story :\n\n{{input_sentence_1}}\n{{input_sentence_2}}\n\
{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose a possible ending for the\
\ previous story from the following options: \n- {{answer_choices | join(\"\\\
n- \")}}\n|||\n\n{{answer_choices[answer_right_ending -1]}}"
metadata: !TemplateMetadata
choices_in_prompt: true
metrics:
- Accuracy
original_task: true
name: Choose Story Ending
reference: ''
9dab69d1-cad0-4d2f-a7cc-120df233571c: !Template
answer_choices: null
answer_choices_key: '{{sentence_quiz1}} ||| {{sentence_quiz2}}'
id: 9dab69d1-cad0-4d2f-a7cc-120df233571c
jinja: 'Yesterday, I watched a movie. Here''s what happened:
{{input_sentence_1}}
{{input_sentence_2}}
{{input_sentence_3}}
{{input_sentence_4}}
What happens next?
- {{answer_choices | join("\n- ")}}
|||
{{answer_choices[answer_right_ending -1]}}'
metadata: !TemplateMetadata
choices_in_prompt: true
metrics:
- Accuracy
original_task: true
name: Movie What Happens Next
reference: ''
b5c8445f-2d3a-4691-bdd5-58956816702f: !Template
answer_choices: null
answer_choices_key: '{{sentence_quiz1}} ||| {{sentence_quiz2}}'
id: b5c8445f-2d3a-4691-bdd5-58956816702f
jinja: "What is a possible continuation for the following story ? \n\n{{input_sentence_1}}\n\
{{input_sentence_2}}\n{{input_sentence_3}}\n{{input_sentence_4}}\n\nChoose from\
\ the following options:\n- {{answer_choices | join(\"\\n- \")}}\n|||\n\n{{answer_choices[answer_right_ending\
\ -1]}}"
metadata: !TemplateMetadata
choices_in_prompt: true
metrics:
- Accuracy
original_task: true
name: Story Continuation and Options
reference: ''
baffa716-43cf-4954-a35c-655d775321e6: !Template
answer_choices: null
answer_choices_key: '{{sentence_quiz1}} ||| {{sentence_quiz2}}'
id: baffa716-43cf-4954-a35c-655d775321e6
jinja: 'Generate a possible ending for the following story:
{{input_sentence_1}}
{{input_sentence_2}}
{{input_sentence_3}}
{{input_sentence_4}}
|||
{{answer_choices[answer_right_ending -1]}}'
metadata: !TemplateMetadata
choices_in_prompt: false
metrics:
- BLEU
- ROUGE
original_task: false
name: Generate Ending
reference: ''
c6f3d802-4f97-449f-a911-03470d418f7d: !Template
answer_choices: null
answer_choices_key: '{{sentence_quiz1}} ||| {{sentence_quiz2}}'
id: c6f3d802-4f97-449f-a911-03470d418f7d
jinja: 'I read the following novel:
{{input_sentence_1}}
{{input_sentence_2}}
{{input_sentence_3}}
{{input_sentence_4}}
What do you think is the most probable ending? You can choose from the following
options:
- {{answer_choices | join("\n- ")}}
|||
{{answer_choices[answer_right_ending -1]}}'
metadata: !TemplateMetadata
choices_in_prompt: true
metrics:
- Accuracy
original_task: true
name: Novel Correct Ending
reference: ''
8 changes: 8 additions & 0 deletions promptsource/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# coding=utf-8

import datasets
import pkg_resources
import requests


Expand Down Expand Up @@ -31,6 +32,13 @@ def renameDatasetColumn(dataset):

def get_dataset_builder(path, conf=None):
"Get a dataset builder from name and conf."
# `datasets.load.prepare_module` pulls infos from hf/datasets's master.
# story_cloze hasn't been merged yet (https://github.com/huggingface/datasets/pull/2907)
# This is a temporary fix for the tests (more specifically test_templates.py)
# Once PR 2907 is merged, we can remove this if condition (along with the `custom_datasets` folder)
# Also see `promptsource.seqio_tasks.utils.get_dataset_splits`
if path == "story_cloze":
path = pkg_resources.resource_filename("promptsource", "custom_datasets/story_cloze")
module_path = datasets.load.prepare_module(path, dataset=True)
builder_cls = datasets.load.import_main_class(module_path[0], dataset=True)
if conf:
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,6 @@
"templates/*/*/*.yaml",
"seqio_tasks/experiment_D3.csv", # Experiment D3
"seqio_tasks/experiment_D4.csv",
"custom_datasets/*/*"
]}
)