From ec32ce11cff6ee712e75ac20bbd1e47b5b21a2b3 Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Fri, 15 Oct 2021 16:56:13 -0400 Subject: [PATCH 1/2] remove custom datasets (story_cloze) --- .../story_cloze/dataset_infos.json | 1 - .../story_cloze/story_cloze.py | 127 ------------------ promptsource/seqio_tasks/utils.py | 7 - promptsource/utils.py | 7 - 4 files changed, 142 deletions(-) delete mode 100644 promptsource/custom_datasets/story_cloze/dataset_infos.json delete mode 100644 promptsource/custom_datasets/story_cloze/story_cloze.py diff --git a/promptsource/custom_datasets/story_cloze/dataset_infos.json b/promptsource/custom_datasets/story_cloze/dataset_infos.json deleted file mode 100644 index f95933711..000000000 --- a/promptsource/custom_datasets/story_cloze/dataset_infos.json +++ /dev/null @@ -1 +0,0 @@ -{"2016": {"description": "", "citation": "", "homepage": "", "license": "", "features": {"story_id": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_1": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_2": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_3": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_4": {"dtype": "string", "id": null, "_type": "Value"}, "sentence_quiz1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence_quiz2": {"dtype": "string", "id": null, "_type": "Value"}, "answer_right_ending": {"dtype": "int32", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "story_cloze", "config_name": "2016", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 614084, "num_examples": 1871, "dataset_name": "story_cloze"}, "test": {"name": "test", "num_bytes": 613184, "num_examples": 1871, "dataset_name": "story_cloze"}}, "download_checksums": {}, "download_size": 0, "post_processing_size": null, "dataset_size": 1227268, "size_in_bytes": 1227268}, "2018": {"description": "", "citation": "", "homepage": "", "license": "", "features": {"story_id": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_1": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_2": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_3": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_4": {"dtype": "string", "id": null, "_type": "Value"}, "sentence_quiz1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence_quiz2": {"dtype": "string", "id": null, "_type": "Value"}, "answer_right_ending": {"dtype": "int32", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "story_cloze", "config_name": "2018", "version": "0.0.0", "splits": {"validation": {"name": "validation", "num_bytes": 515439, "num_examples": 1571, "dataset_name": "story_cloze"}}, "download_checksums": {}, "download_size": 0, "post_processing_size": null, "dataset_size": 515439, "size_in_bytes": 515439}} \ No newline at end of file diff --git a/promptsource/custom_datasets/story_cloze/story_cloze.py b/promptsource/custom_datasets/story_cloze/story_cloze.py deleted file mode 100644 index 68475dafe..000000000 --- a/promptsource/custom_datasets/story_cloze/story_cloze.py +++ /dev/null @@ -1,127 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Story Cloze datasets.""" - - -import csv -import os - -import datasets - - -_DESCRIPTION = """ -Story Cloze Test' is a commonsense reasoning framework for evaluating story understanding, -story generation, and script learning.This test requires a system to choose the correct ending -to a four-sentence story. -""" - -_CITATION = """\ -@inproceedings{mostafazadeh2017lsdsem, - title={Lsdsem 2017 shared task: The story cloze test}, - author={Mostafazadeh, Nasrin and Roth, Michael and Louis, Annie and Chambers, Nathanael and Allen, James}, - booktitle={Proceedings of the 2nd Workshop on Linking Models of Lexical, Sentential and Discourse-level Semantics}, - pages={46--51}, - year={2017} -} -""" - - -class StoryCloze(datasets.GeneratorBasedBuilder): - """.""" - - BUILDER_CONFIGS = [ - datasets.BuilderConfig(name="2016", description="Story Cloze Test Spring 2016 set"), - datasets.BuilderConfig(name="2018", description="Story Cloze Test Winter 2018 set"), - ] - - @property - def manual_download_instructions(self): - return ( - "To use Sotry Cloze you have to download it manually. Please fill this " - "google form (http://goo.gl/forms/aQz39sdDrO). complete the form. " - "Then you will recieve a a download link for the dataset. Load it using : " - "`datasets.load_dataset('story_cloze', data_dir='path/to/folder/folder_name')`" - ) - - def _info(self): - return datasets.DatasetInfo( - description=_DESCRIPTION, - features=datasets.Features( - { - "story_id": datasets.Value("string"), - "input_sentence_1": datasets.Value("string"), - "input_sentence_2": datasets.Value("string"), - "input_sentence_3": datasets.Value("string"), - "input_sentence_4": datasets.Value("string"), - "sentence_quiz1": datasets.Value("string"), - "sentence_quiz2": datasets.Value("string"), - "answer_right_ending": datasets.Value("int32"), - } - ), - homepage="https://cs.rochester.edu/nlp/rocstories/", - citation=_CITATION, - ) - - def _split_generators(self, dl_manager): - path_to_manual_folder = os.path.abspath(os.path.expanduser(dl_manager.manual_dir)) - if self.config.name == "2016": - test_file = os.path.join(path_to_manual_folder, "cloze_test_test__spring2016 - cloze_test_ALL_test.csv") - val_file = os.path.join(path_to_manual_folder, "cloze_test_val__spring2016 - cloze_test_ALL_val.csv") - return [ - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={ - "filepath": val_file, - }, - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={ - "filepath": test_file, - }, - ), - ] - - else: - val_file = os.path.join(path_to_manual_folder, "cloze_test_val__winter2018-cloze_test_ALL_val - 1 - 1.csv") - - return [ - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={ - "filepath": val_file, - }, - ), - ] - - def _generate_examples(self, filepath): - """Generate Eduge news examples.""" - with open(filepath, encoding="utf-8") as csv_file: - csv_reader = csv.reader( - csv_file, quotechar='"', delimiter=",", quoting=csv.QUOTE_ALL, skipinitialspace=True - ) - _ = next(csv_reader) - for id_, row in enumerate(csv_reader): - if row and len(row) == 8: - yield row[0], { - "story_id": row[0], - "input_sentence_1": row[1], - "input_sentence_2": row[2], - "input_sentence_3": row[3], - "input_sentence_4": row[4], - "sentence_quiz1": row[5], - "sentence_quiz2": row[6], - "answer_right_ending": int(row[7]), - } diff --git a/promptsource/seqio_tasks/utils.py b/promptsource/seqio_tasks/utils.py index 350464bed..a97591cad 100644 --- a/promptsource/seqio_tasks/utils.py +++ b/promptsource/seqio_tasks/utils.py @@ -64,13 +64,6 @@ def filter_fn(ex): def get_dataset_splits(dataset_name, subset_name=None): - # `datasets.get_dataset_infos` pulls infos from hf/datasets's master. - # story_cloze hasn't been merged yet (https://github.com/huggingface/datasets/pull/2907) - # This is a temporary fix to be able to do `import promptsource.seqio_tasks` - # Once PR 2907 is merged, we can remove this if condition (along with the `custom_datasets` folder) - # Also see `promptsource.utils.get_dataset_builder` - if dataset_name == "story_cloze": - dataset_name = pkg_resources.resource_filename("promptsource", "custom_datasets/story_cloze") info = datasets.get_dataset_infos(dataset_name) subset_name = subset_name or list(info.keys())[0] return info[subset_name].splits diff --git a/promptsource/utils.py b/promptsource/utils.py index efd4e06f3..0101ef01a 100644 --- a/promptsource/utils.py +++ b/promptsource/utils.py @@ -34,13 +34,6 @@ def renameDatasetColumn(dataset): def get_dataset_builder(path, conf=None): "Get a dataset builder from name and conf." - # `datasets.load.prepare_module` pulls infos from hf/datasets's master. - # story_cloze hasn't been merged yet (https://github.com/huggingface/datasets/pull/2907) - # This is a temporary fix for the tests (more specifically test_templates.py) - # Once PR 2907 is merged, we can remove this if condition (along with the `custom_datasets` folder) - # Also see `promptsource.seqio_tasks.utils.get_dataset_splits` - if path == "story_cloze": - path = pkg_resources.resource_filename("promptsource", "custom_datasets/story_cloze") module_path = datasets.load.prepare_module(path, dataset=True) builder_cls = datasets.load.import_main_class(module_path[0], dataset=True) if conf: From a92ccbcb319f778ffbedd97635d5b85f15d48ece Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Fri, 15 Oct 2021 17:02:46 -0400 Subject: [PATCH 2/2] remove pkg --- promptsource/seqio_tasks/utils.py | 1 - promptsource/utils.py | 1 - 2 files changed, 2 deletions(-) diff --git a/promptsource/seqio_tasks/utils.py b/promptsource/seqio_tasks/utils.py index a97591cad..1b4df95aa 100644 --- a/promptsource/seqio_tasks/utils.py +++ b/promptsource/seqio_tasks/utils.py @@ -1,7 +1,6 @@ import re import datasets -import pkg_resources import tensorflow as tf import promptsource.utils diff --git a/promptsource/utils.py b/promptsource/utils.py index 0101ef01a..1ecf3a45b 100644 --- a/promptsource/utils.py +++ b/promptsource/utils.py @@ -1,7 +1,6 @@ # coding=utf-8 import datasets -import pkg_resources import requests from promptsource.templates import INCLUDED_USERS