forked from huggingface/datasets
-
Notifications
You must be signed in to change notification settings - Fork 0
/
quarel.py
103 lines (91 loc) 路 4.12 KB
/
quarel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
"""TODO(quarel): Add a description here."""
from __future__ import absolute_import, division, print_function
import json
import os
import datasets
# TODO(quarel): BibTeX citation
_CITATION = """\
@inproceedings{quarel_v1,
title={QuaRel: A Dataset and Models for Answering Questions about Qualitative Relationships},
author={Oyvind Tafjord, Peter Clark, Matt Gardner, Wen-tau Yih, Ashish Sabharwal},
year={2018},
journal={arXiv:1805.05377v1}
}
"""
# TODO(quarel):
_DESCRIPTION = """
QuaRel is a crowdsourced dataset of 2771 multiple-choice story questions, including their logical forms.
"""
_URL = "https://s3-us-west-2.amazonaws.com/ai2-website/data/quarel-dataset-v1-nov2018.zip"
class Quarel(datasets.GeneratorBasedBuilder):
"""TODO(quarel): Short description of my dataset."""
# TODO(quarel): Set up version.
VERSION = datasets.Version("0.1.0")
def _info(self):
# TODO(quarel): Specifies the datasets.DatasetInfo object
return datasets.DatasetInfo(
# This is the description that will appear on the datasets page.
description=_DESCRIPTION,
# datasets.features.FeatureConnectors
features=datasets.Features(
{
# These are the features of your dataset like images, labels ...
"id": datasets.Value("string"),
"answer_index": datasets.Value("int32"),
"logical_forms": datasets.features.Sequence(datasets.Value("string")),
"logical_form_pretty": datasets.Value("string"),
"world_literals": datasets.features.Sequence(
{"world1": datasets.Value("string"), "world2": datasets.Value("string")}
),
"question": datasets.Value("string"),
}
),
# If there's a common (input, target) tuple from the features,
# specify them here. They'll be used if as_supervised=True in
# builder.as_dataset.
supervised_keys=None,
# Homepage of the dataset for documentation
homepage="https://allenai.org/data/quarel",
citation=_CITATION,
)
def _split_generators(self, dl_manager):
"""Returns SplitGenerators."""
# TODO(quarel): Downloads the data and defines the splits
# dl_manager is a datasets.download.DownloadManager that can be used to
# download and extract URLs
dl_dir = dl_manager.download_and_extract(_URL)
data_dir = os.path.join(dl_dir, "quarel-dataset-v1")
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
# These kwargs will be passed to _generate_examples
gen_kwargs={"filepath": os.path.join(data_dir, "quarel-v1-train.jsonl")},
),
datasets.SplitGenerator(
name=datasets.Split.TEST,
# These kwargs will be passed to _generate_examples
gen_kwargs={"filepath": os.path.join(data_dir, "quarel-v1-test.jsonl")},
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
# These kwargs will be passed to _generate_examples
gen_kwargs={"filepath": os.path.join(data_dir, "quarel-v1-dev.jsonl")},
),
]
def _generate_examples(self, filepath):
"""Yields examples."""
# TODO(quarel): Yields (key, example) tuples from the dataset
with open(filepath, encoding="utf-8") as f:
for id_, row in enumerate(f):
data = json.loads(row)
yield id_, {
"id": data["id"],
"answer_index": data["answer_index"],
"logical_forms": data["logical_forms"],
"world_literals": {
"world1": [data["world_literals"]["world1"]],
"world2": [data["world_literals"]["world2"]],
},
"logical_form_pretty": data["logical_form_pretty"],
"question": data["question"],
}