forked from huggingface/datasets
-
Notifications
You must be signed in to change notification settings - Fork 0
/
multi_nli_mismatch.py
124 lines (99 loc) · 4.35 KB
/
multi_nli_mismatch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# coding=utf-8
# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace NLP Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""The Multi-Genre NLI Corpus."""
from __future__ import absolute_import, division, print_function
import os
import nlp
# BibTeX entry for the MultiNLI paper; passed as `citation` to the DatasetInfo
# built in `MultiNliMismatch._info`.
_CITATION = """\
@InProceedings{N18-1101,
author = {Williams, Adina
and Nangia, Nikita
and Bowman, Samuel},
title = {A Broad-Coverage Challenge Corpus for
Sentence Understanding through Inference},
booktitle = {Proceedings of the 2018 Conference of
the North American Chapter of the
Association for Computational Linguistics:
Human Language Technologies, Volume 1 (Long
Papers)},
year = {2018},
publisher = {Association for Computational Linguistics},
pages = {1112--1122},
location = {New Orleans, Louisiana},
url = {http://aclweb.org/anthology/N18-1101}
}
"""
# Human-readable summary of the corpus; passed as `description` to the
# DatasetInfo built in `MultiNliMismatch._info`.
_DESCRIPTION = """\
The Multi-Genre Natural Language Inference (MultiNLI) corpus is a
crowd-sourced collection of 433k sentence pairs annotated with textual
entailment information. The corpus is modeled on the SNLI corpus, but differs in
that covers a range of genres of spoken and written text, and supports a
distinctive cross-genre generalization evaluation. The corpus served as the
basis for the shared task of the RepEval 2017 Workshop at EMNLP in Copenhagen.
"""
# Location of the zip archive that `_split_generators` downloads and extracts;
# the train and dev files are then looked up under its "multinli_1.0" directory.
ROOT_URL = "http://storage.googleapis.com/tfds-data/downloads/multi_nli/multinli_1.0.zip"
class MultiNLIMismatchConfig(nlp.BuilderConfig):
    """Builder configuration for MultiNLI Mismatch, pinned to version 1.0.0."""

    def __init__(self, **kwargs):
        """Initialise the configuration with a fixed dataset version.

        Args:
          **kwargs: keyword arguments passed through unchanged to the parent
            `nlp.BuilderConfig` constructor.
        """
        # The version is fixed here, so callers only supply the remaining
        # BuilderConfig fields (for example `name` and `description`).
        pinned_version = nlp.Version("1.0.0", "New split API (https://tensorflow.org/datasets/splits)")
        super(MultiNLIMismatchConfig, self).__init__(version=pinned_version, **kwargs)
class MultiNliMismatch(nlp.GeneratorBasedBuilder):
    """MultiNLI builder that uses the mismatched development set for validation.

    Produces a TRAIN split from ``multinli_1.0_train.txt`` and a VALIDATION
    split from ``multinli_1.0_dev_mismatched.txt``; every example carries the
    string fields "premise", "hypothesis" and "label".
    """

    BUILDER_CONFIGS = [
        MultiNLIMismatchConfig(name="plain_text", description="Plain text",),
    ]

    def _info(self):
        """Return the `nlp.DatasetInfo` describing features, homepage and citation."""
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features(
                {"premise": nlp.Value("string"), "hypothesis": nlp.Value("string"), "label": nlp.Value("string"),}
            ),
            # No default supervised_keys (as we have to pass both premise
            # and hypothesis as input).
            supervised_keys=None,
            homepage="https://www.nyu.edu/projects/bowman/multinli/",
            citation=_CITATION,
        )

    def _vocab_text_gen(self, filepath):
        """Yield one space-joined "premise hypothesis label" string per example.

        Args:
          filepath: path of the data file, forwarded to `_generate_examples`.
        """
        for _, ex in self._generate_examples(filepath):
            yield " ".join([ex["premise"], ex["hypothesis"], ex["label"]])

    def _split_generators(self, dl_manager):
        """Download and extract the archive and declare the two splits.

        Args:
          dl_manager: download manager; only `download_and_extract` is used.

        Returns:
          A list with the TRAIN and VALIDATION `nlp.SplitGenerator`s, each
          carrying the `filepath` passed on to `_generate_examples`.
        """
        downloaded_dir = dl_manager.download_and_extract(ROOT_URL)
        mnli_path = os.path.join(downloaded_dir, "multinli_1.0")
        train_path = os.path.join(mnli_path, "multinli_1.0_train.txt")
        validation_path = os.path.join(mnli_path, "multinli_1.0_dev_mismatched.txt")
        return [
            nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"filepath": train_path}),
            nlp.SplitGenerator(name=nlp.Split.VALIDATION, gen_kwargs={"filepath": validation_path}),
        ]

    def _generate_examples(self, filepath):
        """Generate mnli mismatch examples.

        Args:
          filepath: a string, path of a tab-separated data file.

        Yields:
          (idx, example) pairs, where idx is the zero-based line number in the
          file and example is a dictionary containing "premise", "hypothesis"
          and "label" strings.

        Raises:
          IndexError: if a data line has fewer than seven tab-separated fields.
        """
        # Use a context manager so the file handle is closed when the generator
        # finishes or is closed, instead of being left to garbage collection.
        with open(filepath, "rb") as data_file:
            for idx, line in enumerate(data_file):
                if idx == 0:
                    # The first line is skipped (presumably a column header row).
                    continue
                line = line.strip().decode("utf-8")
                split_line = line.split("\t")
                # Column 0 holds the label, columns 5 and 6 the sentence pair.
                yield idx, {"premise": split_line[5], "hypothesis": split_line[6], "label": split_line[0]}