/
trec_spanish.py
110 lines (88 loc) · 3.57 KB
/
trec_spanish.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from typing import NamedTuple
import ir_datasets
from ir_datasets.util import GzipExtract, DownloadConfig
from ir_datasets.formats import TrecQrels, TrecDocs, TrecQueries
from ir_datasets.datasets.base import Dataset, YamlDocumentation
# Dataset identifier: used as the registry key, as the namespace passed to the
# format handlers, in the documentation YAML file name, and as the directory
# name under the ir_datasets home path.
NAME = 'trec-spanish'
class TrecDescOnlyQuery(NamedTuple):
    """Intermediate query record holding only an ID and a description.

    Passed as ``qtype`` to ``TrecQueries`` for the ``trec4`` subset, whose
    field map extracts just these two fields.
    """
    # Topic number with the leading "SP" consumed by the field-map pattern.
    query_id: str
    # Raw description text exactly as parsed, before any splitting.
    description: str
class TrecSpanish3Query(NamedTuple):
    """Query record produced for the ``trec3`` subset.

    Field order is significant: ``TrecSpanishTranslateQueries`` fills the
    fields positionally and pads any missing trailing fields with ``''``.
    The ``_es`` / ``_en`` suffixes presumably denote the Spanish text and its
    English translation -- confirm against the dataset documentation.
    """
    query_id: str
    title_es: str
    title_en: str
    description_es: str
    description_en: str
    narrative_es: str
    narrative_en: str
class TrecSpanish4Query(NamedTuple):
    """Query record produced for the ``trec4`` subset.

    Field order is significant: ``TrecSpanishTranslateQueries`` fills the
    fields positionally and pads any missing trailing fields with ``''``.
    The names suggest two description variants (1 and 2), each with a Spanish
    (``es``) and English (``en``) form -- confirm against the dataset
    documentation.
    """
    query_id: str
    description_es1: str
    description_en1: str
    description_es2: str
    description_en2: str
# Human-readable meaning of each relevance label; handed to TrecQrels for
# both subsets.
QREL_DEFS = {
    1: 'relevant',
    0: 'not relevant',
}
# Field map given to TrecQueries for the trec3 topics: each key is a pattern
# for a field's opening marker, each value the query field it populates.
QTYPE_MAP_3 = {
    # Including "SP" in the pattern removes the SP prefix from query IDs.
    '<num> *(Number:)? *SP': 'query_id',
    '<title> *(Topic:)?': 'title',
    '<desc> *(Description:)?': 'description',
    '<narr> *(Narrative:)?': 'narrative',
}
# Field map given to TrecQueries for the trec4 topics, which only provide a
# number and a description.
QTYPE_MAP_4 = {
    # Including "SP" in the pattern removes the SP prefix from query IDs.
    '<num> *(Number:)? *SP': 'query_id',
    '<desc> *(Description:)?': 'description',
}
# TREC Spanish has this strange convention where lines that start with ** are
# translations of the query. Rather than trying to bake this into TrecQueries,
# I'm using an adapter to apply this just for TREC Spanish.
class TrecSpanishTranslateQueries:
    """Adapter that re-shapes a parent query handler's records for TREC Spanish.

    Every field after ``query_id`` in a parent record is split into several
    output slots, and the slots are used positionally to build ``query_cls``.
    Any attribute not defined here is delegated to the parent handler.

    Args:
        parent: Object exposing ``queries_iter()``; each yielded record must
            have a ``query_id`` attribute and support ``record[1:]`` yielding
            string field values.
        query_cls: Named-tuple class (must expose ``_fields``) used to build
            the records yielded by :meth:`queries_iter`.
    """

    def __init__(self, parent, query_cls):
        self._parent = parent
        self._query_cls = query_cls

    def __getattr__(self, attr):
        """Delegate attributes that are not found on the adapter to the parent."""
        # __getattr__ only runs when normal lookup fails. If `_parent` itself
        # is missing (e.g. a bare instance created via __new__ by copy/pickle),
        # reading self._parent below would re-enter this method forever, so
        # fail with a regular AttributeError instead.
        if attr == '_parent':
            raise AttributeError(attr)
        return getattr(self._parent, attr)

    def queries_iter(self):
        """Yield one ``query_cls`` record per parent query.

        For each field value of the parent record a new slot is started; a
        blank line inside the value closes the current slot and starts the
        next one. A leading ``**`` on a line is removed, and the lines of a
        slot are joined with single spaces. If fewer slots than ``query_cls``
        fields were produced, the remainder is filled with ``''``.

        NOTE(review): only slots closed by a blank line are stripped. A slot
        opened by a blank line starts with a space, and the last slot of each
        field keeps a trailing space. This is left unchanged to keep emitted
        query text stable -- confirm whether it is intended.
        """
        qcls = self._query_cls
        for query in self._parent.queries_iter():
            tup = [query.query_id]
            for value in query[1:]:
                tup.append('')
                for line in value.split('\n'):
                    if line.strip() == '':
                        # Blank line: finish the current slot, open the next.
                        tup[-1] = tup[-1].strip()
                        tup.append('')
                    # Translations begin with **
                    if line.lstrip().startswith('**'):
                        line = line.lstrip()[2:]
                    tup[-1] += line.strip() + ' '
            # Sometimes not all translations are available. Fill in remaining with blanks
            tup += [''] * (len(qcls._fields) - len(tup))
            yield qcls(*tup)

    def queries_cls(self):
        """Return the record class yielded by :meth:`queries_iter`."""
        return self._query_cls
def _init():
    """Build the TREC Spanish datasets and register them with ir_datasets.

    Returns:
        A ``(base, subsets)`` pair. ``base`` wraps the document collection
        only; ``subsets`` maps ``'trec3'`` and ``'trec4'`` to datasets that
        add queries and qrels on top of the same collection.
    """
    subsets = {}
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    collection = TrecDocs(
        dlc['docs'],
        encoding='ISO-8859-1',
        path_globs=['**/afp_text/af*', '**/infosel_data/ism_*'],
        namespace=NAME,
        lang='es',
        count_hint=120605,
    )

    base = Dataset(collection, documentation('_'))

    # lang=None on the query handlers: presumably because the raw topics mix
    # the original text with translations -- confirm.
    trec3_queries = TrecQueries(
        GzipExtract(dlc['trec3/queries']),
        qtype_map=QTYPE_MAP_3,
        encoding='ISO-8859-1',
        namespace=NAME,
        lang=None,
    )
    subsets['trec3'] = Dataset(
        TrecSpanishTranslateQueries(trec3_queries, TrecSpanish3Query),
        TrecQrels(GzipExtract(dlc['trec3/qrels']), QREL_DEFS),
        collection,
        documentation('trec3'))

    trec4_queries = TrecQueries(
        GzipExtract(dlc['trec4/queries']),
        qtype=TrecDescOnlyQuery,
        qtype_map=QTYPE_MAP_4,
        encoding='ISO-8859-1',
        namespace=NAME,
        lang=None,
    )
    subsets['trec4'] = Dataset(
        TrecSpanishTranslateQueries(trec4_queries, TrecSpanish4Query),
        TrecQrels(GzipExtract(dlc['trec4/qrels']), QREL_DEFS),
        collection,
        documentation('trec4'))

    ir_datasets.registry.register(NAME, base)
    # Register subsets in sorted order under "<NAME>/<subset>".
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
# Runs at import time: builds and registers the datasets, and exposes the
# results as the module attributes `base` and `subsets`.
base, subsets = _init()