-
Notifications
You must be signed in to change notification settings - Fork 39
/
trec_mandarin.py
71 lines (54 loc) · 2.19 KB
/
trec_mandarin.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
from typing import NamedTuple
import ir_datasets
from ir_datasets.util import GzipExtract, DownloadConfig
from ir_datasets.formats import TrecQrels, TrecDocs, TrecQueries
from ir_datasets.datasets.base import Dataset, YamlDocumentation
# Dataset identifier: names the local data directory and the download-config
# context, selects the docs/<NAME>.yaml documentation file, and prefixes the
# registry keys used in _init().
NAME = 'trec-mandarin'
class TrecMandarinQuery(NamedTuple):
    """One topic record with parallel English (`_en`) and Chinese (`_zh`) fields."""

    # Topic identifier.
    query_id: str

    # Topic title in each language.
    title_en: str
    title_zh: str

    # Topic description in each language.
    description_en: str
    description_zh: str

    # Topic narrative in each language.
    narrative_en: str
    narrative_zh: str

    # NOTE(review): the docstring below only names the field that is returned;
    # it is kept verbatim in case documentation tooling reads it -- confirm.
    def default_text(self):
        """
        title_zh
        """
        return self.title_zh
# Meaning of each relevance label; handed to TrecQrels for every subset.
QREL_DEFS = {
    1: 'relevant',
    0: 'not relevant',
}
# Maps a tag pattern found in the topic files to the TrecMandarinQuery field
# that receives the text following it. Passed to TrecQueries as `qtype_map`.
# NOTE(review): the keys look like regular expressions (optional label groups,
# repeated spaces); how they are matched is decided by TrecQueries -- confirm.
QTYPE_MAP = {
    # Remove CH prefix from QIDs
    '<num> *(Number:)? *CH': 'query_id',

    # English / Chinese title
    '<E-title> *(Topic:)?': 'title_en',
    '<C-title> *(Topic:)?': 'title_zh',

    # English / Chinese description
    '<E-desc> *(Description:)?': 'description_en',
    '<C-desc> *(Description:)?': 'description_zh',

    # English / Chinese narrative
    '<E-narr> *(Narrative:)?': 'narrative_en',
    '<C-narr> *(Narrative:)?': 'narrative_zh',
}
def _init():
    """Build the base dataset and its subsets, register them, and return them.

    The document collection is shared: the base dataset exposes only the
    documents, while each subset ('trec5', 'trec6') combines its own queries
    and qrels with that same collection.

    Returns:
        A ``(base, subsets)`` tuple, where ``subsets`` maps the subset name to
        its Dataset.
    """
    docs_yaml = YamlDocumentation(f'docs/{NAME}.yaml')
    data_dir = ir_datasets.util.home_path() / NAME
    downloads = DownloadConfig.context(NAME, data_dir)

    corpus = TrecDocs(
        downloads['docs'],
        encoding='GB18030',
        path_globs=['**/xinhua/x*', '**/peoples-daily/pd*'],
        namespace=NAME,
        lang='zh',
        count_hint=ir_datasets.util.count_hint(NAME),
    )
    base = Dataset(corpus, docs_yaml('_'))

    # Both subsets are built the same way; only the download keys and the
    # documentation section differ.
    subsets = {}
    for track in ('trec5', 'trec6'):
        topics = TrecQueries(
            GzipExtract(downloads[f'{track}/queries']),
            qtype=TrecMandarinQuery,
            qtype_map=QTYPE_MAP,
            encoding='GBK',
            namespace=NAME,
            lang=None,  # queries have multiple languages
        )
        judgments = TrecQrels(GzipExtract(downloads[f'{track}/qrels']), QREL_DEFS)
        subsets[track] = Dataset(topics, judgments, corpus, docs_yaml(track))

    # Register the base dataset under NAME and each subset under NAME/<subset>,
    # in sorted subset order.
    ir_datasets.registry.register(NAME, base)
    for subset_name, dataset in sorted(subsets.items()):
        ir_datasets.registry.register(f'{NAME}/{subset_name}', dataset)

    return base, subsets
# Runs when the module is loaded: _init() registers the datasets, and the
# resulting objects are exposed as the module-level names `base` and `subsets`.
base, subsets = _init()