/
trec_arabic.py
51 lines (37 loc) · 1.55 KB
/
trec_arabic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import ir_datasets
from ir_datasets.util import DownloadConfig
from ir_datasets.formats import TrecQrels, TrecDocs, TrecQueries
from ir_datasets.datasets.base import Dataset, YamlDocumentation
# Identifier used for the download context, the documentation file name,
# the registry keys and the namespace passed to the format readers.
NAME = 'trec-arabic'

# Relevance level -> human-readable label, handed to TrecQrels.
QREL_DEFS = {
    1: 'relevant',
    0: 'not relevant',
}

# Topic-file tag pattern -> query field name, handed to TrecQueries.
QTYPE_MAP = {
    # The 'AR' prefix is part of the tag pattern so it is removed from QIDs.
    '<num> *(Number:)? *AR': 'query_id',
    '<title> *(Topic:)?': 'title',
    '<desc> *(Description:)?': 'description',
    '<narr> *(Narrative:)?': 'narrative',
}
def _init():
    """Build the trec-arabic datasets and add them to the ir_datasets registry.

    Creates one shared document collection, a base dataset wrapping it, and
    the 'ar2001' and 'ar2002' subsets (queries + qrels + the same collection).
    The base dataset is registered under NAME and each subset under
    '<NAME>/<subset key>', in sorted key order.

    Returns:
        A ``(base, subsets)`` tuple: the base Dataset and a dict mapping the
        subset key ('ar2001', 'ar2002') to its Dataset.
    """
    home = ir_datasets.util.home_path()/NAME
    downloads = DownloadConfig.context(NAME, home)
    docs_for = YamlDocumentation(f'docs/{NAME}.yaml')

    # One collection object is shared by the base dataset and every subset.
    corpus = TrecDocs(
        downloads['docs'],
        encoding='utf8',
        path_globs=['arabic_newswire_a/transcripts/*/*.sgm.gz'],
        namespace=NAME,
        lang='ar',
        count_hint=383872,
    )
    base = Dataset(corpus, docs_for('_'))

    def _year_subset(queries_key, qrels_key, doc_key):
        # Assemble queries, qrels, the shared corpus and documentation
        # for a single evaluation year.
        topics = TrecQueries(
            downloads[queries_key],
            qtype_map=QTYPE_MAP,
            encoding='ISO-8859-6',
            namespace=NAME,
            lang='ar',
        )
        judgments = TrecQrels(downloads[qrels_key], QREL_DEFS)
        return Dataset(topics, judgments, corpus, docs_for(doc_key))

    subsets = {
        'ar2001': _year_subset('ar2001/queries', 'ar2001/qrels', 'ar2001'),
        'ar2002': _year_subset('ar2002/queries', 'ar2002/qrels', 'ar2002'),
    }

    # Register the base first, then the subsets in sorted key order.
    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets
# Module-level call: importing this module runs _init() and exposes its
# results as the module attributes `base` and `subsets`.
base, subsets = _init()