-
Notifications
You must be signed in to change notification settings - Fork 39
/
wapo.py
203 lines (166 loc) · 6.98 KB
/
wapo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
import io
import json
import tarfile
from typing import NamedTuple, Tuple
import ir_datasets
from ir_datasets.indices import PickleLz4FullStore
from ir_datasets.util import Lazy, DownloadConfig, Migrator
from ir_datasets.datasets.base import Dataset, FilteredQueries, FilteredQrels, YamlDocumentation
from ir_datasets.formats import BaseDocs, BaseQueries, BaseQrels, GenericQuery, GenericQrel, TrecQueries, TrecQrels
# Registry namespace under which all Washington Post datasets are registered.
NAME = 'wapo'
# Relevance level definitions for the TREC Common Core 2018 qrels.
CORE_QREL_DEFS = {
    0: "not relevant",
    1: "relevant",
    2: "highly relevant",
}
# Relevance level definitions for the TREC News background-linking qrels
# (graded levels 0/2/4/8/16; wording of levels 4 and 8 is truncated with
# "..." in the official definitions).
BL_QREL_DEFS = {
    0: 'The document provides little or no useful background information.',
    2: 'The document provides some useful background or contextual information that would help the user understand the broader story context of the target article.',
    4: 'The document provides significantly useful background ...',
    8: 'The document provides essential useful background ...',
    16: 'The document _must_ appear in the sidebar otherwise critical context is missing.',
}
# Stray tags stripped from TREC topic files when parsing queries
# (passed to TrecQueries as remove_tags).
RM_TAGS = [' </num>', 'Narrative\n', '</docid>', '</url>']
# Maps TREC background-linking topic tags to TrecBackgroundLinkingQuery
# fields (passed to TrecQueries as qtype_map).
BL_MAP = {
    ' *<num> Number: ': 'query_id',
    ' *<docid>': 'doc_id',
    ' *<url>': 'url',
}
# A media attachment (image, tweet, video, or gallery) extracted from an
# article body: its kind, its URL, and its caption/blurb text (may be None).
WapoDocMedia = NamedTuple('WapoDocMedia', [
    ('type', str),
    ('url', str),
    ('text', str),
])
class WapoDoc(NamedTuple):
    """A parsed Washington Post article."""
    doc_id: str
    url: str
    title: str
    author: str
    published_date: int  # integer timestamp from the source JSON — TODO confirm epoch/units
    kicker: str  # newline-joined 'kicker' content entries
    body: str  # plain-text body (paragraph text plus media captions)
    body_paras_html: Tuple[str, ...]  # raw 'sanitized_html' content entries
    body_media: Tuple[WapoDocMedia, ...]  # media attachments in body order
    def default_text(self):
        """
        title and body
        """
        return '{} {}'.format(self.title, self.body)
# Topic format for the TREC News background-linking tasks: each "query" is
# itself an article, identified by topic number, document id, and URL.
TrecBackgroundLinkingQuery = NamedTuple('TrecBackgroundLinkingQuery', [
    ('query_id', str),
    ('doc_id', str),
    ('url', str),
])
class WapoDocs(BaseDocs):
    """Document collection for the TREC Washington Post corpus (v2).

    Streams articles out of the source ``.tar.gz`` archive, parses each
    JSON-lines record into a :class:`WapoDoc`, and caches the parsed docs
    in a :class:`PickleLz4FullStore` for fast iteration and lookup.
    """
    def __init__(self, dlc):
        # dlc: download context for the archive; provides .path() and .stream()
        self._dlc = dlc
    def docs_path(self, force=True):
        """Local path of the source archive (``force`` is forwarded to the download context)."""
        return self._dlc.path(force)
    def docs_cls(self):
        """Type of the documents yielded by this collection."""
        return WapoDoc
    def docs_iter(self):
        """Iterate over all parsed docs, via the cached docs store."""
        return iter(self.docs_store())
    def _docs_iter(self):
        """Parse the raw archive into WapoDoc records (used to build the store)."""
        # bs4 is loaded lazily so it is only needed when (re)building the store.
        BeautifulSoup = ir_datasets.lazy_libs.bs4().BeautifulSoup
        for doc_json in self.docs_wapo_raw_iter():
            body = ''
            kicker = ''
            body_paras_html = []
            body_media = []
            for content in doc_json['contents']:
                if content is None:
                    continue  # some records contain null content entries
                if content.get('type') == 'kicker':
                    assert content['mime'] == 'text/plain'
                    if content['content'] is not None:
                        kicker += content['content'] + '\n'
                elif content.get('type') == 'sanitized_html':
                    if content.get('content') is not None:
                        # keep the raw HTML paragraph, and append a plain-text
                        # version of it to the running body
                        body_paras_html.append(content['content'])
                        if content.get('mime') == 'text/html':
                            body += BeautifulSoup(content['content'], 'lxml-xml').get_text() + '\n'
                        else:
                            body += content['content'] + '\n'
                elif content.get('type') in ['image', 'tweet', 'video', 'gallery']:
                    # per-media-type extraction of the URL and caption text;
                    # the lambdas close over `content` so only the matching
                    # type's keys are actually accessed
                    url = {
                        'image': lambda: content['imageURL'],
                        'video': lambda: content['contenturl'],
                        'gallery': lambda: content['contenturl'],
                        'tweet': lambda: f"https://twitter.com/{content['content']['user']['screen_name']}/status/{content['content']['id_str']}",
                    }[content['type']]()
                    text = {
                        'image': lambda: content.get('fullcaption'),
                        'video': lambda: content.get('blurb'),
                        'gallery': lambda: content.get('blurb'),
                        'tweet': lambda: content['content']['text'],
                    }[content['type']]()
                    body_media.append(WapoDocMedia(content['type'], url, text))
                    if text is not None:
                        body += text + '\n'
            yield WapoDoc(
                doc_json['id'],
                doc_json['article_url'],
                doc_json['title'],
                doc_json['author'],
                doc_json['published_date'],
                kicker.rstrip('\n'),
                body.rstrip('\n'),
                tuple(body_paras_html),
                tuple(body_media))
    def docs_wapo_raw_iter(self):
        """Yield raw article JSON objects from the ``.jl`` file inside the archive."""
        with self._dlc.stream() as stream:
            # mode 'r|gz' reads the tar as a non-seekable stream
            with tarfile.open(fileobj=stream, mode='r|gz') as tarf:
                for member in tarf:
                    if member.name != 'WashingtonPost.v2/data/TREC_Washington_Post_collection.v2.jl':
                        continue
                    file = tarf.extractfile(member)
                    for line in file:
                        doc_json = json.loads(line)
                        yield doc_json
    def docs_store(self, field='doc_id'):
        """LZ4-compressed pickle store over the parsed docs, keyed by ``field``."""
        return PickleLz4FullStore(
            path=f'{self.docs_path()}.pklz4',
            init_iter_fn=self._docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
        )
    def docs_count(self):
        """Number of docs, or None if the store has not been built yet."""
        if self.docs_store().built():
            return self.docs_store().count()
    def docs_namespace(self):
        """Namespace that scopes this collection's doc_ids."""
        return NAME
    def docs_lang(self):
        """Two-letter language code of the documents."""
        return 'en'
def _init():
    """Build and register the wapo datasets; returns (base, subsets)."""
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    collection_v2 = WapoDocs(dlc['v2'])
    base = Dataset(documentation('_'))
    subsets = {}
    # Bare v2 collection (docs only).
    subsets['v2'] = Dataset(
        collection_v2,
        documentation('v2'))
    # TREC Common Core 2018: ad-hoc topics over the v2 collection.
    subsets['v2/trec-core-2018'] = Dataset(
        collection_v2,
        TrecQueries(dlc['trec-core-2018/queries'], namespace='trec-core-2018', lang='en', remove_tags=RM_TAGS),
        TrecQrels(dlc['trec-core-2018/qrels'], CORE_QREL_DEFS),
        documentation('v2/trec-core-2018'))
    # TREC News background linking, 2018 and 2019 editions (identical shape).
    for year in '2018', '2019':
        key = f'v2/trec-news-{year}'
        subsets[key] = Dataset(
            collection_v2,
            TrecQueries(dlc[f'trec-news-{year}/queries'], namespace=f'trec-news-{year}', lang='en', qtype=TrecBackgroundLinkingQuery, qtype_map=BL_MAP, remove_tags=RM_TAGS),
            TrecQrels(dlc[f'trec-news-{year}/qrels'], BL_QREL_DEFS),
            documentation(key))
    # TREC News 2020 targets the v3 collection, which is not provided here,
    # so no docs component is attached.
    subsets['v3/trec-news-2020'] = Dataset(
        TrecQueries(dlc['trec-news-2020/queries'], namespace='trec-news-2020', lang='en', qtype=TrecBackgroundLinkingQuery, qtype_map=BL_MAP, remove_tags=RM_TAGS),
        TrecQrels(dlc['trec-news-2020/qrels'], BL_QREL_DEFS),
        documentation('v3/trec-news-2020'))
    ir_datasets.registry.register(NAME, base)
    # sorted() keeps registration order deterministic regardless of insertion.
    for subset_id in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset_id}', subsets[subset_id])
    return base, subsets
base, subsets = _init()