-
Notifications
You must be signed in to change notification settings - Fork 271
/
hp_preprocess.py
96 lines (76 loc) · 3.14 KB
/
hp_preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
"""
Instructions for preparing the hyperpartisan dataset:
1- Download the original data from PAN at SemEval 2019 Task 4 https://zenodo.org/record/1489920
- the training subset: `articles-training-byarticle-20181122.zip`
- labels: `ground-truth-training-byarticle-20181122.zip`
2- Decompress the files (the output should be a single .xml file)
3- Run this script with the appropriate file paths
"""
import xml.etree.ElementTree as ET
from tqdm import tqdm
import pandas as pd
import os
import simplejson as json
import codecs
import re
import io
import jsonlines
from collections import defaultdict
import pathlib
# Module-level JSON Lines writer backed by an in-memory bytes buffer.
# NOTE(review): nothing in the visible code reads `fp` or this `writer`
# (write_jsonlist opens its own writer per output file) -- confirm whether
# these two objects are still needed before removing them.
fp = io.BytesIO() # writable file-like object
writer = jsonlines.Writer(fp)
# Regex flags that re_sub() applies to every substitution.
FLAGS = re.MULTILINE | re.DOTALL


def re_sub(pattern, repl, text, flags=None):
    """Run ``re.sub`` with the module-wide ``FLAGS`` always switched on.

    Args:
        pattern: regular expression to search for.
        repl: replacement string (or callable) handed to ``re.sub``.
        text: input string.
        flags: optional extra ``re`` flags; they are OR-ed with ``FLAGS``.

    Returns:
        ``text`` with every match of ``pattern`` replaced by ``repl``.
    """
    effective_flags = FLAGS if flags is None else (FLAGS | flags)
    return re.sub(pattern, repl, text, flags=effective_flags)
# Flags used for the URL / whitespace / repetition passes; the same
# combination the module's re_sub() helper always applies.
_CLEAN_RE_FLAGS = re.MULTILINE | re.DOTALL

# Ordered cleanup passes as (compiled pattern, replacement). Order matters:
# each pass runs on the output of the previous one. Compiled once here because
# clean_txt() is called for every article.
_CLEAN_STEPS = (
    # Slash-joined words such as "and/or" collapse to a single space.
    # NOTE(review): this also fires inside URLs ("com/path"), so a URL with
    # such a segment is split before the URL passes below and its tail
    # (e.g. "/page") survives. Left as-is to keep the output unchanged.
    (re.compile(r"[a-zA-Z]+\/[a-zA-Z]+"), " "),
    (re.compile(r"\n"), " "),
    # Drop non-breaking spaces, spelled out explicitly: the decoded character
    # (U+00A0) and a leftover literal "&#160;" entity. An invisible whitespace
    # literal here is easy to mistake for (or mangle into) a plain space,
    # which would delete every space and glue all words together.
    (re.compile(r"\u00a0|&#160;"), ""),
    # Remove URL
    (re.compile(r"(http)\S+", _CLEAN_RE_FLAGS), ""),
    (re.compile(r"(www)\S+", _CLEAN_RE_FLAGS), ""),
    (re.compile(r"(href)\S+", _CLEAN_RE_FLAGS), ""),
    # Remove multiple spaces
    (re.compile(r"[ \s\t\n]+", _CLEAN_RE_FLAGS), " "),
    # Remove repetition: runs of "!", "?" or "." keep only the last character
    # of the run ...
    (re.compile(r"([!?.]){2,}", _CLEAN_RE_FLAGS), r"\1"),
    # ... and a token ending in 3+ copies of one character keeps a single copy
    # ("sooooo" -> "so"). NOTE(review): this also shortens digit runs, e.g.
    # "1000" -> "10".
    (re.compile(r"\b(\S*?)(.)\2{2,}\b", _CLEAN_RE_FLAGS), r"\1\2"),
)


def clean_txt(text):
    """Normalize raw article text.

    Applies, in order: removal of slash-joined words, newline -> space,
    removal of non-breaking spaces, removal of tokens starting at "http",
    "www" or "href", whitespace collapsing, and squeezing of repeated
    punctuation / trailing repeated characters.

    Args:
        text: the article text as a ``str``.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    for pattern, replacement in _CLEAN_STEPS:
        text = pattern.sub(replacement, text)
    return text.strip()
def write_jsonlist(list_of_json_objects, output_filename):
    """Dump objects to ``output_filename`` in JSON Lines format.

    The file is opened in write mode (existing content is replaced) and each
    element of ``list_of_json_objects`` is written as one record, in order.
    """
    with jsonlines.open(output_filename, mode='w') as sink:
        for record in list_of_json_objects:
            sink.write(record)
def main(argv=None):
    """Convert the hyperpartisan XML files into one ``.jsonl`` file per split.

    Reads the articles XML and the labels XML, cleans each article's text with
    ``clean_txt``, pairs it with the ``hyperpartisan`` attribute of the label
    at the same position, and writes the records of every split listed in the
    splits JSON file to ``<output-dir>/<split>.jsonl``.

    Args:
        argv: optional list of command-line arguments. ``None`` (the default)
            lets argparse read ``sys.argv``, which is what the script entry
            point relies on.

    Raises:
        ValueError: if the two XML files contain a different number of
            ``article`` elements, or if an article's ``id`` disagrees with
            the ``id`` of the label at the same position.
        KeyError: if the splits file references an id with no article.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--train-file', default='articles-training-byarticle-20181122.xml')
    parser.add_argument('--labels-file', default='ground-truth-training-byarticle-20181122.xml')
    parser.add_argument('--splits-file', default='hp-splits.json')
    # Required: without it os.path.join() below would fail with a TypeError
    # only after all the preprocessing work had already been done.
    parser.add_argument('--output-dir', required=True, help='path to write outfile files')
    args = parser.parse_args(argv)

    print('loading articles...')
    articles_root = ET.parse(args.train_file).getroot()
    print('loading labels...')
    labels_root = ET.parse(args.labels_file).getroot()
    articles = articles_root.findall('article')
    labels = labels_root.findall('article')
    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if len(articles) != len(labels):
        raise ValueError(
            'number of articles ({}) does not match number of labels ({})'.format(
                len(articles), len(labels)))

    data = {}
    for article, label in tqdm(zip(articles, labels), total=len(labels), desc="preprocessing"):
        id_ = int(label.attrib['id'])
        # Articles and labels are paired purely by position, so cross-check
        # the ids when the article element exposes one.
        # Assumes article elements use the same `id` attribute as the label
        # elements -- TODO confirm against the data files.
        article_id = article.attrib.get('id')
        if article_id is not None and int(article_id) != id_:
            raise ValueError(
                'article id {} is paired with label id {}'.format(article_id, id_))
        # encoding="unicode" yields a str directly (no encode/decode round-trip).
        text = clean_txt(ET.tostring(article, method='text', encoding='unicode'))
        data[id_] = {'text': text, 'label': label.attrib['hyperpartisan'], 'id': id_}

    # Group records by split; a split with no ids gets no entry and therefore
    # no output file.
    splits = defaultdict(list)
    with open(args.splits_file, encoding='utf-8') as f_in:
        for split, ids in json.load(f_in).items():
            for id_ in ids:
                splits[split].append(data[id_])

    for subset, data_list in splits.items():
        output_filename = os.path.join(args.output_dir, subset + '.jsonl')
        pathlib.Path(output_filename).parent.mkdir(parents=True, exist_ok=True)
        write_jsonlist(data_list, output_filename)
# Run the preprocessing only when executed as a script, not when imported.
if __name__ == "__main__":
    main()