-
Notifications
You must be signed in to change notification settings - Fork 9
/
TestParser.java
76 lines (63 loc) · 2.5 KB
/
TestParser.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
/*
* Copyright 2015 Fluo authors (see AUTHORS)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/
package io.fluo.webindex.data;
import java.net.URL;
import java.util.List;
import io.fluo.webindex.core.DataConfig;
import io.fluo.webindex.data.spark.IndexEnv;
import io.fluo.webindex.data.util.ArchiveUtil;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveRecord;
import org.archive.io.warc.WARCReaderFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Command-line Spark job that reads a range of archive paths and runs every record of each
 * archive through {@link ArchiveUtil#buildPageIgnoreErrors}, discarding the results.
 *
 * <p>Usage: {@code TestParser <pathsFile> <range>}. Both arguments are handed unchanged to
 * {@link IndexEnv#getPathsRange}; the format of the range is defined there.
 */
public class TestParser {

  private static final Logger log = LoggerFactory.getLogger(TestParser.class);

  /**
   * Entry point. Exits the JVM with status 1 when the argument count is wrong or when the
   * requested range yields no paths.
   *
   * @param args exactly two values: the paths file and the range to select from it
   * @throws Exception if loading the configuration or running the Spark job fails
   */
  public static void main(String[] args) throws Exception {

    if (args.length != 2) {
      log.error("Usage: TestParser <pathsFile> <range>");
      System.exit(1);
    }
    final List<String> loadList = IndexEnv.getPathsRange(args[0], args[1]);
    if (loadList.isEmpty()) {
      log.error("No files to load given {} {}", args[0], args[1]);
      System.exit(1);
    }

    DataConfig.load();

    SparkConf sparkConf = new SparkConf().setAppName("webindex-test-parser");
    try (JavaSparkContext ctx = new JavaSparkContext(sparkConf)) {

      log.info("Parsing {} files (Range {} of paths file {}) from AWS", loadList.size(), args[1],
          args[0]);

      // One slice per path, so every archive file is handled in its own partition.
      JavaRDD<String> loadRDD = ctx.parallelize(loadList, loadList.size());

      // Read the prefix once here and capture the value in the closure below.
      final String prefix = DataConfig.CC_URL_PREFIX;

      loadRDD.foreachPartition(
          iter -> iter.forEachRemaining(path -> parseArchive(prefix, path)));
    }
  }

  /**
   * Opens the archive at {@code prefix + path} and passes each of its records to
   * {@link ArchiveUtil#buildPageIgnoreErrors}. Any exception is logged and swallowed so that one
   * bad archive does not abort the remaining paths.
   *
   * @param prefix URL prefix prepended to {@code path}
   * @param path archive path taken from the paths file
   */
  private static void parseArchive(String prefix, String path) {
    String urlToCopy = prefix + path;
    log.info("Parsing {}", urlToCopy);
    // try-with-resources guarantees the reader is closed on both the success and the failure
    // path; previously the reader was never closed and leaked once per archive.
    try (ArchiveReader reader = WARCReaderFactory.get(new URL(urlToCopy), 0)) {
      for (ArchiveRecord record : reader) {
        ArchiveUtil.buildPageIgnoreErrors(record);
      }
    } catch (Exception e) {
      log.error("Exception while processing {}", path, e);
    }
  }
}