diff --git a/wikiextractor/WikiExtractor.py b/wikiextractor/WikiExtractor.py
index 830235d..61cc61c 100755
--- a/wikiextractor/WikiExtractor.py
+++ b/wikiextractor/WikiExtractor.py
@@ -546,8 +546,13 @@ def main():
                         metavar="n[KMG]")
     groupO.add_argument("-c", "--compress", action="store_true",
                         help="compress output files using bzip")
-    groupO.add_argument("--json", action="store_true",
-                        help="write output in json format instead of the default <doc> format")
+    groupOFormat = groupO.add_mutually_exclusive_group()  # --json and --text cannot be combined
+    groupOFormat.add_argument("--json", action="store_true",
+                              help="write output in json format instead of the default <doc> format")
+    groupOFormat.add_argument("--text", action="store_true",
+                              help="write output in text format (body only, no title) instead of the default <doc> format")
+    groupO.add_argument("--discard_empty", action="store_true",
+                        help="discard empty articles (such as redirects) rather than writing just the title")
 
     groupP = parser.add_argument_group('Processing')
     groupP.add_argument("--html", action="store_true",
@@ -584,6 +589,8 @@ def main():
     if args.html:
         Extractor.keepLinks = True
     Extractor.to_json = args.json
+    Extractor.to_text = args.text
+    Extractor.discard_empty = args.discard_empty
 
     try:
         power = 'kmg'.find(args.bytes[-1].lower()) + 1
@@ -607,6 +614,13 @@ def main():
     if args.debug:
         logger.setLevel(logging.DEBUG)
 
+    if args.json:
+        logger.debug("Outputting to json format")
+    elif args.text:
+        logger.debug("Outputting to text format")
+    else:
+        logger.debug("Outputting to <doc> format")  # neither --json nor --text was given
+
     input_file = args.input
 
     if not Extractor.keepLinks:
diff --git a/wikiextractor/extract.py b/wikiextractor/extract.py
index a00e23d..c3bc284 100644
--- a/wikiextractor/extract.py
+++ b/wikiextractor/extract.py
@@ -974,7 +974,9 @@ def extract(self, out, html_safe=True):
         text = ''.join(self.page)
         text = self.clean_text(text, html_safe=html_safe)
 
-        if self.to_json:
+        if self.discard_empty and not text:
+            pass  # empty article (e.g. a redirect): write nothing at all
+        elif self.to_json:
             json_data = {
                 'id': self.id,
                 'revid': self.revid,
@@ -985,6 +987,9 @@ def extract(self, out, html_safe=True):
             out_str = json.dumps(json_data)
             out.write(out_str)
             out.write('\n')
+        elif self.to_text:
+            out.write('\n'.join(text))  # assumes text is a list of lines -- TODO confirm
+            out.write('\n\n\n')  # ends the body and leaves two blank lines after it
         else:
             header = '<doc id="%s" url="%s" title="%s">\n' % (self.id, self.url, self.title)
             # Separate header from text with a newline.