From b4eac55da67d9c2c521e0ece1d4b65a1a6bd14cc Mon Sep 17 00:00:00 2001 From: Kapil Kukreja <37610242+Kapilhk@users.noreply.github.com> Date: Fri, 25 Jun 2021 14:35:41 +0530 Subject: [PATCH] Corrected the logic to avoid redirect pages. Because `and` binds tighter than `or`, the original condition accepted every title in which no colon was found (colon < 0) without applying the duplicate-id, redirect, and template-namespace checks; the added parentheses make those checks apply to all titles. --- wikiextractor/WikiExtractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wikiextractor/WikiExtractor.py b/wikiextractor/WikiExtractor.py index 509158a0..7a86874a 100755 --- a/wikiextractor/WikiExtractor.py +++ b/wikiextractor/WikiExtractor.py @@ -421,7 +421,7 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress, page.append(line) elif tag == '/page': colon = title.find(':') - if (colon < 0 or (title[:colon] in acceptedNamespaces) and id != last_id and + if (colon < 0 or (title[:colon] in acceptedNamespaces)) and (id != last_id and not redirect and not title.startswith(templateNamespace)): job = (id, revid, urlbase, title, page, ordinal) jobs_queue.put(job) # goes to any available extract_process