TODO: handle disambiguation pages

In [2]:
from lxml import etree
import lxml.html
import wikitextparser as wtp
import nltk.data
import re
import json
import gzip
from tqdm import tqdm
from collections import defaultdict

In [3]:
# Load the complete Simple English Wikipedia dump into memory as one lxml tree.
# The encoding has to be passed by keyword: gzip.open's third positional
# parameter is `compresslevel`, so a positional 'utf-8' never reaches the text
# decoder and the file would be decoded with the platform default encoding.
with gzip.open('../data/simplewiki/simplewiki-20171103-pages-meta-current.xml.gz', 'rt', encoding='utf-8') as f:
    tree = etree.parse(f)

In [4]:
# Prefix -> namespace URI map handed to xpath() calls through `namespaces=`,
# so that paths can be written as 'mediawiki:page', 'mediawiki:title', ...
namespaces = {'mediawiki': 'http://www.mediawiki.org/xml/export-0.10/'}

In [5]:
# Spot-check one page of the dump. XPath positions are 1-based.
index = 200000
# Locate the page element once, then query its children relative to it instead
# of re-running the positional 'mediawiki:page[N]' lookup for every field.
page = tree.xpath('mediawiki:page[%d]' % index, namespaces = namespaces)[0]
redirect = page.xpath('mediawiki:redirect', namespaces = namespaces)
ns = page.xpath('mediawiki:ns', namespaces = namespaces)
title = page.xpath('mediawiki:title', namespaces = namespaces)[0].text
text = page.xpath('mediawiki:revision/mediawiki:text', namespaces = namespaces)[0].text
# Displayed as the cell output: (title, namespace id, list of <redirect> elements).
title, ns[0].text, redirect

('Shell (weapons)',
 '0',
 [<Element {http://www.mediawiki.org/xml/export-0.10/}redirect at 0x7f97bdab8a08>])

In [6]:
def parse(title, raw_text):
    """Turn one page's wikitext into plain text plus a table of link placeholders.

    The wikitext is parsed with ``wtp.parse`` and every wikilink, external
    link, template, tag, table, list, parser function and comment is recorded
    as a *region* ``(span, discard, target, text)``.  Regions are then walked
    in span order and the output is stitched together from the untouched text
    between regions plus, for each region:

    * nothing, when ``discard`` is true;
    * a placeholder token ``'{{N}}'`` when the region is a link with a target
      (the token is recorded in ``links``);
    * the region's display text otherwise.

    A region that starts before the end of the previously emitted region
    (i.e. one nested in or overlapping it) is skipped.

    Args:
        title: Page title; returned upper-cased.
        raw_text: The page's raw wikitext.

    Returns:
        A dict with keys ``'title'`` (upper-cased title), ``'text'`` (cleaned
        text containing ``'{{N}}'`` tokens) and ``'links'`` (token ->
        ``{'text': display text, 'target': upper-cased link target}``).
    """
    keep_tags = {'i','p','u','b','tt'}
    # Tags whose span is longer than this many characters are dropped whole.
    max_tag_length = 100
    wiki_text = wtp.parse(raw_text)
    regions = []

    def discard_all(nodes):
        # Record every node's span as text to be removed entirely.
        regions.extend((node.span, True, None, None) for node in nodes)

    for l in wiki_text.wikilinks:
        is_wiktionary = l.target.startswith('wikt:')
        # A 'prefix:' target is dropped together with its text, except for
        # 'wikt:' links, whose text is kept but which never become link tokens.
        # NOTE(review): this presumably also drops links to articles whose
        # title contains a colon - confirm whether that is intended.
        discard = bool(re.match(r'^[^:]+:', l.target)) and not is_wiktionary
        target = None if (discard or is_wiktionary) else l.target
        regions.append((l.span, discard, target, l.text or l.target))

    for l in wiki_text.external_links:
        # External links keep their label (or the URL) as plain text.
        regions.append((l.span, False, None, l.text or l.url))

    discard_all(wiki_text.templates)

    for t in wiki_text.tags():
        span = t.span
        discard, text = True, None
        if 0 < span[1] - span[0] <= max_tag_length:
            try:
                discard = t.name not in keep_tags
                text = t.contents
            except (TypeError, AttributeError):
                # The tag could not be inspected; drop it.
                discard, text = True, None
        regions.append((span, discard, None, text))

    discard_all(wiki_text.tables)
    discard_all(wiki_text.lists())
    discard_all(wiki_text.parser_functions)
    discard_all(wiki_text.comments)

    # Order by span only. Comparing whole tuples would fall through to the
    # discard/target/text fields when two spans are equal, and those fields mix
    # None, bool and str, which are not orderable. The sort is stable, so equal
    # spans keep their insertion order.
    regions.sort(key=lambda region: region[0])

    strs = []
    links = {}
    offset = 0
    source = wiki_text.string

    for (span, discard, target, text) in regions:
        if span[0] < offset:
            # Starts inside a region that was already handled.
            continue
        strs.append(source[offset:span[0]])
        if not discard:
            if target:
                token = '{{%d}}' % len(links)
                strs.append(token)
                links[token] = {'text': text, 'target': target.upper()}
            else:
                # A kept region may carry no text at all; emit nothing for it
                # rather than letting a None reach ''.join below.
                strs.append(text or '')
        offset = span[1]

    strs.append(source[offset:])

    text = ''.join(strs)

    text = re.sub(r"''+", '', text)              # bold / italic quote markers
    text = re.sub(r"==+[^=\n]+==+", '', text)    # section headings
    text = text.replace('&nbsp;', ' ')
    text = re.sub('\n\n+', '\n\n', text)         # collapse runs of blank lines
    text = text.strip()

    return {
        'title': title.upper(),
        'text': text,
        'links': links
    }

In [7]:
def parse_page_xml(page_xml):
    """Convert one ``<page>`` element of the dump into a parsed-page dict.

    Args:
        page_xml: An element exposing ``xpath(path, namespaces=...)``; its
            ``mediawiki:redirect``, ``mediawiki:ns``, ``mediawiki:title`` and
            ``mediawiki:revision/mediawiki:text`` children are read.

    Returns:
        * ``None`` when the page is skipped: namespace other than ``'0'``,
          a missing or empty namespace/title/text, or a redirect without a
          target title.
        * ``{'title': ..., 'redirect': ...}`` (both upper-cased) for a
          redirect page.
        * The result of ``parse(...)`` for a regular page.

    Raises:
        Exception: if any of the four child elements occurs more than once.
    """
    def at_most_one(path, label):
        # Fetch the matching children and reject ambiguous pages.
        found = page_xml.xpath(path, namespaces = namespaces)
        if len(found) > 1:
            raise Exception('found multiple %s' % label)
        return found

    redirects = at_most_one('mediawiki:redirect', 'redirects')
    ns = at_most_one('mediawiki:ns', 'namespaces')
    titles = at_most_one('mediawiki:title', 'titles')
    texts = at_most_one('mediawiki:revision/mediawiki:text', 'texts')

    # A page lacking one of these elements cannot be processed; skip it
    # instead of failing with IndexError on the [0] lookups below.
    if not ns or not titles or not texts:
        return None

    if ns[0].text != '0':
        return None
    if not texts[0].text or not titles[0].text:
        return None

    if redirects:
        # .get() so a <redirect> without a title attribute is skipped as well.
        target = redirects[0].attrib.get('title')
        if target:
            return {'title': titles[0].text.upper(), 'redirect': target.upper()}
        return None

    return parse(titles[0].text.upper(), texts[0].text)

In [8]:
def parse_dump_xml(dump_xml, limit = None):
    """Parse the pages of a dump into a dict keyed by page title.

    Args:
        dump_xml: Object exposing ``xpath(path, namespaces=...)`` that yields
            the ``mediawiki:page`` elements.
        limit: Optional maximum number of pages to read. A falsy value
            (``None`` or ``0``) means no limit.

    Returns:
        Dict mapping each parsed page's ``'title'`` to the page dict returned
        by ``parse_page_xml``. Pages for which ``parse_page_xml`` returns a
        falsy value are left out; a later page with the same title replaces
        an earlier one.
    """
    path = 'mediawiki:page'
    if limit:
        # XPath positions are 1-based, so `<=` keeps exactly `limit` pages
        # (`<` would stop one page short).
        path += '[position() <= %d]' % limit
    page_xmls = dump_xml.xpath(path, namespaces = namespaces)

    result = {}
    for page_xml in tqdm(page_xmls):
        parsed_page = parse_page_xml(page_xml)
        if parsed_page:
            result[parsed_page['title']] = parsed_page
    return result

In [9]:
# Parse every page of the dump (no limit); `parsed` maps page title -> page dict.
parsed = parse_dump_xml(tree)

100%|██████████| 437190/437190 [09:34<00:00, 761.08it/s]


In [13]:
def resolve_redirects(parsed, max_depth = 100):
    """Point every link at the page its redirect chain ends on (in place).

    For each link of each page that has a ``'links'`` table, the target is
    looked up in ``parsed`` and ``'redirect'`` entries are followed for at
    most ``max_depth`` hops. If the walk ends on an existing entry different
    from the one it started at, the link's ``'target'`` is replaced by that
    entry's ``'title'``. Links whose chain runs into a missing page are left
    unchanged.

    Args:
        parsed: Dict mapping title -> page dict; link dicts are mutated.
        max_depth: Maximum number of redirect hops to follow per link; bounds
            the walk when redirects form a cycle.

    Returns:
        None.
    """
    for page in tqdm(parsed.values()):
        if 'links' not in page:
            continue
        for link in page['links'].values():
            first_hop = parsed.get(link['target'])
            final = first_hop
            hops = 0
            while hops < max_depth and final and 'redirect' in final:
                final = parsed.get(final['redirect'])
                hops += 1
            if final and final is not first_hop:
                link['target'] = final['title']

In [14]:
# Rewrite link targets so they point past redirect pages; mutates `parsed` in place.
resolve_redirects(parsed)

100%|██████████| 177277/177277 [00:03<00:00, 58134.71it/s]


In [15]:
def remove_unresolved_links(parsed):
    """Replace dangling link tokens by their display text (in place).

    A link is dangling when its target is not a key of ``parsed`` or when the
    target entry is a redirect. For each such link the token is substituted
    by the link's text inside the page's ``'text'`` and the link is removed
    from the page's ``'links'`` table. Finally the number of resolved and
    unresolved links is printed.
    """
    resolved_count = 0
    unresolved_count = 0
    for _title, entry in tqdm(parsed.items()):
        if 'links' not in entry:
            continue
        # Collect first, substitute afterwards: the links table must not
        # change size while it is being iterated.
        dangling = {}
        for token, link in entry['links'].items():
            destination = parsed.get(link['target'])
            if destination and 'redirect' not in destination:
                resolved_count += 1
            else:
                unresolved_count += 1
                dangling[token] = link['text']
        for token, replacement in dangling.items():
            entry['text'] = entry['text'].replace(token, replacement)
            entry['links'].pop(token, None)
    print(resolved_count, unresolved_count)

In [17]:
# Turn links whose target is missing or a redirect back into plain text; mutates `parsed`.
remove_unresolved_links(parsed)

100%|██████████| 177277/177277 [00:01<00:00, 142776.45it/s]

1055666 0





In [18]:
def remove_redirect_pages(parsed):
    """Delete every entry carrying a 'redirect' key from ``parsed`` (in place).

    Prints how many entries were removed.
    """
    doomed = []
    # Gather the keys first; deleting while iterating the dict is not allowed.
    for key, entry in tqdm(parsed.items()):
        if 'redirect' in entry:
            doomed.append(key)
    for key in doomed:
        parsed.pop(key)
    print(len(doomed))

In [19]:
# Delete the redirect entries from `parsed` (in place); prints how many were removed.
remove_redirect_pages(parsed)

100%|██████████| 177277/177277 [00:00<00:00, 1001274.74it/s]

51330





In [22]:
def assign_ids(parsed):
    """Store a running integer under 'id' in every page dict (in place).

    Ids start at 0 and follow the iteration order of ``parsed``.
    """
    for next_id, (_title, entry) in enumerate(tqdm(parsed.items())):
        entry['id'] = next_id

In [23]:
# Add a sequential integer 'id' to each page dict in `parsed` (in place).
assign_ids(parsed)

100%|██████████| 125947/125947 [00:00<00:00, 842361.42it/s]


In [25]:
# Number of pages currently held in `parsed`.
len(parsed)

125947

In [26]:
# Write the processed pages as gzip-compressed JSON (text mode, UTF-8 encoded).
with gzip.open('../data/simplewiki/simplewiki-20171103.parsed.json.gz', 'wt', encoding='utf-8') as f:
    json.dump(parsed, f)

In [29]:
# Take the first page dict without copying every (title, page) pair into a list.
page = next(iter(parsed.values()))