TODO: handle disambiguation pages

In [2]:
from lxml import etree
import lxml.html
import wikitextparser as wtp
import nltk.data
import re
import json
from collections import defaultdict

In [4]:
# Parse the simplewiki XML dump into an lxml ElementTree that later cells query with XPath.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
tree = etree.parse('c:\\Users\\achang\\Downloads\\simplewiki-20171103-pages-meta-current.xml')

In [5]:
# Prefix -> namespace URI map passed to every XPath query in this notebook, so that
# paths can be written as 'mediawiki:page', 'mediawiki:title', and so on.
namespaces = {
    'mediawiki': 'http://www.mediawiki.org/xml/export-0.10/'
}

In [175]:
# Scratch cell: inspect one page of the dump, selected by its 1-based XPath position.
index = 200000
page = tree.xpath(f'mediawiki:page[{index}]', namespaces = namespaces)[0]
# Query each field relative to `page` rather than re-scanning the tree for the same index.
redirect = page.xpath('mediawiki:redirect', namespaces = namespaces)
ns = page.xpath('mediawiki:ns', namespaces = namespaces)
title = page.xpath('mediawiki:title', namespaces = namespaces)[0].text
text = page.xpath('mediawiki:revision/mediawiki:text', namespaces = namespaces)[0].text
# Display the title, the namespace id and the (possibly empty) list of redirect elements.
title, ns[0].text, redirect

('Shell (weapons)',
 '0',
 [<Element {http://www.mediawiki.org/xml/export-0.10/}redirect at 0x1ff07018708>])

In [183]:
def parse(title, raw_text):
    """Turn one article's raw wikitext into plain text plus a table of internal links.

    The wikitext is parsed with wikitextparser and every markup region is
    recorded as a ``(span, discard, target, text)`` tuple:

    * wikilinks whose target has a ``prefix:`` are discarded, except ``wikt:``
      links, which keep their label as plain text without becoming a link;
      every other wikilink becomes a link placeholder;
    * external links are replaced by their label (or URL when unlabelled);
    * tags listed in ``keep_tags`` and at most 100 characters long are replaced
      by their contents; all other tags are discarded;
    * templates, tables, lists, parser functions and comments are discarded.

    Regions are then walked left to right. A region that starts inside one that
    was already handled is skipped, so the outermost region decides what
    happens to everything nested in it.

    NOTE(review): the ``prefix:`` test matches any target containing a colon
    after its first character, so article titles that themselves contain a
    colon are dropped together with their label — confirm this is intended.

    Parameters:
        title: page title; returned upper-cased.
        raw_text: the page's wikitext source.

    Returns:
        A dict with
        ``'title'`` – ``title.upper()``;
        ``'text'``  – the cleaned text, in which each kept wikilink is replaced
                      by a placeholder token ``{{0}}``, ``{{1}}``, ...;
        ``'links'`` – ``{token: {'text': label, 'target': TARGET_UPPERCASED}}``.
    """
    keep_tags = {'i','p','u','b','tt'}
    max_tag_length = 100
    wiki_text = wtp.parse(raw_text)
    regions = []

    def drop_all(nodes):
        # Record every node of a markup kind that is always removed from the text.
        regions.extend((node.span, True, None, None) for node in nodes)

    for link in wiki_text.wikilinks:
        link_target = link.target
        is_wiktionary = link_target.startswith('wikt:')
        has_prefix = re.match(r'^[^:]+:', link_target) is not None
        # Keep `discard` a real bool (the match object / None used to leak into the tuple).
        discard = has_prefix and not is_wiktionary
        target = None if (discard or is_wiktionary) else link_target
        regions.append((link.span, discard, target, link.text or link_target))

    for link in wiki_text.external_links:
        regions.append((link.span, False, None, link.text or link.url))

    drop_all(wiki_text.templates)

    for tag in wiki_text.tags():
        span = tag.span
        length = span[1] - span[0]
        discard, text = True, None
        if 0 < length <= max_tag_length:
            try:
                discard = tag.name not in keep_tags
                text = tag.contents
            except (TypeError, AttributeError):
                # A tag whose name/contents cannot be read is treated as removable.
                discard, text = True, None
        regions.append((span, discard, None, text))

    drop_all(wiki_text.tables)
    drop_all(wiki_text.lists())
    drop_all(wiki_text.parser_functions)
    drop_all(wiki_text.comments)

    # Sort on the span only. Sorting the whole tuples would compare the
    # None/bool/str fields whenever two spans are equal and raise TypeError.
    # For regions starting at the same offset the longer (outer) one goes first,
    # consistent with "outermost region wins"; exact ties keep insertion order.
    regions.sort(key=lambda region: (region[0][0], -region[0][1]))

    source = wiki_text.string
    pieces = []
    links = {}
    offset = 0
    for span, discard, target, text in regions:
        if span[0] < offset:
            # Nested inside a region that has already been replaced or removed.
            continue
        pieces.append(source[offset:span[0]])
        if not discard:
            if target:
                token = '{{%d}}' % len(links)
                pieces.append(token)
                links[token] = {'text': text, 'target': target.upper()}
            else:
                # A kept tag may have no contents; never hand None to ''.join.
                pieces.append(text or '')
        offset = span[1]
    pieces.append(source[offset:])

    text = ''.join(pieces)

    text = re.sub(r"''+", '', text)               # bold / italic quote markup
    text = re.sub(r"==+[^=\n]+==+", '', text)     # section headings
    text = re.sub('&nbsp;', ' ', text)
    text = re.sub('\n\n+', '\n\n', text)          # collapse runs of blank lines
    text = text.strip()

    return {
        'title': title.upper(),
        'text': text,
        'links': links
    }

In [185]:
def parse_page_xml(page_xml):
    """Convert one ``<page>`` element of the dump into a page dict.

    Parameters:
        page_xml: an element supporting ``xpath(path, namespaces=...)``.

    Returns:
        * ``None`` when the page is not in namespace ``'0'``, when its title or
          text is missing or empty, or when it is a redirect without a target;
        * ``{'title': TITLE, 'redirect': TARGET}`` (both upper-cased) for a
          redirect page;
        * otherwise the dict returned by ``parse`` for the page's wikitext.

    Raises:
        Exception: if the page holds more than one redirect, namespace, title
            or text element.
    """
    def single(path, label):
        # Return the only element matching `path`, or None if there is none.
        found = page_xml.xpath(path, namespaces = namespaces)
        if len(found) > 1:
            raise Exception(f'found multiple {label}')
        return found[0] if found else None

    redirect = single('mediawiki:redirect', 'redirects')
    ns = single('mediawiki:ns', 'namespaces')
    title = single('mediawiki:title', 'titles')
    text = single('mediawiki:revision/mediawiki:text', 'texts')

    # A page lacking any of these elements is skipped rather than crashing the
    # whole run with an IndexError. Compare with `is None`: the truth value of
    # an lxml element reflects its children, not its presence.
    if ns is None or title is None or text is None:
        return None

    if ns.text != '0':
        return None
    if not text.text or not title.text:
        return None

    if redirect is not None:
        # .get() so that a redirect element without a title attribute is skipped.
        target = redirect.attrib.get('title')
        if target:
            return {'title': title.text.upper(), 'redirect': target.upper()}
        return None

    return parse(title.text.upper(), text.text)

In [186]:
def parse_dump_xml(dump_xml, limit = None):
    """Parse the pages of a dump into a dict keyed by page title.

    Parameters:
        dump_xml: the parsed dump; must support ``xpath(path, namespaces=...)``.
        limit: maximum number of ``<page>`` elements to read, counted from the
            start of the dump. ``None`` (the default) reads every page.

    Returns:
        ``{page['title']: page}`` for every page that ``parse_page_xml`` did not
        skip. A later page with the same 'title' value replaces an earlier one.

    Prints a progress line every 100 pages.
    """
    path = 'mediawiki:page'
    if limit is not None:
        # `<=`: XPath positions are 1-based, so `position() < limit` would
        # return only limit - 1 pages.
        path += f'[position() <= {limit}]'
    page_xmls = dump_xml.xpath(path, namespaces = namespaces)

    result = {}
    for i, page_xml in enumerate(page_xmls):
        if i % 100 == 0:
            print(f'parsed {i} pages...')
        parsed_page = parse_page_xml(page_xml)
        if parsed_page:
            result[parsed_page['title']] = parsed_page
    return result

In [189]:
# Parse every page of the dump (no limit); a progress line is printed every 100 pages.
parsed = parse_dump_xml(tree)

parsed 0 pages...
parsed 100 pages...
parsed 200 pages...
parsed 300 pages...
parsed 400 pages...
parsed 500 pages...
parsed 600 pages...
parsed 700 pages...
parsed 800 pages...
parsed 900 pages...
parsed 1000 pages...
parsed 1100 pages...
parsed 1200 pages...
parsed 1300 pages...
parsed 1400 pages...
parsed 1500 pages...
parsed 1600 pages...
parsed 1700 pages...
parsed 1800 pages...
parsed 1900 pages...
parsed 2000 pages...
parsed 2100 pages...
parsed 2200 pages...
parsed 2300 pages...
parsed 2400 pages...
parsed 2500 pages...
parsed 2600 pages...
parsed 2700 pages...
parsed 2800 pages...
parsed 2900 pages...
parsed 3000 pages...
parsed 3100 pages...
parsed 3200 pages...
parsed 3300 pages...
parsed 3400 pages...
parsed 3500 pages...
parsed 3600 pages...
parsed 3700 pages...
parsed 3800 pages...
parsed 3900 pages...
parsed 4000 pages...
parsed 4100 pages...
parsed 4200 pages...
parsed 4300 pages...
parsed 4400 pages...
parsed 4500 pages...
parsed 4600 pages...
parsed 4700 pages...
pars

parsed 37900 pages...
parsed 38000 pages...
parsed 38100 pages...
parsed 38200 pages...
parsed 38300 pages...
parsed 38400 pages...
parsed 38500 pages...
parsed 38600 pages...
parsed 38700 pages...
parsed 38800 pages...
parsed 38900 pages...
parsed 39000 pages...
parsed 39100 pages...
parsed 39200 pages...
parsed 39300 pages...
parsed 39400 pages...
parsed 39500 pages...
parsed 39600 pages...
parsed 39700 pages...
parsed 39800 pages...
parsed 39900 pages...
parsed 40000 pages...
parsed 40100 pages...
parsed 40200 pages...
parsed 40300 pages...
parsed 40400 pages...
parsed 40500 pages...
parsed 40600 pages...
parsed 40700 pages...
parsed 40800 pages...
parsed 40900 pages...
parsed 41000 pages...
parsed 41100 pages...
parsed 41200 pages...
parsed 41300 pages...
parsed 41400 pages...
parsed 41500 pages...
parsed 41600 pages...
parsed 41700 pages...
parsed 41800 pages...
parsed 41900 pages...
parsed 42000 pages...
parsed 42100 pages...
parsed 42200 pages...
parsed 42300 pages...
parsed 424

parsed 75200 pages...
parsed 75300 pages...
parsed 75400 pages...
parsed 75500 pages...
parsed 75600 pages...
parsed 75700 pages...
parsed 75800 pages...
parsed 75900 pages...
parsed 76000 pages...
parsed 76100 pages...
parsed 76200 pages...
parsed 76300 pages...
parsed 76400 pages...
parsed 76500 pages...
parsed 76600 pages...
parsed 76700 pages...
parsed 76800 pages...
parsed 76900 pages...
parsed 77000 pages...
parsed 77100 pages...
parsed 77200 pages...
parsed 77300 pages...
parsed 77400 pages...
parsed 77500 pages...
parsed 77600 pages...
parsed 77700 pages...
parsed 77800 pages...
parsed 77900 pages...
parsed 78000 pages...
parsed 78100 pages...
parsed 78200 pages...
parsed 78300 pages...
parsed 78400 pages...
parsed 78500 pages...
parsed 78600 pages...
parsed 78700 pages...
parsed 78800 pages...
parsed 78900 pages...
parsed 79000 pages...
parsed 79100 pages...
parsed 79200 pages...
parsed 79300 pages...
parsed 79400 pages...
parsed 79500 pages...
parsed 79600 pages...
parsed 797

parsed 112100 pages...
parsed 112200 pages...
parsed 112300 pages...
parsed 112400 pages...
parsed 112500 pages...
parsed 112600 pages...
parsed 112700 pages...
parsed 112800 pages...
parsed 112900 pages...
parsed 113000 pages...
parsed 113100 pages...
parsed 113200 pages...
parsed 113300 pages...
parsed 113400 pages...
parsed 113500 pages...
parsed 113600 pages...
parsed 113700 pages...
parsed 113800 pages...
parsed 113900 pages...
parsed 114000 pages...
parsed 114100 pages...
parsed 114200 pages...
parsed 114300 pages...
parsed 114400 pages...
parsed 114500 pages...
parsed 114600 pages...
parsed 114700 pages...
parsed 114800 pages...
parsed 114900 pages...
parsed 115000 pages...
parsed 115100 pages...
parsed 115200 pages...
parsed 115300 pages...
parsed 115400 pages...
parsed 115500 pages...
parsed 115600 pages...
parsed 115700 pages...
parsed 115800 pages...
parsed 115900 pages...
parsed 116000 pages...
parsed 116100 pages...
parsed 116200 pages...
parsed 116300 pages...
parsed 1164

parsed 147900 pages...
parsed 148000 pages...
parsed 148100 pages...
parsed 148200 pages...
parsed 148300 pages...
parsed 148400 pages...
parsed 148500 pages...
parsed 148600 pages...
parsed 148700 pages...
parsed 148800 pages...
parsed 148900 pages...
parsed 149000 pages...
parsed 149100 pages...
parsed 149200 pages...
parsed 149300 pages...
parsed 149400 pages...
parsed 149500 pages...
parsed 149600 pages...
parsed 149700 pages...
parsed 149800 pages...
parsed 149900 pages...
parsed 150000 pages...
parsed 150100 pages...
parsed 150200 pages...
parsed 150300 pages...
parsed 150400 pages...
parsed 150500 pages...
parsed 150600 pages...
parsed 150700 pages...
parsed 150800 pages...
parsed 150900 pages...
parsed 151000 pages...
parsed 151100 pages...
parsed 151200 pages...
parsed 151300 pages...
parsed 151400 pages...
parsed 151500 pages...
parsed 151600 pages...
parsed 151700 pages...
parsed 151800 pages...
parsed 151900 pages...
parsed 152000 pages...
parsed 152100 pages...
parsed 1522

parsed 183600 pages...
parsed 183700 pages...
parsed 183800 pages...
parsed 183900 pages...
parsed 184000 pages...
parsed 184100 pages...
parsed 184200 pages...
parsed 184300 pages...
parsed 184400 pages...
parsed 184500 pages...
parsed 184600 pages...
parsed 184700 pages...
parsed 184800 pages...
parsed 184900 pages...
parsed 185000 pages...
parsed 185100 pages...
parsed 185200 pages...
parsed 185300 pages...
parsed 185400 pages...
parsed 185500 pages...
parsed 185600 pages...
parsed 185700 pages...
parsed 185800 pages...
parsed 185900 pages...
parsed 186000 pages...
parsed 186100 pages...
parsed 186200 pages...
parsed 186300 pages...
parsed 186400 pages...
parsed 186500 pages...
parsed 186600 pages...
parsed 186700 pages...
parsed 186800 pages...
parsed 186900 pages...
parsed 187000 pages...
parsed 187100 pages...
parsed 187200 pages...
parsed 187300 pages...
parsed 187400 pages...
parsed 187500 pages...
parsed 187600 pages...
parsed 187700 pages...
parsed 187800 pages...
parsed 1879

parsed 219500 pages...
parsed 219600 pages...
parsed 219700 pages...
parsed 219800 pages...
parsed 219900 pages...
parsed 220000 pages...
parsed 220100 pages...
parsed 220200 pages...
parsed 220300 pages...
parsed 220400 pages...
parsed 220500 pages...
parsed 220600 pages...
parsed 220700 pages...
parsed 220800 pages...
parsed 220900 pages...
parsed 221000 pages...
parsed 221100 pages...
parsed 221200 pages...
parsed 221300 pages...
parsed 221400 pages...
parsed 221500 pages...
parsed 221600 pages...
parsed 221700 pages...
parsed 221800 pages...
parsed 221900 pages...
parsed 222000 pages...
parsed 222100 pages...
parsed 222200 pages...
parsed 222300 pages...
parsed 222400 pages...
parsed 222500 pages...
parsed 222600 pages...
parsed 222700 pages...
parsed 222800 pages...
parsed 222900 pages...
parsed 223000 pages...
parsed 223100 pages...
parsed 223200 pages...
parsed 223300 pages...
parsed 223400 pages...
parsed 223500 pages...
parsed 223600 pages...
parsed 223700 pages...
parsed 2238

parsed 255200 pages...
parsed 255300 pages...
parsed 255400 pages...
parsed 255500 pages...
parsed 255600 pages...
parsed 255700 pages...
parsed 255800 pages...
parsed 255900 pages...
parsed 256000 pages...
parsed 256100 pages...
parsed 256200 pages...
parsed 256300 pages...
parsed 256400 pages...
parsed 256500 pages...
parsed 256600 pages...
parsed 256700 pages...
parsed 256800 pages...
parsed 256900 pages...
parsed 257000 pages...
parsed 257100 pages...
parsed 257200 pages...
parsed 257300 pages...
parsed 257400 pages...
parsed 257500 pages...
parsed 257600 pages...
parsed 257700 pages...
parsed 257800 pages...
parsed 257900 pages...
parsed 258000 pages...
parsed 258100 pages...
parsed 258200 pages...
parsed 258300 pages...
parsed 258400 pages...
parsed 258500 pages...
parsed 258600 pages...
parsed 258700 pages...
parsed 258800 pages...
parsed 258900 pages...
parsed 259000 pages...
parsed 259100 pages...
parsed 259200 pages...
parsed 259300 pages...
parsed 259400 pages...
parsed 2595

parsed 291100 pages...
parsed 291200 pages...
parsed 291300 pages...
parsed 291400 pages...
parsed 291500 pages...
parsed 291600 pages...
parsed 291700 pages...
parsed 291800 pages...
parsed 291900 pages...
parsed 292000 pages...
parsed 292100 pages...
parsed 292200 pages...
parsed 292300 pages...
parsed 292400 pages...
parsed 292500 pages...
parsed 292600 pages...
parsed 292700 pages...
parsed 292800 pages...
parsed 292900 pages...
parsed 293000 pages...
parsed 293100 pages...
parsed 293200 pages...
parsed 293300 pages...
parsed 293400 pages...
parsed 293500 pages...
parsed 293600 pages...
parsed 293700 pages...
parsed 293800 pages...
parsed 293900 pages...
parsed 294000 pages...
parsed 294100 pages...
parsed 294200 pages...
parsed 294300 pages...
parsed 294400 pages...
parsed 294500 pages...
parsed 294600 pages...
parsed 294700 pages...
parsed 294800 pages...
parsed 294900 pages...
parsed 295000 pages...
parsed 295100 pages...
parsed 295200 pages...
parsed 295300 pages...
parsed 2954

parsed 326800 pages...
parsed 326900 pages...
parsed 327000 pages...
parsed 327100 pages...
parsed 327200 pages...
parsed 327300 pages...
parsed 327400 pages...
parsed 327500 pages...
parsed 327600 pages...
parsed 327700 pages...
parsed 327800 pages...
parsed 327900 pages...
parsed 328000 pages...
parsed 328100 pages...
parsed 328200 pages...
parsed 328300 pages...
parsed 328400 pages...
parsed 328500 pages...
parsed 328600 pages...
parsed 328700 pages...
parsed 328800 pages...
parsed 328900 pages...
parsed 329000 pages...
parsed 329100 pages...
parsed 329200 pages...
parsed 329300 pages...
parsed 329400 pages...
parsed 329500 pages...
parsed 329600 pages...
parsed 329700 pages...
parsed 329800 pages...
parsed 329900 pages...
parsed 330000 pages...
parsed 330100 pages...
parsed 330200 pages...
parsed 330300 pages...
parsed 330400 pages...
parsed 330500 pages...
parsed 330600 pages...
parsed 330700 pages...
parsed 330800 pages...
parsed 330900 pages...
parsed 331000 pages...
parsed 3311

parsed 362600 pages...
parsed 362700 pages...
parsed 362800 pages...
parsed 362900 pages...
parsed 363000 pages...
parsed 363100 pages...
parsed 363200 pages...
parsed 363300 pages...
parsed 363400 pages...
parsed 363500 pages...
parsed 363600 pages...
parsed 363700 pages...
parsed 363800 pages...
parsed 363900 pages...
parsed 364000 pages...
parsed 364100 pages...
parsed 364200 pages...
parsed 364300 pages...
parsed 364400 pages...
parsed 364500 pages...
parsed 364600 pages...
parsed 364700 pages...
parsed 364800 pages...
parsed 364900 pages...
parsed 365000 pages...
parsed 365100 pages...
parsed 365200 pages...
parsed 365300 pages...
parsed 365400 pages...
parsed 365500 pages...
parsed 365600 pages...
parsed 365700 pages...
parsed 365800 pages...
parsed 365900 pages...
parsed 366000 pages...
parsed 366100 pages...
parsed 366200 pages...
parsed 366300 pages...
parsed 366400 pages...
parsed 366500 pages...
parsed 366600 pages...
parsed 366700 pages...
parsed 366800 pages...
parsed 3669

parsed 398300 pages...
parsed 398400 pages...
parsed 398500 pages...
parsed 398600 pages...
parsed 398700 pages...
parsed 398800 pages...
parsed 398900 pages...
parsed 399000 pages...
parsed 399100 pages...
parsed 399200 pages...
parsed 399300 pages...
parsed 399400 pages...
parsed 399500 pages...
parsed 399600 pages...
parsed 399700 pages...
parsed 399800 pages...
parsed 399900 pages...
parsed 400000 pages...
parsed 400100 pages...
parsed 400200 pages...
parsed 400300 pages...
parsed 400400 pages...
parsed 400500 pages...
parsed 400600 pages...
parsed 400700 pages...
parsed 400800 pages...
parsed 400900 pages...
parsed 401000 pages...
parsed 401100 pages...
parsed 401200 pages...
parsed 401300 pages...
parsed 401400 pages...
parsed 401500 pages...
parsed 401600 pages...
parsed 401700 pages...
parsed 401800 pages...
parsed 401900 pages...
parsed 402000 pages...
parsed 402100 pages...
parsed 402200 pages...
parsed 402300 pages...
parsed 402400 pages...
parsed 402500 pages...
parsed 4026

parsed 434200 pages...
parsed 434300 pages...
parsed 434400 pages...
parsed 434500 pages...
parsed 434600 pages...
parsed 434700 pages...
parsed 434800 pages...
parsed 434900 pages...
parsed 435000 pages...
parsed 435100 pages...
parsed 435200 pages...
parsed 435300 pages...
parsed 435400 pages...
parsed 435500 pages...
parsed 435600 pages...
parsed 435700 pages...
parsed 435800 pages...
parsed 435900 pages...
parsed 436000 pages...
parsed 436100 pages...
parsed 436200 pages...
parsed 436300 pages...
parsed 436400 pages...
parsed 436500 pages...
parsed 436600 pages...
parsed 436700 pages...
parsed 436800 pages...
parsed 436900 pages...
parsed 437000 pages...
parsed 437100 pages...


In [209]:
def resolve_redirects(parsed):
    """Retarget, in place, every link that points at a redirect page.

    For each link the redirect chain is followed for at most 100 hops. If the
    walk ends on an existing page other than the one the link started from,
    the link's 'target' is replaced by that page's title and a
    ``redirecting OLD -> NEW`` line is printed. Links whose target is unknown,
    or whose chain leads to a missing page, are left as they are.

    Parameters:
        parsed: dict of page dicts keyed by title; modified in place.
    """
    max_hops = 100
    for page in parsed.values():
        if 'links' not in page:
            continue
        for link in page['links'].values():
            first = parsed.get(link['target'])
            final = first
            for _ in range(max_hops):
                if not final or 'redirect' not in final:
                    break
                final = parsed.get(final['redirect'])
            if final and final is not first:
                print(f'redirecting {first["title"]} -> {final["title"]}')
                link['target'] = final['title']

In [211]:
# Retarget links that point at redirect pages; `parsed` is modified in place.
resolve_redirects(parsed)

In [224]:
def remove_unresolved_links(parsed):
    """Replace links that lead nowhere with their plain label, in place.

    A link is unresolved when its target is not a key of ``parsed`` or is
    still a redirect page. For each such link the placeholder token in the
    page's 'text' is replaced by the link's label and the entry is removed
    from the page's 'links'. Finally the number of resolved and unresolved
    links is printed.

    Parameters:
        parsed: dict of page dicts keyed by title; modified in place.
    """
    kept = 0
    dropped = 0
    for page in parsed.values():
        if 'links' not in page:
            continue
        links = page['links']
        # Collect first, rewrite afterwards: the dict cannot shrink while iterated.
        dead = {}
        for token, link in links.items():
            destination = parsed.get(link['target'])
            if destination and 'redirect' not in destination:
                kept += 1
            else:
                dropped += 1
                dead[token] = link['text']
        for token, label in dead.items():
            page['text'] = page['text'].replace(token, label)
            links.pop(token, None)
    print(kept, dropped)

In [226]:
# Turn links without a usable target back into plain text; prints "<resolved> <unresolved>".
remove_unresolved_links(parsed)

1055666 0


In [7]:
def remove_redirect_pages(parsed):
    """Delete every redirect entry from ``parsed`` in place and print how many were removed.

    Parameters:
        parsed: dict of page dicts keyed by title; entries containing a
            'redirect' key are deleted.
    """
    # Titles are gathered up front because a dict cannot change size while iterated.
    doomed = []
    for title, page in parsed.items():
        if 'redirect' in page:
            doomed.append(title)
    for title in doomed:
        del parsed[title]
    print(len(doomed))

In [6]:
# Delete the redirect entries from `parsed` in place; prints how many were removed.
# NOTE(review): this cell's execution count (6) precedes that of the defining cell (7) —
# confirm the intended run order before relying on Restart & Run All.
remove_redirect_pages(parsed)

51330


In [227]:
# Write the pages dict to disk as JSON (indent of 1).
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
with open('c:\\Users\\achang\\Downloads\\simplewiki-20171103.json', 'w') as f:
    json.dump(parsed, f, indent = 1)

In [3]:
# Load the pages dict back from the JSON file; this rebinds `parsed`.
with open('C:\\Users\\achang\\Downloads\\simplewiki-20171103.json') as f:
    parsed = json.load(f)

In [233]:
def count_links(parsed):
    """Count how many links point at each target across all pages.

    Parameters:
        parsed: dict of page dicts keyed by title; pages without a 'links'
            entry are ignored.

    Returns:
        A ``defaultdict(int)`` mapping each link target to its number of
        occurrences.
    """
    tally = defaultdict(int)
    pages_with_links = (page for page in parsed.values() if 'links' in page)
    for page in pages_with_links:
        for link in page['links'].values():
            tally[link['target']] += 1
    return tally

In [237]:
# List of (target, count) pairs ordered from the most-linked target to the least-linked.
link_counts = sorted(count_links(parsed).items(), key = lambda x: x[1], reverse = True)

In [240]:
# Peek at the entry at index 2000 of the ranking (the 2001st most-linked target).
link_counts[2000]

('NEW YORK YANKEES', 86)

In [10]:
# Load the English Punkt tokenizer resource through nltk.data.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [11]:
# Split one article's text into sentences.
# NOTE(review): `parsed_text` is not assigned in any visible cell; the output suggests it
# held a single page's 'text' value — TODO confirm and define it explicitly before this cell.
tokenizer.tokenize(parsed_text)

['Devon is a {{0}} in southwest {{1}}.',
 'Sometimes, it is called Devonshire although this is not its correct name.',
 'Devon is the second largest county in England, and has the longest road network of any county in England.',
 'After the last {{2}}, Devon was one of the first places in England where people started to live.',
 '{{3}}s have found many old places in Devon with ancient buildings.',
 'For example, many ruins of old buildings have been found in an area called "{{4}}", which is now a {{5}}.',
 'Devon gets its name from the Dumnonii, a name that the invading {{6}} gave to the Celtic tribe in that area.',
 'The Romans invaded Devon about {{7}} 50.',
 'The name Dumnonii means "a person who lives in a deep {{8}}", and it comes from the {{9}}s and valleys of the area.',
 'The Roman {{10}} stayed in Devon for about 25 years.',
 'Their {{11}} was in the city of {{12}}.',
 'It was a long time before anyone else invaded Devon.',
 '{{13}} came to Devon in the {{14}}, and the King of