In [1]:
import json
import gzip
import nltk
import re
from intervaltree import Interval, IntervalTree
from tqdm import tqdm_notebook

In [2]:
# Read the parsed simplewiki dump: a gzip-compressed JSON document decoded as UTF-8 text.
with gzip.open(
    '../data/simplewiki/simplewiki-20171103.parsed.json.gz',
    mode='rt',
    encoding='utf-8',
) as wiki_file:
    wiki = json.load(wiki_file)

In [51]:
# with open('../data/simplewiki/simplewiki-20171103.vocab_30k.txt', 'rt', encoding='utf-8') as f:
#     id_to_word_30k = [line.strip() for line in f]
# word_to_id_30k = dict((word, word_id) for word_id, word in enumerate(id_to_word_30k))

In [3]:
def normalize_text(text):
    """Return a canonical form of ``text``.

    Three transformations are applied, in this order:

    1. every run of whitespace characters is collapsed into a single space
       (leading/trailing whitespace is collapsed, not stripped);
    2. the result is lowercased;
    3. the two-character quote sequences `` and '' are each replaced by a
       single double-quote character.

    Args:
        text: the string to normalize.

    Returns:
        The normalized string.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    result = collapsed.lower()
    # Backtick pairs are rewritten first, then pairs of single quotes.
    for quote_pair in ("``", "''"):
        result = result.replace(quote_pair, '"')
    return result

def normalize_page(page):
    """Normalize a page's text and expand its ``{{<digits>}}`` link placeholders.

    The page text is first passed through ``normalize_text``. Each placeholder
    of the form ``{{<digits>}}`` found in the normalized text is then replaced
    by the normalized text of the link stored under that exact placeholder
    string in ``page['links']``, and the position of the inserted text is
    recorded.

    Args:
        page: mapping providing 'id', 'links' and 'text'. 'links' is indexed
            by the literal placeholder string (e.g. '{{3}}'); each link entry
            must provide 'text' and 'target'.

    Returns:
        A dict with:
            'id':    the page id, unchanged;
            'links': list of {'target', 'start', 'end'} dicts, in order of
                     appearance, where start/end are character offsets into
                     the returned text (end is exclusive);
            'text':  the normalized text with placeholders expanded.

    Raises:
        Whatever the lookup in ``page['links']`` raises for a placeholder that
        has no entry (``KeyError`` for a plain dict).
    """
    page_id = page['id']
    link_table = page['links']
    source_text = normalize_text(page['text'])

    pieces = []   # alternating plain-text / link-text fragments of the output
    spans = []    # one record per expanded placeholder
    consumed = 0  # how far into source_text we have copied
    emitted = 0   # length of the output text built so far

    # Matches are non-empty and non-overlapping, so walking them in order is
    # the same as repeatedly searching from the end of the previous match.
    for placeholder in re.finditer(r'\{\{\d+\}\}', source_text):
        link = link_table[placeholder.group()]

        # Plain text between the previous placeholder and this one.
        leading = source_text[consumed:placeholder.start()]
        pieces.append(leading)
        span_start = emitted + len(leading)

        # The link's own (normalized) text takes the placeholder's place.
        anchor_text = normalize_text(link['text'])
        pieces.append(anchor_text)
        span_end = span_start + len(anchor_text)

        spans.append({
            'target': link['target'],
            'start': span_start,
            'end': span_end,
        })

        emitted = span_end
        consumed = placeholder.end()

    # Whatever follows the last placeholder (or the whole text if none matched).
    pieces.append(source_text[consumed:])

    return {
        'id': page_id,
        'links': spans,
        'text': ''.join(pieces)
    }

In [4]:
# Normalize every page, keyed by title; the iteration is wrapped in
# tqdm_notebook for progress reporting.
norm_wiki = {
    title: normalize_page(page)
    for title, page in tqdm_notebook(wiki.items())
}




In [19]:
# Write the normalized pages out as gzip-compressed, UTF-8 encoded JSON
# (pretty-printed with an indent of 1).
with gzip.open(
    '../data/simplewiki/simplewiki-20171103.parsed.norm.json.gz',
    mode='wt',
    encoding='utf-8',
) as norm_file:
    json.dump(norm_wiki, norm_file, indent=1)