In [1]:
import json
import gzip
import nltk.data
import re
from tqdm import tqdm_notebook

In [2]:
# Read the gzip-compressed JSON dump into memory as `wiki`.
# parse_sentences() below iterates it as a dict of page records.
with gzip.open(
    '../data/simplewiki/simplewiki-20171103.parsed.json.gz',
    mode='rt',
    encoding='utf-8',
) as wiki_file:
    wiki = json.load(wiki_file)

In [3]:
# Module-level sentence splitter used by parse_sentences() via its .tokenize() method.
# NOTE(review): the resource path points at NLTK's pickled English Punkt model, so the
# 'punkt' data package presumably has to be installed for this cell to run -- confirm.
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [4]:
def paragraph_tokenize(text):
    """Split ``text`` into paragraphs, one per non-blank line.

    The text is cut on ``'\\n'`` only. Lines that are empty or contain
    nothing but whitespace are dropped; the lines that are kept are
    returned unchanged (they are not stripped).
    """
    paragraphs = []
    for line in text.split('\n'):
        if line.strip():
            paragraphs.append(line)
    return paragraphs

In [10]:
def _expand_links(sentence_text, page_links):
    """Substitute ``{{N}}`` placeholders in one sentence with their link text.

    Args:
        sentence_text: Sentence that may contain ``{{<digits>}}`` placeholders.
        page_links: Mapping from placeholder string (e.g. ``'{{3}}'``) to a
            dict with at least the keys ``'text'`` and ``'target'``.

    Returns:
        A ``(text, links)`` tuple. ``text`` is the sentence with every known
        placeholder replaced by its link text. ``links`` is a list of dicts
        with ``'start'``/``'finish'`` character offsets into ``text``
        (``finish`` is exclusive) and the link ``'target'``.
    """
    pieces = []
    links = []
    src_offset = 0   # read position in sentence_text
    out_length = 0   # number of characters emitted into `pieces` so far

    for match in re.finditer(r'\{\{\d+\}\}', sentence_text):
        link = page_links.get(match.group())
        if link is None:
            # Placeholder-looking text with no link entry: do not advance
            # src_offset, so the token is copied verbatim as part of the
            # next fragment instead of raising KeyError and aborting the run.
            continue

        fragment = sentence_text[src_offset:match.start()]
        link_text = link['text']
        link_start = out_length + len(fragment)
        link_finish = link_start + len(link_text)

        pieces.append(fragment)
        pieces.append(link_text)
        links.append({
            'start': link_start,
            'finish': link_finish,
            'target': link['target']
        })

        src_offset = match.end()
        out_length = link_finish

    # Whatever follows the last substituted placeholder.
    pieces.append(sentence_text[src_offset:])

    return ''.join(pieces), links


def parse_sentences(wiki):
    """Flatten a parsed wiki dump into a list of per-sentence records.

    Each page's text is split into paragraphs with ``paragraph_tokenize`` and
    each paragraph into sentences with the module-level ``sentence_tokenizer``.
    Link placeholders of the form ``{{N}}`` are then replaced by the link text
    and the position of each link in the resulting sentence is recorded.

    Args:
        wiki: Dict of page records. Only the values are used; each must have
            the keys ``'id'``, ``'text'`` and ``'links'``, where ``'links'``
            maps placeholder strings to dicts with ``'text'`` and ``'target'``.

    Returns:
        A list of dicts with the keys ``'page_id'``, ``'para_id'`` (index of
        the paragraph within the page), ``'sentence_id'`` (index of the
        sentence within the paragraph), ``'text'`` and ``'links'``.

    Raises:
        KeyError: If a page lacks ``'id'``/``'text'``/``'links'`` or a link
            entry lacks ``'text'``/``'target'``. A placeholder without an
            entry in ``'links'`` does not raise; it is kept as-is in the text.
    """
    result = []

    for page in tqdm_notebook(wiki.values()):
        page_id = page['id']
        page_links = page['links']

        for para_id, para_text in enumerate(paragraph_tokenize(page['text'])):
            # Sentence splitting runs on the text with placeholders still in it.
            for sentence_id, sentence_text in enumerate(sentence_tokenizer.tokenize(para_text)):
                text, links = _expand_links(sentence_text, page_links)

                result.append({
                    'page_id': page_id,
                    'para_id': para_id,
                    'sentence_id': sentence_id,
                    'text': text,
                    'links': links
                })

    return result

In [11]:
# Convert every page of the loaded dump into flat per-sentence records
# (page_id / para_id / sentence_id / text / links); shows a tqdm progress bar.
sentences = parse_sentences(wiki)




In [12]:
# Persist the sentence records as gzip-compressed, indented JSON.
with gzip.open(
    '../data/simplewiki/simplewiki-20171103.sentences.json.gz',
    mode='wt',
    encoding='utf-8',
) as sentences_file:
    json.dump(sentences, sentences_file, indent=1)