### Parse Wikipedia

In [1]:
from time import time
from os.path import join
import csv
import locale
locale.setlocale(locale.LC_ALL, '')

from constants import DATA_BASE

# Warning: The xml.etree.ElementTree module is not secure against maliciously constructed data. 
# If you need to parse untrusted or unauthenticated data see XML vulnerabilities 
# (https://docs.python.org/3/library/xml.html#xml-vulnerabilities)
import xml.etree.ElementTree as ET

In [24]:
# Locations of the Wikipedia dump (input) and of the CSV files derived from it.
# Everything lives in the 'dewiki' directory below DATA_BASE.
PATH_WIKI_XML = join(DATA_BASE, 'dewiki')
FILENAME_WIKI = 'dewiki-latest-pages-articles.xml'
FILENAME_ARTICLES = 'articles.csv'
# NOTE(review): FILENAME_REDIRECT and FILENAME_TEMPLATE were referenced below
# but never defined, so this cell raised NameError on a fresh kernel. The two
# values are assumptions that follow the 'articles*.csv' naming -- confirm the
# intended file names.
FILENAME_REDIRECT = 'articles_redirect.csv'
FILENAME_TEMPLATE = 'articles_template.csv'

pathWikiXML = join(PATH_WIKI_XML, FILENAME_WIKI)
pathArticles = join(PATH_WIKI_XML, FILENAME_ARTICLES)
pathArticlesRedirect = join(PATH_WIKI_XML, FILENAME_REDIRECT)
pathTemplateRedirect = join(PATH_WIKI_XML, FILENAME_TEMPLATE)

In [38]:
def hms_string(sec_elapsed):
    """Format a duration given in seconds as ``H:MM:SS.ss``.

    Hours are not padded, minutes are zero-padded to two digits and seconds
    are shown with two decimals, zero-padded to a width of five characters.
    """
    seconds_per_hour = 60 * 60
    hours = int(sec_elapsed / seconds_per_hour)
    minutes = int((sec_elapsed % seconds_per_hour) / 60)
    seconds = sec_elapsed % 60
    pieces = [
        str(hours),
        format(minutes, ">02"),
        format(seconds, ">05.2f"),
    ]
    return ":".join(pieces)

def strip_tag(tag):
    """Return the part of ``tag`` after its first ``}``.

    ElementTree reports namespaced tags as ``{uri}name``; this yields ``name``.
    A tag without any ``}`` is returned unchanged.
    """
    _, closing_brace, local_name = tag.partition('}')
    if closing_brace:
        return local_name
    return tag

def split_title(title):
    """Split a page title at its first ``(`` that is not the very first character.

    Returns a ``(title, subtitle)`` tuple. The subtitle starts at the
    parenthesis and runs to the end of the string; everything before it --
    including any space in front of the parenthesis -- stays in the title.
    When there is no such parenthesis the subtitle is ``''``.
    """
    paren_at = title.find('(', 1)
    if paren_at < 1:
        # Not found; a "(" at index 0 is deliberately not a split point.
        return title, ''
    return title[:paren_at], title[paren_at:]


In [144]:
import re
from w3lib.html import remove_tags, remove_tags_with_content
from html import unescape

# Innermost template call: "{{ ... }}" with no braces inside (may span lines).
# parse_text applies it repeatedly, so nested templates are peeled from the
# inside out. The previous greedy pattern removed everything between the first
# "{{" and the last "}}" of an article, i.e. usually the article body itself.
box = re.compile(r"\{\{[^{}]*\}\}")
# Embedded file, "[[Datei:...]]". The caption may contain "[[wiki links]]" or
# "[external links]" one level deep; the match ends at the closing "]]" that
# belongs to the embed instead of at the last "]]" in the text.
datei = re.compile(r"\[\[Datei:(?:[^\[\]]|\[\[[^\[\]]*\]\]|\[[^\[\]]*\])*\]\]")
# Runs of two or more quotes / opening brackets / closing brackets are dropped,
# which keeps the text they enclose.
remove1 = r"|".join([r"\'{2,}", r"\[{2,}" , r"\]{2,}"])
# Heading markers ("\n== " and " ==\n") are replaced by a plain newline.
remove2 = r"|".join([r"\n={2,} ", r" ={2,}\n"])
re1 = re.compile(remove1)
re2 = re.compile(remove2)


def parse_text(text):
    """Reduce raw wikitext to a short plain-text excerpt.

    Steps, in order: unescape HTML entities, drop ``<ref>`` elements together
    with their content, strip all remaining tags, remove templates
    (``{{...}}``, including nested ones) and ``[[Datei:...]]`` embeds, strip
    quote/bracket markup and heading markers.

    Parameters
    ----------
    text : str or None
        Wikitext of one page revision. ``None`` (an empty ``<text/>`` element)
        and ``''`` are accepted.

    Returns
    -------
    str
        At most the first 1000 characters of the cleaned text; ``''`` when
        there is no input text.
    """
    if not text:
        # html.unescape raises TypeError on None; there is nothing to clean.
        return ''
    text = unescape(text)
    text = remove_tags(remove_tags_with_content(text, ('ref',)))
    # Each pass removes the current innermost templates; repeat until a pass
    # finds nothing so that templates nested in templates disappear as well.
    removed = 1
    while removed:
        text, removed = box.subn('', text)
    text = datei.sub('', text)
    text = re1.sub('', text)
    text = re2.sub('\n', text)
    return text[:1000]

In [146]:
# Stream the Wikipedia XML dump and write one CSV row per article page.
# A page counts as an article when its namespace is 0, its title does not start
# with 'Liste von' and -- unless it is a redirect -- its parsed text is non-empty.
MAX_ARTICLES = 1000    # stop after this many written rows (sample-run limit)
PROGRESS_EVERY = 100   # print the running count every N written rows

articleCount = 0
row = None
root = None
is_article = False
is_liste = False

print('Starting ...')

t0 = time()

# The dump is read as bytes so the XML parser decodes it according to the
# document itself instead of the platform's default text encoding.
# newline='' is what the csv module requires of its output file (the text
# column contains embedded newlines); UTF-8 is set explicitly so that writing
# does not depend on the locale's default encoding.
with open(pathWikiXML, 'rb') as fr, \
        open(pathArticles, 'w', newline='', encoding='utf-8') as fw:

    fields = ['id', 'title', 'subtitle', 'text', 'timestamp', 'categories', 'redirect']
    writer = csv.DictWriter(fw, fields, quoting=csv.QUOTE_MINIMAL)
    writer.writeheader()

    for event, elem in ET.iterparse(fr, events=['start', 'end']):
        tag = strip_tag(elem.tag)

        if event == 'start':
            # The very first start event belongs to the document root.
            if root is None:
                root = elem
            # start new row
            if tag == 'page':
                row = dict()
        else:
            if tag == 'ns' and int(elem.text) == 0 and not is_liste:
                is_article = True
            elif tag == 'redirect':
                row[tag] = elem.get('title')
            elif tag == 'timestamp':
                row[tag] = elem.text
            elif tag == 'id' and tag not in row:
                # Only the first <id> inside a page is kept; later <id>
                # elements of the same page are ignored.
                row[tag] = elem.text
            elif tag == 'title':
                title = elem.text or ''
                if title.startswith('Liste von'):
                    is_liste = True
                row[tag], row['subtitle'] = split_title(title)
            elif tag == 'text' and 'redirect' not in row:
                # An empty <text/> element has text None.
                row[tag] = parse_text(elem.text or '')
                if not row[tag]:
                    is_article = False
            # write and close row
            elif tag == 'page':
                if is_article:
                    writer.writerow(row)
                    articleCount += 1
                    if articleCount % PROGRESS_EVERY == 0:
                        print(locale.format_string("%d", articleCount, grouping=True))
                row = None
                is_article = is_liste = False
                # Detach finished pages from the root; otherwise every
                # (already cleared) page element stays referenced by it and
                # memory grows with the size of the dump.
                root.clear()
            elem.clear()

        if articleCount == MAX_ARTICLES:
            break


t1 = time() - t0

print("Article pages:", locale.format_string("%d", articleCount, grouping=True))
print("Elapsed time:", hms_string(t1))

Starting ...
100
200
300
400
500
600
700
800
900
1.000
Article pages: 1.000
Elapsed time: 0:00:02.59


In [27]:
# NOTE(review): shell escape, presumably the jupyterthemes CLI selecting the
# 'solarizedd' UI theme -- it has no effect on the parsing above; confirm it is
# still wanted in the notebook.
!jt -t solarizedd