### Parse Wikipedia

In [None]:
import xml.etree.ElementTree as etree
import codecs
import csv
import time
import os

In [None]:
# Working directory (below DATA_BASE) that holds the XML dump and receives
# the CSV files written by the parsing cells.
PATH_WIKI_XML = join(DATA_BASE, 'dewiki')

# Text encoding used when writing the CSV output files.
ENCODING = "utf-8"

# Input: the XML dump that gets parsed.
FILENAME_WIKI = 'dewiki-latest-pages-articles.xml'
pathWikiXML = join(PATH_WIKI_XML, FILENAME_WIKI)

# Output: one CSV file per page category.
FILENAME_ARTICLES = 'articles.csv'
pathArticles = join(PATH_WIKI_XML, FILENAME_ARTICLES)

FILENAME_REDIRECT = 'articles_redirect.csv'
pathArticlesRedirect = join(PATH_WIKI_XML, FILENAME_REDIRECT)

# NOTE: despite the "Redirect" in its name, this is the path of the
# template CSV file (it is built from FILENAME_TEMPLATE).
FILENAME_TEMPLATE = 'articles_template.csv'
pathTemplateRedirect = join(PATH_WIKI_XML, FILENAME_TEMPLATE)

In [None]:
# Nicely formatted time string
def hms_string(sec_elapsed):
    """Format a duration in seconds as ``H:MM:SS.ss``.

    The duration is rounded to hundredths of a second *before* it is split
    into hours, minutes and seconds, so a rounding carry propagates into the
    higher fields (59.999 becomes ``0:01:00.00`` rather than ``0:00:60.00``).

    Args:
        sec_elapsed: Non-negative duration in seconds (int or float).

    Returns:
        The formatted string; hours are not zero-padded, minutes and seconds
        are padded to two digits, seconds carry two decimals.
    """
    total_centis = int(round(sec_elapsed * 100))
    total_seconds, centis = divmod(total_centis, 100)
    total_minutes, s = divmod(total_seconds, 60)
    h, m = divmod(total_minutes, 60)
    return "{}:{:02d}:{:02d}.{:02d}".format(h, m, s, centis)

def strip_tag_name(t):
    """Return the tag name *t* without its ``{namespace}`` prefix.

    ElementTree reports namespaced tags as ``{uri}local``; everything up to
    and including the last ``}`` is dropped. A tag without a ``}`` is
    returned unchanged.

    Args:
        t: Tag string, e.g. ``elem.tag``.

    Returns:
        The local part of the tag name.
    """
    # Operate on the argument itself; the previous version overwrote it with
    # the global ``elem.tag`` and therefore ignored what the caller passed.
    idx = t.rfind("}")
    if idx != -1:
        t = t[idx + 1:]
    return t

In [None]:
import locale
# Adopt the user's default locale so that number grouping follows the
# local conventions.
locale.setlocale(locale.LC_ALL, '')
# Quick check of grouped output. locale.format() was deprecated in
# Python 3.7 and removed in 3.12; locale.format_string() is the replacement.
locale.format_string("%d", 1255000, grouping=True)

In [None]:
# Adapted from
# https://www.heatonresearch.com/2017/03/03/python-basic-wikipedia-parsing.html
# => https://github.com/jeffheaton/article-code/blob/master/python/wikipedia/wiki-basic-stream.py
#
# Streams the XML dump and writes one CSV row per <page> into one of three
# files: templates (ns == 10), redirects (page has a <redirect> target) and
# articles (everything else).
#
# Fixes relative to the copied script:
#   * the article / redirect branches were swapped,
#   * the "Redirect pages" summary printed the template counter,
#   * the summary used locale.format(), which rejects text around the
#     specifier (and no longer exists in Python 3.12+),
#   * the elapsed-time string was formatted with %d.

totalCount = 0
articleCount = 0
redirectCount = 0
templateCount = 0
title = None

# Per-page state. It is reset on every <page> start event; initialising it
# here keeps the 'end' handlers safe for elements seen before the first page.
id_ = -1
redirect = ''
inrevision = False
ns = 0

print('start')

start_time = time.time()

with codecs.open(pathArticles, "w", ENCODING) as articlesFH, \
        codecs.open(pathArticlesRedirect, "w", ENCODING) as redirectFH, \
        codecs.open(pathTemplateRedirect, "w", ENCODING) as templateFH:
    articlesWriter = csv.writer(articlesFH, quoting=csv.QUOTE_MINIMAL)
    redirectWriter = csv.writer(redirectFH, quoting=csv.QUOTE_MINIMAL)
    templateWriter = csv.writer(templateFH, quoting=csv.QUOTE_MINIMAL)

    articlesWriter.writerow(['id', 'title', 'redirect'])
    redirectWriter.writerow(['id', 'title', 'redirect'])
    templateWriter.writerow(['id', 'title'])

    for event, elem in etree.iterparse(pathWikiXML, events=('start', 'end')):
        tname = strip_tag_name(elem.tag)
        # Debug trace of every parser event. Note that elem.text is only
        # guaranteed to be complete on 'end' events.
        print('event', event)
        print('tname', tname)
        print('text', elem.text)
        print()

        if event == 'start':
            if tname == 'page':
                # A new page begins: reset the per-page state.
                title = ''
                id_ = -1
                redirect = ''
                inrevision = False
                ns = 0
            elif tname == 'revision':
                # Do not pick up on revision id's
                inrevision = True

        else:
            if tname == 'title':
                title = elem.text
            elif tname == 'id' and not inrevision:
                if elem.text:
                    id_ = int(elem.text)
                else:
                    id_ = None
            elif tname == 'redirect':
                # The redirect target is carried in the 'title' attribute.
                redirect = elem.attrib.get('title', '')
            elif tname == 'ns':
                if elem.text:
                    ns = int(elem.text)
                else:
                    ns = None
            elif tname == 'page':
                totalCount += 1
                if totalCount > 1 and (totalCount % 100000) == 0:
                    print(locale.format_string("%d", totalCount, grouping=True))

                if ns == 10:
                    templateCount += 1
                    templateWriter.writerow([id_, title])
                elif len(redirect) > 0:
                    # A redirect target is present, so this is a redirect page.
                    redirectCount += 1
                    redirectWriter.writerow([id_, title, redirect])
                else:
                    articleCount += 1
                    articlesWriter.writerow([id_, title, redirect])

                # Debug limit: stop after the 11th page. Remove this (and the
                # event trace above) to process the whole dump.
                if totalCount > 10:
                    break

            # Release the content of the finished element to bound memory use.
            elem.clear()

elapsed_time = time.time() - start_time

print(locale.format_string("Total pages: %d", totalCount, grouping=True))
print(locale.format_string("Template pages: %d", templateCount, grouping=True))
print(locale.format_string("Article pages: %d", articleCount, grouping=True))
print(locale.format_string("Redirect pages: %d", redirectCount, grouping=True))
print("Elapsed time: {}".format(hms_string(elapsed_time)))



In [None]:
# TODO: make changes
#
# Working copy of the dump parser. It streams the XML dump and writes one CSV
# row per <page> into one of three files: templates (ns == 10), redirects
# (page has a <redirect> target) and articles (everything else).

totalCount = 0
articleCount = 0
redirectCount = 0
templateCount = 0
title = None

# Per-page state. It is reset on every <page> start event; initialising it
# here keeps the 'end' handlers safe for elements seen before the first page.
id_ = -1
redirect = ''
inrevision = False
ns = 0

print('start')

start_time = time.time()

with codecs.open(pathArticles, "w", ENCODING) as articlesFH, \
        codecs.open(pathArticlesRedirect, "w", ENCODING) as redirectFH, \
        codecs.open(pathTemplateRedirect, "w", ENCODING) as templateFH:
    articlesWriter = csv.writer(articlesFH, quoting=csv.QUOTE_MINIMAL)
    redirectWriter = csv.writer(redirectFH, quoting=csv.QUOTE_MINIMAL)
    templateWriter = csv.writer(templateFH, quoting=csv.QUOTE_MINIMAL)

    articlesWriter.writerow(['id', 'title', 'redirect'])
    redirectWriter.writerow(['id', 'title', 'redirect'])
    templateWriter.writerow(['id', 'title'])

    for event, elem in etree.iterparse(pathWikiXML, events=('start', 'end')):
        tname = strip_tag_name(elem.tag)
        # Debug trace of every parser event. Note that elem.text is only
        # guaranteed to be complete on 'end' events.
        print('event', event)
        print('tname', tname)
        print('text', elem.text)
        print()

        if event == 'start':
            if tname == 'page':
                # A new page begins: reset the per-page state.
                title = ''
                id_ = -1
                redirect = ''
                inrevision = False
                ns = 0
            elif tname == 'revision':
                # Do not pick up on revision id's
                inrevision = True

        else:
            if tname == 'title':
                title = elem.text
            elif tname == 'id' and not inrevision:
                if elem.text:
                    id_ = int(elem.text)
                else:
                    id_ = None
            elif tname == 'redirect':
                # The redirect target is carried in the 'title' attribute.
                redirect = elem.attrib.get('title', '')
            elif tname == 'ns':
                if elem.text:
                    ns = int(elem.text)
                else:
                    ns = None
            elif tname == 'page':
                totalCount += 1
                if totalCount > 1 and (totalCount % 100000) == 0:
                    # locale.format() is gone in Python 3.12+.
                    print(locale.format_string("%d", totalCount, grouping=True))

                if ns == 10:
                    templateCount += 1
                    templateWriter.writerow([id_, title])
                elif len(redirect) > 0:
                    # A redirect target is present, so this is a redirect
                    # page (these two branches used to be swapped).
                    redirectCount += 1
                    redirectWriter.writerow([id_, title, redirect])
                else:
                    articleCount += 1
                    articlesWriter.writerow([id_, title, redirect])

                # Debug limit: stop after the 11th page. Remove this (and the
                # event trace above) to process the whole dump.
                if totalCount > 10:
                    break

            # Release the content of the finished element to bound memory use.
            elem.clear()

elapsed_time = time.time() - start_time

# format_string() accepts text around the specifier; locale.format() did not.
print(locale.format_string("Total pages: %d", totalCount, grouping=True))
print(locale.format_string("Template pages: %d", templateCount, grouping=True))
print(locale.format_string("Article pages: %d", articleCount, grouping=True))
# Report the redirect counter (the template counter was printed here before).
print(locale.format_string("Redirect pages: %d", redirectCount, grouping=True))
# hms_string() returns a string, so it cannot be formatted with %d.
print("Elapsed time: {}".format(hms_string(elapsed_time)))
