# In [1]:
import re
from xml.sax import make_parser
from xml.sax.saxutils import XMLFilterBase, XMLGenerator
import bz2
import logging
from pathlib import Path
# Log files live in ./logs; the directory is created up front so that
# logging.basicConfig can open the log file inside it.
LOGDIR = Path("logs")
LOGDIR.mkdir(exist_ok=True, parents=True)
logging.basicConfig(level=logging.DEBUG, filename=LOGDIR / "parse.log")


class Project1Filter(XMLFilterBase):
    """SAX filter that re-emits only pages whose text holds a coord template.

    Element events coming from the parent parser are never forwarded as-is.
    Instead, the character data of the elements named in
    ``tag_names_to_include`` is collected; only ``title`` and ``text`` are
    acted upon.  When a ``text`` element ends and its content matches
    ``self.regex`` (``{{Coord...}}`` / ``{{coord...}}``), a synthetic element
    is sent to the downstream content handler::

        <page><title>…</title><coords>…</coords><text>…</text>\\n</page>

    The emitted ``page`` elements are not wrapped in any root element by
    this class.

    NOTE: ``.`` in the pattern does not match a newline, so only templates
    that open and close on the same line are detected.
    """

    def __init__(self, tag_names_to_include, parent=None):
        """Create the filter.

        Args:
            tag_names_to_include: Container of element names whose content
                should be collected (membership is tested with ``in``).
            parent: Upstream XMLReader/filter, passed to ``XMLFilterBase``.
        """
        super().__init__(parent)

        # Names of the elements whose character data is collected.
        self._tag_names_to_include = tag_names_to_include

        # True while the parser is inside one of the included elements.
        self._valid_tag = False
        self.current_tag = ''
        self.current_title = ''
        self.current_text = ''
        self.coord_found = False
        self.regex = re.compile('{{[Cc]oord.*?}}')

        # Character data arrives in arbitrary chunks; buffering the chunks
        # and joining once avoids quadratic string concatenation and
        # repeated regex scans over large article bodies.
        self._title_chunks = []
        self._text_chunks = []

    def startElement(self, name, attrs):
        """Begin collecting character data when an included element opens."""
        if name not in self._tag_names_to_include:
            return

        self._valid_tag = True
        if name == 'title':
            self.current_tag = name
            # Start from a clean slate so that the title of a previous page
            # that had no <text> element cannot leak into this one.
            self.current_title = ''
            self._title_chunks = []
        elif name == 'text':
            self.current_tag = name
            self.current_text = ''
            self.coord_found = False
            self._text_chunks = []

    def endElement(self, name):
        """Finish the current element; emit the page if coords were found."""
        if name not in self._tag_names_to_include:
            return

        if self.current_tag == 'title':
            self.current_title = ''.join(self._title_chunks)
            self._title_chunks = []
            self.current_tag = ''
            self._valid_tag = False
        elif self.current_tag == 'text':
            self.current_text = ''.join(self._text_chunks)
            # A single search over the complete text yields the same first
            # match as searching after every chunk, at linear cost.
            match = self.regex.search(self.current_text)
            self.coord_found = match is not None
            if match is not None:
                logging.info("%s", self.current_title)
                self._emit_page(match.group())
            self._reset_page()

    def characters(self, content):
        """Buffer character data belonging to the current title/text."""
        if not self._valid_tag:
            return

        if self.current_tag == 'title':
            self._title_chunks.append(content)
        elif self.current_tag == 'text':
            self._text_chunks.append(content)

    def _emit_page(self, coords):
        """Send one synthetic <page> element to the downstream handler."""
        super().startElement('page', {})

        super().startElement('title', {})
        super().characters(self.current_title)
        super().endElement('title')

        super().startElement('coords', {})
        super().characters(coords)
        super().endElement('coords')

        super().startElement('text', {})
        super().characters(self.current_text)
        super().endElement('text')
        super().characters('\n')

        super().endElement('page')

    def _reset_page(self):
        """Forget all per-page state once a text element has been handled."""
        self.current_tag = ''
        self.current_title = ''
        self.current_text = ''
        self._valid_tag = False
        self.coord_found = False
        self._title_chunks = []
        self._text_chunks = []


# Elements of the dump whose content the filter collects.
tag_names_to_include = {'title', 'text'}
reader = Project1Filter(tag_names_to_include, make_parser())

# Make sure the output directory exists before the (long) parse starts.
output_path = Path('data/wiki_coords_utf8.xml')
output_path.parent.mkdir(parents=True, exist_ok=True)

# The output file is opened explicitly as UTF-8 so that the bytes on disk
# agree with the encoding XMLGenerator declares in the XML prolog, whatever
# the platform default is.  Both files are closed even if parsing fails.
with open(output_path, 'w', encoding='utf-8') as f, \
        bz2.open('../../data/enwiki-latest-pages-articles-multistream.xml.bz2') as fp:
    handler = XMLGenerator(f, encoding='utf-8')
    reader.setContentHandler(handler)
    reader.parse(fp)
