In [1]:
from glob import glob
import os

# Directory holding the downloaded Wikipedia dump partitions
keras_home = '/users/antoine.bres/.keras/datasets/'

# Every bz2-compressed file found in that directory (order is whatever glob yields)
bz2_pattern = os.path.join(keras_home, '*.bz2')
data_paths = glob(bz2_pattern)
data_paths

['/users/antoine.bres/.keras/datasets/frwiki-20200301-pages-articles-multistream5.xml-p4592356p6092355.bz2',
 '/users/antoine.bres/.keras/datasets/frwiki-20200301-pages-articles6.xml-p7494129p8994128.bz2',
 '/users/antoine.bres/.keras/datasets/frwiki-20200301-pages-articles1.xml-p1p275787.bz2',
 '/users/antoine.bres/.keras/datasets/frwiki-20200301-pages-articles-multistream1.xml-p1p275787.bz2',
 '/users/antoine.bres/.keras/datasets/frwiki-20200301-pages-articles-multistream4.xml-p2516882p4016881.bz2',
 '/users/antoine.bres/.keras/datasets/frwiki-20200301-pages-articles-multistream4.xml-p4016882p4592355.bz2',
 '/users/antoine.bres/.keras/datasets/frwiki-20200301-pages-articles3.xml-p927546p2427545.bz2',
 '/users/antoine.bres/.keras/datasets/frwiki-20200301-pages-articles-multistream3.xml-p2427546p2516881.bz2',
 '/users/antoine.bres/.keras/datasets/frwiki-20200301-pages-articles6.xml-p10494129p11994128.bz2',
 '/users/antoine.bres/.keras/datasets/frwiki-20200301-pages-articles-multistream

In [2]:
import bz2
# Select a single dump file (index 2 of the glob listing) to experiment with.
# NOTE(review): glob does not guarantee ordering, so index 2 may point at a
# different file on another machine — confirm before relying on it.
data_path = data_paths[2]
data_path

'/users/antoine.bres/.keras/datasets/frwiki-20200301-pages-articles1.xml-p1p275787.bz2'

In [3]:
import re
import mwparserfromhell

def process_article(title, text, timestamp, template = 'Infobox Langage de programmation'):
    """Extract infobox data from a Wikipedia article that uses `template`.

    Args:
        title: article title, returned unchanged in the result.
        text: raw wikitext of the article.
        timestamp: revision timestamp, returned unchanged in the result.
        template: pattern used both as the `matches` filter of
            `filter_templates` and as a regular expression (lower-cased)
            searched in each candidate template name.

    Returns:
        A tuple `(title, properties, wikilinks, exlinks, timestamp, text_length)`
        built from the first matching template, or None when the article
        contains no matching template.
    """
    parsed = mwparserfromhell.parse(text)

    # Keep only the templates whose own name matches the pattern; the
    # `matches` filter alone can also hit templates that merely contain it.
    name_pattern = template.lower()
    infoboxes = []
    for candidate in parsed.filter_templates(matches = template):
        candidate_name = candidate.name.strip_code().strip().lower()
        if re.search(name_pattern, candidate_name):
            infoboxes.append(candidate)

    if not infoboxes:
        return None

    # Infobox parameters of the first match, dropping those with an empty value
    properties = {}
    for param in infoboxes[0].params:
        value = param.value.strip_code().strip()
        if value:
            properties[param.name.strip_code().strip()] = value

    # Targets of the internal wiki links found anywhere in the article
    wikilinks = []
    for link in parsed.filter_wikilinks():
        wikilinks.append(link.title.strip_code().strip())

    # URLs of the external links found anywhere in the article
    exlinks = []
    for link in parsed.filter_external_links():
        exlinks.append(link.url.strip_code().strip())

    # Approximate article length: characters left once markup is stripped
    plain_text = parsed.strip_code().strip()
    text_length = len(plain_text)

    return (title, properties, wikilinks, exlinks, timestamp, text_length)

In [4]:
import xml.sax

class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """SAX handler that collects the pages of a Wikipedia dump using given templates.

    For every `<page>` element the text of its `<title>`, `<text>` and
    `<timestamp>` children is captured and handed to `process_article`;
    truthy results are accumulated in `_programation_languages`.

    Args:
        template_names: iterable of template names (a single string is also
            accepted). They are joined with "|" so the result can be used as
            a regular-expression alternation by `process_article`.
    """

    # Elements whose character data is captured for each page
    _TRACKED_TAGS = ('title', 'text', 'timestamp')

    def __init__(self, template_names):
        xml.sax.handler.ContentHandler.__init__(self)
        # A bare string would otherwise be joined character by character
        if isinstance(template_names, str):
            template_names = [template_names]
        self._buffer = None                # text chunks of the element being read
        self._values = {}                  # tag name -> text, for the current page
        self._current_tag = None           # tracked element currently open, if any
        self._programation_languages = []  # results returned by process_article
        self._article_count = 0            # number of <page> elements seen
        self._non_matches = []             # kept for compatibility; not filled here
        self._template_string = "|".join(template_names)

    def characters(self, content):
        """Accumulate character data while inside a tracked element."""
        if self._current_tag is not None:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Start buffering when a tracked element opens."""
        if name in self._TRACKED_TAGS:
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Store the text of a tracked element and process the page when it closes."""
        if name == self._current_tag:
            # The parser may deliver one text node in several chunks (for
            # instance around entities such as &amp;), so the chunks must be
            # concatenated without any separator to recover the original text.
            self._values[name] = ''.join(self._buffer)
            # Stop buffering so text outside tracked elements is ignored
            self._current_tag = None
            self._buffer = None

        if name == 'page':
            self._article_count += 1
            # Start the next page from a clean slate so that a page lacking a
            # field can never inherit the value of the previous page.
            page_values, self._values = self._values, {}
            if all(tag in page_values for tag in self._TRACKED_TAGS):
                # Check whether the page uses one of the requested templates
                prog_lang = process_article(**page_values, template=self._template_string)
                # Keep the extracted record when the page matched
                if prog_lang:
                    self._programation_languages.append(prog_lang)

In [5]:
from timeit import default_timer as timer

In [6]:
# Number of logical CPUs reported by the operating system
os.cpu_count()

4

In [7]:
import gc
import json

def get_output_name(data_path, partition_dir):
    """Build the output file path for a dump partition.

    The partition identifier is the part of `data_path` located after the
    last '-' and before the final extension (e.g. 'p1p275787' for
    '...articles1.xml-p1p275787.bz2'). The result is `partition_dir`
    immediately followed by '<identifier>.ndjson', so `partition_dir` is
    expected to end with a path separator.
    """
    # Everything after the last dash, e.g. 'p1p275787.bz2'
    last_segment = data_path.rpartition('-')[2]
    # Second-to-last dot-separated piece, i.e. the name without its extension
    partition_id = last_segment.rsplit('.', 2)[-2]
    return partition_dir + partition_id + '.ndjson'

In [8]:
def find_prog_languages(data_path, limit=None, save=True):
    """Find the programming-language articles of a compressed Wikipedia XML dump.

    Args:
        data_path: path of a bz2-compressed XML dump partition.
        limit: optional number of matching articles after which parsing stops
            and the articles found so far are returned (nothing is saved).
        save: when true, the matches are written as newline-delimited JSON to
            the partition directory, in a file named after the partition;
            when false, the matches are returned instead.

    Returns:
        The list of matches when `limit` is reached or when `save` is false;
        None when the results were saved or when the output file already
        exists (in which case the partition is skipped entirely).
    """
    partition_dir = '../data/wiki/partitions/'
    out_path = get_output_name(data_path, partition_dir)

    if os.path.exists(out_path):
        # Partition already handled by a previous run: report progress and skip
        print(f'{len(os.listdir(partition_dir))} files processed.', end = '\r')
        return None

    # Object for handling xml
    handler = WikiXmlHandler(template_names=['Infobox Langage de programmation'])
    # template_names=['Infobox Langage de programmation','Infobox Logiciel',"Infobox Système d'exploitation"]
    # Parsing object
    parser = xml.sax.make_parser()
    parser.setContentHandler(handler)

    # Stream the compressed file line by line; the context manager makes sure
    # it is closed even on the early return below or if parsing fails.
    with bz2.BZ2File(data_path, 'r') as dump:
        for line in dump:
            parser.feed(line)

            # Optional limit
            if limit is not None and len(handler._programation_languages) >= limit:
                return handler._programation_languages

    languages = handler._programation_languages
    if not save:
        # Hand the matches back instead of discarding the parsing work
        return languages

    # Make sure the target directory exists before writing the results
    os.makedirs(partition_dir, exist_ok=True)
    with open(out_path, 'w') as fout:
        # One JSON document per line
        for lang in languages:
            fout.write(json.dumps(lang) + '\n')

    # Memory management
    del handler, parser, languages
    gc.collect()
    return None

In [9]:
# Full paths of the plain (non-multistream) article partitions of the dump
partitions = [
    os.path.join(keras_home, file_name)
    for file_name in os.listdir(keras_home)
    if 'xml-p' in file_name and 'multistream' not in file_name
]
len(partitions), partitions[-1]

(12,
 '/users/antoine.bres/.keras/datasets/frwiki-20200301-pages-articles5.xml-p6092356p7494128.bz2')

In [11]:
from multiprocessing import Pool 
import tqdm 

# List of lists to single list
from itertools import chain

# Sending keyword arguments in map
from functools import partial

In [12]:
# Create a pool of workers to execute processes
pool = Pool(processes = 4)

start = timer()

try:
    # Map (service, tasks), applies function to each partition
    results = pool.map(find_prog_languages, partitions)
finally:
    # Always release the worker processes, even if a partition fails,
    # so they do not linger in the notebook kernel.
    pool.close()
    pool.join()

end = timer()
print(f'{end - start} seconds elapsed.')

3391.079689787992 seconds elapsed.
