In [1]:
import requests
from bs4 import BeautifulSoup
import os
from multiprocessing import Pool 
import tqdm 
import json
from itertools import chain
from functools import partial

import sys
from keras.utils import get_file

import bz2
import subprocess
import xml.sax
import mwparserfromhell 
import re
import gc
import json

from timeit import default_timer as timer

# Download/cache directory used by the cells below. This is the same
# `~/.keras/datasets/` location as before, but resolved from the current
# user's home directory instead of a machine-specific absolute path.
# The trailing '' makes the result end with a path separator, because later
# cells build file paths with plain concatenation (`root_path + file`).
root_path = os.path.join(os.path.expanduser('~'), '.keras', 'datasets', '')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Fetch and parse the index page that lists the available enwiki dumps
base_url = 'https://dumps.wikimedia.org/enwiki/'
index = requests.get(base_url).text
soup_index = BeautifulSoup(index, 'html.parser')

# Target of every anchor that actually carries an href attribute
dumps = [anchor['href'] for anchor in soup_index.find_all('a', href=True)]
dumps

['../',
 '20190701/',
 '20190720/',
 '20190801/',
 '20190820/',
 '20190901/',
 '20190920/',
 '20191001/',
 'latest/']

In [3]:
# Second-to-last link of the index; in the listing recorded above that is the
# newest dated dump (the last entry is 'latest/')
dump_url = base_url + dumps[-2]

# Download the dump's own page and collect its <li class="file"> entries
dump_html = requests.get(dump_url).text
soup_dump = BeautifulSoup(dump_html, 'html.parser')
link_list = soup_dump.find_all('li', class_='file')

In [4]:
files = []

# Keep only the entries whose text mentions 'pages-articles'
for file in link_list:
    text = file.text
    if 'pages-articles' not in text:
        continue
    # First whitespace-separated token is the file name; the remaining
    # tokens are stored alongside it (the size, in the output shown below)
    tokens = text.split()
    files.append((tokens[0], tokens[1:]))

files[:5]

[('enwiki-20191001-pages-articles-multistream.xml.bz2', ['16.3', 'GB']),
 ('enwiki-20191001-pages-articles-multistream-index.txt.bz2', ['205.7', 'MB']),
 ('enwiki-20191001-pages-articles-multistream1.xml-p10p30302.bz2',
  ['173.0', 'MB']),
 ('enwiki-20191001-pages-articles-multistream-index1.txt-p10p30302.bz2',
  ['163', 'KB']),
 ('enwiki-20191001-pages-articles-multistream2.xml-p30304p88444.bz2',
  ['205.8', 'MB'])]

In [5]:
# Names of the partitioned dump files, recognised by '.xml-p' in the name
files_to_download = [name for name, _size in files if '.xml-p' in name]
files_to_download[-5:]

['enwiki-20191001-pages-articles27.xml-p54663464p56163464.bz2',
 'enwiki-20191001-pages-articles27.xml-p56163464p57663464.bz2',
 'enwiki-20191001-pages-articles27.xml-p57663464p59163464.bz2',
 'enwiki-20191001-pages-articles27.xml-p59163464p60663464.bz2',
 'enwiki-20191001-pages-articles27.xml-p60663464p61937279.bz2']

In [6]:
data_paths = []
file_info = []

L = len(files_to_download)

# Iterate through each file
for file in tqdm.tqdm(files_to_download):
    path = root_path + file

    if os.path.exists(path):
        # Already on disk: reuse the existing copy
        data_paths.append(path)
    else:
        # Not on disk yet: download it, then keep working with the location
        # that `get_file` reports for the downloaded file
        path = get_file(file, dump_url + file)
        data_paths.append(path)

    # Find the file size in MB
    file_size = os.stat(path).st_size / (1024*1024)

    # File names end in 'p<first id>p<last id>.bz2'; record the span between
    # the two ids
    name_parts = file.split('p')
    file_articles = int(name_parts[-1].split('.')[-2]) - int(name_parts[-2])

    # One record layout for both cases (previously a fresh download stored the
    # full file name while a cached file stored only the trailing label):
    # (trailing 'p<first id>p<last id>.bz2' label, size in MB, id span)
    file_info.append((file.split('-')[-1], file_size, file_articles))

100%|██████████| 114/114 [00:00<00:00, 47379.18it/s]


In [7]:
def process_article(title, text, timestamp, template = 'Infobox book'):
    """Extract infobox data from one article if it uses `template`.

    `text` is parsed with mwparserfromhell. When at least one template whose
    name equals `template` (case-insensitive, surrounding whitespace ignored)
    is present, returns the tuple
    `(title, properties, wikilinks, exlinks, timestamp, text_length)`, where
    `properties` holds the non-empty parameters of the first such template.
    Returns None when the article has no matching template.
    """
    parsed = mwparserfromhell.parse(text)

    # `matches` is only a coarse filter, so compare the template names exactly
    infoboxes = [
        candidate
        for candidate in parsed.filter_templates(matches = template)
        if candidate.name.strip_code().strip().lower() == template.lower()
    ]

    if not infoboxes:
        return None

    # Parameters of the first matching infobox, skipping blank values
    properties = {}
    for param in infoboxes[0].params:
        value = param.value.strip_code().strip()
        if value:
            properties[param.name.strip_code().strip()] = value

    # Targets of internal links, then URLs of external links
    wikilinks = [link.title.strip_code().strip() for link in parsed.filter_wikilinks()]
    exlinks = [link.url.strip_code().strip() for link in parsed.filter_external_links()]

    # Approximate article length: character count of the markup-free text
    text_length = len(parsed.strip_code().strip())

    return (title, properties, wikilinks, exlinks, timestamp, text_length)

In [8]:
class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that collects book articles from a Wikipedia dump.

    The text of each <title>, <text> and <timestamp> element is captured.
    At every closing </page> the captured values are passed to
    `process_article` (template 'Infobox book'), and any non-empty result is
    appended to `_books`.
    """
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        # Chunks of character data for the element currently being captured
        self._buffer = None
        # Latest captured text, keyed by element name
        self._values = {}
        # Name of the element being captured, or None outside of one
        self._current_tag = None
        # Results returned by `process_article` for matching pages
        self._books = []
        # Number of </page> elements seen so far
        self._article_count = 0
        # Not used by this class; kept so existing attribute access still works
        self._non_matches = []

    def characters(self, content):
        """Collect character data while inside a captured element."""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Start capturing when one of the elements of interest opens."""
        if name in ('title', 'text', 'timestamp'):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Store the captured text; at </page>, run the book check."""
        if name == self._current_tag:
            # SAX may deliver one run of text as several arbitrary chunks, so
            # the chunks must be concatenated as-is. Joining them with a
            # space would insert characters that are not in the document.
            self._values[name] = ''.join(self._buffer)
            # Stop capturing, so character data belonging to other elements
            # is no longer accumulated in the buffer
            self._current_tag = None
            self._buffer = None

        if name == 'page':
            self._article_count += 1
            # Search through the page to see if the page is a book
            book = process_article(**self._values, template = 'Infobox book')
            # Append to the list of books
            if book:
                self._books.append(book)

In [9]:
def find_books(data_path, limit = None, save = True):
    """Find all the book articles from a compressed wikipedia XML dump.

    The file at `data_path` is decompressed by an external `bzcat` process
    and fed line by line to a SAX parser driven by `WikiXmlHandler`.

    `limit` is an optional argument to only return a set number of books:
    as soon as that many books have been found, the list of books is
    returned and nothing is saved.

    If `save`, the books are written as newline-delimited JSON to
    './data/<partition>.ndjson', where <partition> is taken from the file
    name. The directory is created when missing.

    Returns the list of books when `limit` is reached, otherwise None.
    """

    # Object for handling xml
    handler = WikiXmlHandler()

    # Parsing object
    parser = xml.sax.make_parser()
    parser.setContentHandler(handler)

    # Iterate through compressed file. Both context managers guarantee that
    # the file handle is closed and the bzcat child is waited for, whichever
    # way the loop is left.
    with open(data_path, 'rb') as compressed:
        with subprocess.Popen(['bzcat'],
                              stdin = compressed,
                              stdout = subprocess.PIPE) as bzcat:
            for line in bzcat.stdout:
                try:
                    parser.feed(line)
                except StopIteration:
                    # The rest of the output is not needed
                    bzcat.kill()
                    break

                # Optional limit
                if limit is not None and len(handler._books) >= limit:
                    # Stop decompressing instead of leaving bzcat running
                    bzcat.kill()
                    return handler._books

    if save:
        partition_dir = './data/'
        # Make sure the output directory exists before writing to it
        os.makedirs(partition_dir, exist_ok = True)

        # Create file name based on partition name
        p_str = data_path.split('-')[-1].split('.')[-2]
        out_dir = partition_dir + f'{p_str}.ndjson'

        # Open the file
        with open(out_dir, 'w') as fout:
            # Write as json
            for book in handler._books:
                fout.write(json.dumps(book) + '\n')

        print(f'{len(os.listdir(partition_dir))} files processed.', end = '\r')

    # Memory management
    del handler
    del parser
    gc.collect()
    return None

In [10]:
# Full path of every entry in the download directory whose name contains 'xml-p'
partitions = [root_path + name for name in os.listdir(root_path) if 'xml-p' in name]
len(partitions), partitions[-1]

(114,
 '/Users/lifengwei/.keras/datasets/enwiki-20191001-pages-articles-multistream8.xml-p1268693p1791079.bz2')

In [11]:
# Number of CPUs reported by the OS; the pool below is sized from this value
os.cpu_count()

12

In [12]:
# One worker fewer than the CPU count, as before, but never fewer than one:
# `os.cpu_count()` can be 1 (which would request 0 processes) or None
n_workers = max(1, (os.cpu_count() or 1) - 1)
pool = Pool(processes = n_workers)

start = timer()

try:
    # Map (service, tasks), applies function to each partition
    results = pool.map(find_books, partitions)
except BaseException:
    # Do not leave worker processes behind when a partition fails or the
    # run is interrupted
    pool.terminate()
    raise
else:
    pool.close()
finally:
    pool.join()

end = timer()
print(f'{end - start} seconds elapsed.')

27764.362621568995 seconds elapsed.


In [2]:
def read_data(file_path):
    """Load newline-delimited JSON from `file_path`.

    Every line of the file is decoded with `json.loads`; the decoded values
    are returned as a list in file order.
    """
    with open(file_path, 'r') as fin:
        return [json.loads(line) for line in fin]

In [3]:
from multiprocessing.dummy import Pool as Threadpool
from itertools import chain

current_path = os.getcwd() + '/'

start = timer()

# List of files to read in: every '*ndjson' entry of the data directory
data_dir = current_path + 'data/'
saved_files = [data_dir + x for x in os.listdir(data_dir) if x.endswith('ndjson')]

# Read in the files as a list of lists. The `with` block shuts the worker
# threads down once the reads are done instead of leaving them alive in the
# kernel.
with Threadpool(processes = 10) as threadpool:
    results = threadpool.map(read_data, saved_files)

# Flatten the list of lists to a single list
book_list = list(chain.from_iterable(results))

end = timer()

print(f'Found {len(book_list)} books in {round(end - start)} seconds.')

Found 38687 books in 1 seconds.


In [4]:
# Write the combined book list once; an existing file is left untouched
filtered_path = current_path + 'found_books_filtered.ndjson'

if os.path.exists(filtered_path):
    print('Files already saved.')
else:
    with open(filtered_path, 'w+') as fout:
        # One JSON document per line
        fout.writelines(json.dumps(book) + '\n' for book in book_list)
    print('Books saved.')

Files already saved.
