In [1]:
import chardet
import os
import concurrent.futures
import time
from bs4 import BeautifulSoup
from collections import Counter
import re

In [2]:
# Preview the first 30 entries instead of dumping the whole directory listing
# into the cell output.
os.listdir("wiki")[:30]

['Furubira_District,_Hokkaido.html',
 'Valentin_Yanin.html',
 'Kings_XI_Punjab_in_2014.html',
 'William_Harvey_Lillard.html',
 'Radial_Road_3.html',
 'George_Weldrick.html',
 'Zgornji_Otok.html',
 'Blue_Heelers_(season_8).html',
 'Taggen_Nunatak.html',
 '1951_National_League_tie-breaker_series.html',
 'List_of_number-one_singles_of_1993_(Finland).html',
 'Vrila.html',
 'William_Henry_Porter.html',
 'Clive_Brown_(footballer).html',
 '2010_Karshi_Challenger_%E2%80%93_Singles.html',
 'Blick_nach_Rechts.html',
 'Central_District_(Rezvanshahr_County).html',
 'Gal%C3%A1pagos,_Guadalajara.html',
 'Campus_of_Texas_A%26M_University.html',
 'Alexios_Aspietes.html',
 'Mei_Lanfang.html',
 'Thalkirchen-Obersendling-Forstenried-F%C3%BCrstenried-Solln.html',
 'Coalville_Town_railway_station.html',
 'Gennady_Lesun.html',
 'Bartrum_Glacier.html',
 'Victor_S._Mamatey.html',
 'Gottfried_Keller.html',
 'Table_Point_Formation.html',
 'Nobuhiko_Ushiba.html',
 'Master_of_Space_and_Time.html',
 'Early_medieva

In [3]:
# Number of entries in the "wiki" directory.
len(os.listdir("wiki"))

999

In [4]:
# Inspect one sample page: load its raw bytes, then let chardet guess the encoding.
with open("wiki/1915_Montana_football_team.html", mode='rb') as f:
    raw_bytes = f.read()

# The bytes are already in memory, so detection and display happen after the
# file has been closed.
detected_encoding = chardet.detect(raw_bytes)
print(detected_encoding, '\n', '-'*60)
# Print only the start of the page; the full document would flood the output.
print(raw_bytes[:1000])

{'encoding': 'utf-8', 'confidence': 0.99, 'language': ''} 
 ------------------------------------------------------------
b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>1915 Montana football team - Wikipedia</title>\n<script>document.documentElement.className = document.documentElement.className.replace( /(^|\\s)client-nojs(\\s|$)/, "$1client-js$2" );</script>\n<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"1915_Montana_football_team","wgTitle":"1915 Montana football team","wgCurRevisionId":747337042,"wgRevisionId":747337042,"wgArticleId":34186807,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["1915 NCAA football season","Montana Grizzlies football seasons"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSepar

It appears that the main content is nested inside the `div` tag whose `id` is `content`.
# Reading In The Data


In [5]:
# Thread pool with two workers; it is used further down in this cell to read
# the article files.
pool = concurrent.futures.ThreadPoolExecutor(max_workers=2)

def read_data(filename):
    """Open ``filename`` in binary mode and return its full contents as bytes."""
    with open(filename, 'rb') as handle:
        return handle.read()

start = time.time()
filenames = ["wiki/{}".format(f) for f in os.listdir("wiki")]
# pool.map returns an iterator; list() waits for and collects every result,
# so all files have been read before the timer stops.
content = list(pool.map(read_data, filenames))

end = time.time()
print(end - start)

# Every read has finished, so release the pool's worker threads instead of
# leaving them idle for the rest of the session.
pool.shutdown()

# Article names: each path with the ".html" and "wiki/" substrings removed.
articles = [f.replace(".html", "").replace("wiki/", "") for f in filenames]

0.13100385665893555


After some profiling, threading doesn't appear to make a big difference to performance here. This may be because, although the files are read concurrently, most of the potential gain is offset by the overhead of creating new threads.

# Remove Extraneous Markup

In [6]:
def parse_html(html):
    """Return the markup of the page's ``div`` with id ``content`` as a string.

    Args:
        html: Markup of one page, parsed with BeautifulSoup's ``html.parser``
            backend (this notebook passes the raw bytes read from disk).

    Returns:
        ``str()`` of the first ``div`` element whose id is ``content``.

    Raises:
        IndexError: If the page contains no such ``div``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Only the first match is used, so limit=1 lets the search stop there
    # instead of walking the rest of the tree.
    return str(soup.find_all("div", id="content", limit=1)[0])

start = time.time()
# Leaving the ``with`` block shuts the worker processes down, so they are not
# left running after this cell finishes.
with concurrent.futures.ProcessPoolExecutor(max_workers=3) as pool:
    # list() drains the result iterator, i.e. waits for every page to be parsed.
    parsed = list(pool.map(parse_html, content))
end = time.time()

print(end - start)

27.345690488815308


In [7]:
# Preview the start of the first parsed page; the full markup is far too long
# to display usefully.
parsed[0][:1000]

'<div class="mw-body" id="content" role="main">\n<a id="top"></a>\n<div id="siteNotice"><!-- CentralNotice --></div>\n<div class="mw-indicators">\n</div>\n<h1 class="firstHeading" id="firstHeading" lang="en">Furubira District, Hokkaido</h1>\n<div class="mw-body-content" id="bodyContent">\n<div id="siteSub">From Wikipedia, the free encyclopedia</div>\n<div id="contentSub"></div>\n<div class="mw-jump" id="jump-to-nav">\n\t\t\t\t\tJump to:\t\t\t\t\t<a href="#mw-head">navigation</a>, \t\t\t\t\t<a href="#p-search">search</a>\n</div>\n<div class="mw-content-ltr" dir="ltr" id="mw-content-text" lang="en"><table class="plainlinks metadata ambox ambox-content ambox-Unreferenced" role="presentation">\n<tr>\n<td class="mbox-image">\n<div style="width:52px"><a class="image" href="/wiki/File:Question_book-new.svg"><img alt="" data-file-height="399" data-file-width="512" height="39" src="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/50px-Question_book-new.svg.png" srcset="//upl

This operation is quite slow and CPU-intensive. It looks like using as many processes as there are available processors speeds things up.
# Finding Common Tags

In [8]:
def count_tags(html):
    """Count how often each tag name occurs in a piece of markup.

    Args:
        html: Markup to parse with BeautifulSoup's ``html.parser`` backend.

    Returns:
        A plain ``dict`` mapping each tag name to its number of occurrences,
        keyed in order of first appearance.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Counter does the tallying; converting back to dict keeps the return type
    # callers already rely on.
    return dict(Counter(tag.name for tag in soup.find_all()))

start = time.time()
# Leaving the ``with`` block shuts the worker processes down.
with concurrent.futures.ProcessPoolExecutor(max_workers=3) as pool:
    tags = list(pool.map(count_tags, parsed))

# Merge the per-article tallies into one total across all articles.
tag_totals = Counter()
for article_tags in tags:
    tag_totals.update(article_tags)
# Store a plain dict so the cell output renders as a mapping, not a Counter.
tag_counts = dict(tag_totals)
end = time.time()

print(end - start)
tag_counts

13.769802808761597


{'a': 161065,
 'abbr': 3665,
 'annotation': 2,
 'area': 39,
 'audio': 2,
 'b': 14455,
 'bdi': 4,
 'big': 75,
 'blockquote': 58,
 'br': 4986,
 'caption': 200,
 'center': 64,
 'cite': 3563,
 'code': 108,
 'dd': 1376,
 'del': 2,
 'div': 28581,
 'dl': 457,
 'dt': 334,
 'font': 40,
 'h1': 999,
 'h2': 4045,
 'h3': 777,
 'h4': 117,
 'h5': 4,
 'h6': 1,
 'hr': 51,
 'i': 18246,
 'img': 6701,
 'li': 85779,
 'map': 2,
 'math': 2,
 'mo': 2,
 'mrow': 2,
 'mstyle': 2,
 'noscript': 999,
 'ol': 858,
 'p': 7998,
 'pre': 1,
 'q': 76,
 'rb': 16,
 'rp': 32,
 'rt': 16,
 'ruby': 16,
 's': 10,
 'samp': 2,
 'semantics': 2,
 'small': 3272,
 'source': 2,
 'span': 67350,
 'strong': 599,
 'sub': 151,
 'sup': 11157,
 'table': 4010,
 'td': 57673,
 'th': 14472,
 'tr': 27300,
 'u': 51,
 'ul': 10972,
 'wbr': 85}

Based on our findings, it looks like there are quite a few td, a, li, and span tags. This indicates that articles tend to have lots of links, along with lists and tables. Links are the most numerous tag, which indicates how interconnected articles on Wikipedia are.
# Finding Common Words

In [9]:
def count_words(html, min_length=5, top_n=10):
    """Return the most frequent long words in the text of a piece of markup.

    The text is taken from BeautifulSoup's ``get_text()``, lower-cased, and
    split on runs of non-word characters.

    Args:
        html: Markup to parse with BeautifulSoup's ``html.parser`` backend.
        min_length: Words shorter than this many characters are ignored.
        top_n: Maximum number of words to return.

    Returns:
        A list of ``(word, count)`` tuples as produced by
        ``Counter.most_common(top_n)``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    text = soup.get_text()
    # Raw string: "\W" in an ordinary literal is an invalid escape sequence,
    # which newer Python versions warn about.
    text = re.sub(r"\W+", " ", text.lower())
    # Splitting on single spaces can yield empty strings; the length filter
    # drops them along with short words.
    words = [w for w in text.split(" ") if len(w) >= min_length]
    return Counter(words).most_common(top_n)

start = time.time()
# Leaving the ``with`` block shuts the worker processes down.
with concurrent.futures.ProcessPoolExecutor(max_workers=3) as pool:
    words = list(pool.map(count_words, parsed))

# NOTE: each (word, count) pair adds 1, not ``count``. The result is therefore
# the number of articles in which a word made that article's top list, not the
# word's total number of occurrences.
top_word_tally = Counter(word for wc in words for word, _ in wc)
# Store a plain dict so the cell output renders as a mapping, not a Counter.
word_counts = dict(top_word_tally)
end = time.time()

print(end - start)
word_counts

14.210116147994995


{'county': 66,
 'planets': 2,
 'singing': 2,
 'loved': 1,
 'goran': 1,
 'surface': 2,
 'sweden': 2,
 'gregory': 2,
 'byzantine': 1,
 'burrell': 1,
 '10000': 1,
 'sybra': 1,
 'mcadams': 1,
 'waldau': 1,
 'serge': 1,
 'pastime': 1,
 'indonesia': 1,
 'kingdoms': 1,
 'authentication': 1,
 '01333': 1,
 'january': 27,
 'leake': 1,
 '51333': 1,
 'malay': 1,
 'forewings': 1,
 'allee': 1,
 'gastropoda': 1,
 'incumbent': 1,
 'hertford': 1,
 'dolni': 2,
 'prestonwood': 1,
 'lancashire': 1,
 'rheumatoid': 1,
 'groves': 1,
 '48056': 1,
 'darts': 1,
 'lille': 1,
 'sites': 3,
 'funding': 1,
 'creek': 5,
 'elite': 1,
 'submerged': 1,
 'ishikari': 1,
 'faircross': 1,
 'triathlon': 1,
 'aires': 1,
 'garza': 1,
 'ministers': 1,
 'maneswar': 1,
 'teiji': 1,
 '354842': 1,
 'issue': 1,
 'richard': 5,
 'conservative': 3,
 'genus': 10,
 'blundell': 1,
 'kapoor': 2,
 'mucci': 1,
 'arroyito': 1,
 'observatory': 1,
 'argentine': 1,
 'performed': 2,
 'brazil': 2,
 'fuchs': 1,
 'discussion': 1,
 'pozoblanco': 1,
 

Selecting only the top 10 words from each article speeds things up quite a bit.