In [2]:
import os
import time
import requests
from bs4 import BeautifulSoup  # Parsing HTML

In [3]:
base_url = 'https://ftp.acc.umu.se/mirror/wikimedia.org/dumps/enwiki/20200101/'
dump_html = requests.get(base_url).text
soup_dump = BeautifulSoup(dump_html, 'html.parser')

soup_dump.find_all('td', {'class': 'indexcolname'}, limit = 10)[:4]

[<td class="indexcolname"><a href="/mirror/wikimedia.org/dumps/enwiki/">Parent Directory</a></td>,
 <td class="indexcolname"><a href="dumpruninfo.json">dumpruninfo.json</a></td>,
 <td class="indexcolname"><a href="dumpruninfo.txt">dumpruninfo.txt</a></td>,
 <td class="indexcolname"><a href="dumpspecialfiles.json">dumpspecialfiles.json</a></td>]

In [4]:
import bz2
import subprocess

Get all the partitioned file paths for the whole wikipedia dump.

In [5]:
base_path = 'data/wiki_dumps/'

data_paths = []
file_info = []

In [6]:
for file in os.listdir(base_path):
    path = base_path + file
    data_paths.append(path)
    
    file_size = os.stat(path).st_size / 1e6
    file_info.append((file, file_size))

In [7]:
print('Amount of Partitions: ', len(data_paths))

Amount of Partitions:  58


In [8]:
sorted(file_info, key = lambda x: x[1], reverse=True)[:3]

[('enwiki-20200101-pages-articles12.xml-p3926864p5040435.bz2', 412.036518),
 ('enwiki-20200101-pages-articles14.xml-p6197599p7697599.bz2', 408.831179),
 ('enwiki-20200101-pages-articles13.xml-p5040438p6197593.bz2', 405.014561)]

In [9]:
file_path = data_paths[0]
file_path

'data/wiki_dumps/enwiki-20200101-pages-articles3.xml-p88445p200507.bz2'

In [10]:
counter = 0
millions_passed = 0

for line in subprocess.Popen(['bzcat'], stdin = open(file_path), stdout = subprocess.PIPE).stdout:
    counter += 1
    if counter % 1_000_000 == 999_999:
        millions_passed += 1
        print('Millions passed', millions_passed)

print('Total lines: ', counter)

Millions passed 1
Millions passed 2
Millions passed 3
Millions passed 4
Millions passed 5
Millions passed 6
Millions passed 7
Millions passed 8
Millions passed 9
Millions passed 10
Millions passed 11
Millions passed 12
Millions passed 13
Total lines:  13902783


In [27]:
# FIRST: write files into raw_unzipped folder

raw_unzipped_base_path = 'data/raw_unzipped/'
raw_unzipped_paths = []

partition_number = 1
for data_path in data_paths[:4]:
    
    start_time = time.time()
    
    raw_unzipped_path = f'{raw_unzipped_base_path}raw_partition_{partition_number}.txt'
    
    if not os.path.isfile(raw_unzipped_path):  # check if file already exists
        with open(raw_unzipped_path, 'w') as file:
            for i, line in enumerate(subprocess.Popen(['bzcat'], stdin = open(data_path), stdout = subprocess.PIPE).stdout):
                file.write(line.decode('utf-8'))
                if i > 100_000:
                    break
        elapsed_time = time.time() - start_time
        print('Raw partition no. {:2d} was finished in {:06.2f} seconds.'.format(partition_number, elapsed_time))
    else: 
        print('Raw partition no. {:2d} has already been created.'.format(partition_number))
    
    raw_unzipped_paths.append(raw_unzipped_path)
    partition_number += 1

Raw partition no.  1 has already been created.
Raw partition no.  2 has already been created.
Raw partition no.  3 has already been created.
Raw partition no.  4 has already been created.


In [29]:
raw_unzipped_paths

['data/raw_unzipped/raw_partition_1.txt',
 'data/raw_unzipped/raw_partition_2.txt',
 'data/raw_unzipped/raw_partition_3.txt',
 'data/raw_unzipped/raw_partition_4.txt']

In [None]:
with open(raw_unzipped_paths[3], 'r') as file:
    for i, line in enumerate(file):
        print(line)
        
        if i > 50:
            break

In [33]:
import xml.sax

class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """Content handler for Wiki XML data using SAX"""
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._current_tag = None
        self._pages = []

    def characters(self, content):
        """Characters between opening and closing tags"""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Opening tag of element"""
        if name in ('title', 'text', 'id'):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Closing tag of element"""
        if name == self._current_tag:
            self._values[name] = ' '.join(self._buffer)

        if name == 'page':
            self._pages.append((self._values['title'], self._values['text'], self._values['id']))

In [34]:
import json

In [45]:
import mwparserfromhell

xml_parsed_base_path = 'data/xml_parsed/'
xml_parsed_paths = []

partition_number = 1
for rup in raw_unzipped_paths[partition_number - 1:partition_number]:
    
    xml_parsed_path = f'{xml_parsed_base_path}page_xmlp_{partition_number}.jsonl'
    if not os.path.isfile(xml_parsed_path):
    
        # Content handler for Wiki XML
        handler = WikiXmlHandler()
        # Parsing object
        parser = xml.sax.make_parser()
        parser.setContentHandler(handler)

        with open(rup, 'r') as file:

            for line in file:
                parser.feed(line)
            
            
        with open(xml_parsed_path, 'w') as file:
            for page in handler._pages:
                # START
                if page[1][:10].lower().startswith('#redirect'):
                    continue
                    
                title = page[0]
                raw_text = page[1]
                ids = page[2]
                
                wikicode = mwparserfromhell.parse(raw_text)
                
                templates = wikicode.filter_templates()
                for template in templates:
                    
                    if tmp not in wikicode._nodes:
                        continue
                    
                    try:
                        wikicode.remove(template)
                    except:
                        pass
                
                text = wikicode.strip_code().strip()[:1000]

                tmp = {
                    "title": title,
                    "text": text,
                    "id": ids
                }

                json_tmp = json.dumps(tmp)
                file.write(json_tmp + '\n')

        print(f'Finished partition no. {partition_number}')
    else:
        print(f'Partition no. {partition_number} was already created')
    xml_parsed_paths.append(xml_parsed_path)
    partition_number += 1

Partition no. 1 was already created


In [31]:
xml_parsed_paths

['data/xml_parsed/page_xmlp_1.jsonl']

In [19]:
res = None
total_counter = 0
redirect_counter = 0
with open(xml_parsed_paths[0], 'r') as file:
    for i, line in enumerate(file):
        res = json.loads(line)
        title = res['title']
        text = res['text']
        
        total_counter += 1
        if text[:10].lower().startswith('#redirect'):
            redirect_counter +=1
        
#         print(f'{title:50}Text Length: {len(text):4d}\tSnippet: {repr(text)[:80]:80}')
#         if i == 30:
#             break
        print(f'{repr(text)[:150]:150}')
        if i == 10:
            break

print('Redirects', redirect_counter)
print('Percentage', redirect_counter / total_counter)

"'''MHS''' may refer to: \n \n ==Schools== \n *[[Manhattan High School]], Manhattan, Kansas, US \n *[[Marianas High School]], Saipan, CNMI, US \n * [[
"{{Unreferenced stub|auto=yes|date=December 2009}} \n '''PMI''' (Picture Music International) was a division of [[EMI]] that specialised in music vide
"{{Infobox holiday \n |holiday_name  = St Mark's Eve \n |type          =  \n |image         = \n |imagesize     = \n |caption       = \n |official_nam
'{{Infobox mountain \n | name = Mount Sopris West Peak \n | photo = Mtsop.JPG \n | photo_caption = Mount Sopris as viewed from [[Colorado State Highwa
'< div class= " boilerplate metadata vfd "  style= " background-color: #F3F9FF; margin: 0 auto; padding: 0 10px 0 10px; border: 1px solid #AAAAAA; " >
'\'\'\'Henry Preserved Smith\'\'\' (October 12, 1847  & ndash; February 26, 1927) was an American biblical scholar. \n \n Smith was born in [[Troy, Oh
"{{EngvarB|date=October 2017}} \n {{Use dmy dates|date=October 2017}} \n {{More citations need

In [24]:
import mwparserfromhell


wiki_parsed_base_path = 'data/wiki_parsed/'
wiki_parsed_paths = []

partition_number = 1

for xpp in xml_parsed_paths:
    
    wiki_parsed_path = f'{wiki_parsed_base_path}p_wikip_{partition_number}.jsonl'
    
    with open(xpp, 'r') as file:
        
        for i, line in enumerate(file):
            entry = json.loads(line)
            wiki = mwparserfromhell.parse(entry['text'])
            
#             print(entry['title'], '=' * 100)
#             print(wiki.strip_code().strip())
#             [x.remove for x in wiki.filter_templates()]
            if i == 3:
                print('THIS IS THE LINE BEING PRINTED', line)
            print('\n\n\n' + entry['title'], '=' * 100)
            print(wiki.strip_code().strip())
            if i == 30:
                break




MHS may refer to: 
 
 ==Schools== 
 *Manhattan High School, Manhattan, Kansas, US 
 *Marianas High School, Saipan, CNMI, US 
 * Marryatville High School, Adelaide, Australia 
 *Massapequa High School, Massapequa, New York, US 
 *Matthew Humberstone School, Cleethorpes, England 
 *Mauldin High School, Mauldin, South Carolina, US 
 *McMinnville High School, McMinnville, Oregon, US 
 *Melbourne High School, Melbourne, Australia 
 *Marysville High School (disambiguation) 
 
 ==Other uses== 
 *Air Memphis (ICAO code: MHS), an Egyptian airline 
 *Mars Helicopter Scout 
 *Master of Health Science, a graduate degree program 
 *Meadowhall Interchange, a railway station in England, National Rail station code 
 *Message Handling System, a past Novell email protocol  
 *Michigan Humane Society, in animal welfare 
 *Microwave Humidity Sounder, satellite-borne instrument 
 *Military Health System, of US DoD 
 *Modular Handgun System 
 *Montana Historical Society 
 
 ==See also== 
 *Mental health 

In [61]:
with open('extracted_text.txt', 'w') as the_file:
    the_file.write(handler._pages[1][1])
    the_file.write('\n')

In [116]:
# Object for handling xml
file_path = '/Users/christopher/Downloads/enwiki-20200101-pages-articles2.xml-p30304p88444.bz2'
handler = WikiXmlHandler()

# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)

for i, line in enumerate(subprocess.Popen(['bzcat'], 
                         stdin = open(file_path), 
                         stdout = subprocess.PIPE).stdout):
    parser.feed(line)
    
    # Stop when 9 articles have been found
    if len(handler._pages) > 9:
        break
        
print([x[0] for x in handler._pages])

['The X-Files', 'Third World', 'Twin Peaks', 'Thallium', 'Text editor', 'Tennis court', 'The Communist Manifesto', 'Trier', 'Ton', 'Talk (software)']


In [55]:
#print(handler._pages[2][1])

In [69]:
import mwparserfromhell 

# print(handler._pages[1][1])

# Create the wiki article
wiki = mwparserfromhell.parse(handler._pages[1][1], skip_style_tags=True)
wiki2 = mwparserfromhell.parse(handler._pages[1][1])

In [60]:
wiki.strip_code().strip()[:1000]

'During the Cold War, the term \'\'\'Third World\'\'\' referred to the developing countries of Asia, Africa, and Latin America, the nations not aligned with either the First World or the Second World. < ref name= " auto " >  < /ref > < ref >  < /ref >  This usage has become popular, mostly in the western countries, due to the ending of the Cold War. \n \n In the decade following the fall of the Soviet Union and the end of the Cold War in 1991, the term \'\'Third World\'\' was used interchangeably with \'\'developing countries\'\', but the concept has become outdated as it no longer represents the current political or economic state of the world. The three-world model arose during the Cold War to define countries aligned with NATO (the First World), the Eastern Bloc (the Second World, although this term was less used), or neither (the Third World). Strictly speaking,  " Third World "  was a political, rather than an economic, grouping. \n ==Etymology== \n French demographer, anthropolog

In [173]:
res_with_refs = wiki2.strip_code().strip()[:1000]
res_with_refs

'During the Cold War, the term Third World referred to the developing countries of Asia, Africa, and Latin America, the nations not aligned with either the First World or the Second World. < ref name= " auto " >  < /ref > < ref >  < /ref >  This usage has become popular, mostly in the western countries, due to the ending of the Cold War. \n \n In the decade following the fall of the Soviet Union and the end of the Cold War in 1991, the term Third World was used interchangeably with developing countries, but the concept has become outdated as it no longer represents the current political or economic state of the world. The three-world model arose during the Cold War to define countries aligned with NATO (the First World), the Eastern Bloc (the Second World, although this term was less used), or neither (the Third World). Strictly speaking,  " Third World "  was a political, rather than an economic, grouping. \n ==Etymology== \n French demographer, anthropologist and historian Alfred Sau

In [78]:
import re

def cleanhtml(raw_html):
  cleanr = re.compile('<.*?>')
  cleantext = re.sub(cleanr, '', raw_html)
  return cleantext

In [171]:
cleanhtml(res_with_refs)[:1000]

'During the Cold War, the term Third World referred to the developing countries of Asia, Africa, and Latin America, the nations not aligned with either the First World or the Second World.        This usage has become popular, mostly in the western countries, due to the ending of the Cold War. \n \n In the decade following the fall of the Soviet Union and the end of the Cold War in 1991, the term Third World was used interchangeably with developing countries, but the concept has become outdated as it no longer represents the current political or economic state of the world. The three-world model arose during the Cold War to define countries aligned with NATO (the First World), the Eastern Bloc (the Second World, although this term was less used), or neither (the Third World). Strictly speaking,  " Third World "  was a political, rather than an economic, grouping. \n ==Etymology== \n French demographer, anthropologist and historian Alfred Sauvy, in an article published in the French mag

In [161]:
text = """{{About|the television series|the franchise|The X-Files (franchise)|other uses}}
{{short description|American science fiction TV series}}
{{Use mdy dates|date=March 2018}}
{{Infobox television
| show_name = The X-Files
| image = Thexfiles.jpg
| genre = {{Plain list|
* [[Science fiction]]{{sfn|Delsara|2000|p=59}}
* [[Horror fiction|Horror]]{{sfn|Delsara|2000|p=59}}
* [[Drama (film and television)|Drama]]&lt;ref&gt;{{cite web |url=http://www.allmovie.com/movie/the-x-files-season-01-v280868 |work=[[Allmovie]] |publisher=[[Rovi]] |title=The Files: Seasons 01 |author=Cooper, Tracie |accessdate=November 19, 2012}}&lt;/ref&gt;
* [[Mystery fiction|Mystery]]{{sfn|Delsara|2000|p=62}}
* [[Thriller (genre)|Thriller]]{{sfn|Delsara|2000|p=58}}
* [[Supernatural fiction]]&lt;ref&gt;{{cite book|editor1-last=Baugh|editor1-first=Lloyd|editor2-last=Mazza|editor2-first=Giuseppe|editor3-last=Srampickal|editor3-first=Jacob |title=Cross Connections |date=2006 |publisher=[[Pontifical Gregorian University]] |isbn=9788878390614 |page=201}}&lt;/ref&gt;
* [[Occult detective fiction]]}}"""

In [162]:
my_wiki = mwparserfromhell.parse(text)

In [164]:
my_wiki.filter_tags()

['*', '*', '*', '*', '*', '*', '*']