In [11]:
## Code to extract example sentences from Wikipedia articles
## James Fodor 2022
## Python 3.8

import numpy as np
import pandas as pd
import csv
import nltk
import re
import wikipedia

# File path for where vocab file is stored (Windows path).
# Backslashes are escaped explicitly: the previous literal relied on invalid escape
# sequences, which newer Python versions warn about. The resulting string is unchanged.
# The trailing separator is intentional: other cells append relative paths directly.
path_base = 'D:\\Study and Projects\\School Work\\Year 25 - PhD 1\\Data\\'

# Get wordnet to work
from nltk.data import path # need to specify the location of the nltk data
# path_base already ends with a separator, so no leading backslash is added here;
# the remaining backslashes are escaped so the literal contains no invalid escapes.
path.append(path_base+"Frames and Structured Data\\FrameNet\\nltk_data")

### Functions for loading wiki articles

In [3]:
## Key functions

# Check to see if given wiki article can be loaded, needed to avoid crashes for loading non-existent articles
def check_wiki_article(title, printing=False):
    """ string -> bool
    Returns True if the article corresponding to the inputted title can be loaded, False if not.

    Up to three title variants are tried in order, stopping at the first that loads:
    the title as given, the title with 's' appended, and the title with its last
    character repeated. When printing is truthy, a short preview of the loaded
    article is printed (or 'not found' if every variant fails).
    """
    # Variants are built lazily inside the try block so that a title which cannot
    # be extended (e.g. an empty string has no last character) counts as a failed
    # attempt rather than raising.
    title_variants = (
        lambda: title,            # the title exactly as given
        lambda: title+'s',        # plural sometimes works
        lambda: title+title[-1],  # sometimes this works for some reason
    )

    for attempt, build_title in enumerate(title_variants):
        try:
            candidate = build_title()
            article = wikipedia.page(candidate) # load the wiki article
            if printing:
                print('Loaded: '+article.title)
                if attempt > 0: # only report the title when it differs from the input
                    print('Title used: '+candidate)
                print(article.content[0:100]) # show preview of article
            return True
        except Exception:
            # A failed lookup just means this variant is unavailable. Catching
            # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
            continue

    if printing:
        print('not found')
    return False


# Load plain text of single wiki article
def load_wiki_article(article_title):
    """ string -> list
    Loads the wikipedia article corresponding to the given title, returning its content as a list of sentences.

    The title is tried as given and then with 's' appended. If neither variant can
    be loaded and split, '<title> not found' is printed and an empty list is returned.
    """
    # Variants are built lazily inside the try block so a failure to build one
    # is treated like a failed lookup.
    title_variants = (
        lambda: article_title,
        lambda: article_title+'s', # plural sometimes works
    )

    for build_title in title_variants:
        try:
            article = wikipedia.page(build_title()) # load the wiki article
            return split_to_list(article)
        except Exception:
            # Best-effort: any ordinary failure moves on to the next variant.
            # KeyboardInterrupt/SystemExit are deliberately not caught, so a long
            # batch run can still be stopped.
            continue

    print(str(article_title)+' not found')
    return [] # return empty list


# Split article content into list of one sentence per line
def split_to_list(article, min_sentence_len=50):
    """ article_object -> list
    Takes a wikipedia article object and extracts the contents as text, splitting to one sentence per line
    and removing some irrelevant punctuation and short sentences. Returns a list of sentences.

    article must expose .content (the text to split) and .title (used in the summary print).
    min_sentence_len is the minimum number of characters a line needs in order to be kept.
    A summary line with the article title and the number of kept sentences is printed.
    """
    raw_sentences = nltk.sent_tokenize(article.content, language="english") # split article into sentences
    abbreviations = ('i.e.', 'e.g.') # the tokenizer may break a sentence after these
    sentences_final = []
    already_merged = False # True when the current sentence was appended to the previous one

    # Process by sentence
    for index, sentence in enumerate(raw_sentences):
        if already_merged: # this sentence was already joined onto the previous one
            already_merged = False
            continue

        # If the sentence ends in an abbreviation, combine it with the next one.
        # The bounds check avoids an IndexError when it is the last sentence.
        if sentence.endswith(abbreviations) and index+1 < len(raw_sentences):
            sentence = sentence+' '+raw_sentences[index+1]
            already_merged = True

        # Processing of article text to remove metadata and irrelevant lines
        # Non-greedy so that text between two separate bracketed spans is kept.
        sentence = re.sub(r'\[.+?\]', '', sentence) # remove anything in square brackets (mostly the pronunciation guide)
        sentence = re.sub(r'(\W);', r'\1', sentence) # drop a semicolon that follows a non-word character
        sentence = re.sub(r'([a-z]{2,}\.)([A-Z][a-z])', r'\1\n\2', sentence) # split the weird sentences with .New format
        sentence = re.sub(r':\s(\d|,|\s|–){2,}', '', sentence) # remove lingering page numbers

        for line in sentence.split('\n'): # split multi-line paragraphs
            if not line: # remove blank lines
                continue
            if line[0] == '=': # remove headings
                continue
            if len(line) < min_sentence_len: # remove very short lines
                continue
            if line[-1] != '.': # must end with full stop
                continue
            if 'ISBN ' in line: # ignore lines with ISBNs (including at the very start)
                continue
            sentences_final.append(line.replace('"', '')) # remove quotation marks

    print('Loaded: '+article.title+', Sentences: '+str(len(sentences_final))) # number of sentences
    return sentences_final


# Save list of sentences from a given article to a file
def save_sentences(sentences_list, filename, path):
    """ list_str, str, str -> None
    Saves a list of sentences to a specified filename.

    The target is path+filename+'.txt', so path must already end with a separator.
    The file is opened in append mode with UTF-8 encoding, one sentence per line;
    calling this again with the same target adds to the existing contents.
    """
    save_path = path+filename+'.txt'
    # The context manager closes the file even if a write fails part-way.
    with open(save_path, "a", encoding='utf-8') as save_file:
        for sentence in sentences_list:
            save_file.write(sentence+'\n')

### Load list of articles and save the sentences to file

In [4]:
# Load list of wikipedia articles to use
# Backslashes are escaped so the literal contains no invalid escape sequences;
# the resulting path is the same as before.
titles_file = path_base+'Corpus Data\\Wikipedia 10k corpus\\article_list.txt'
# The file has no header row; the first column is used as the index, and
# QUOTE_NONE keeps quote characters inside titles as literal text.
article_titles_pd = pd.read_table(titles_file, index_col=0, header=None, quoting=csv.QUOTE_NONE, skip_blank_lines=True)
article_titles_list = article_titles_pd.index.values # the titles are the index values
print('List of '+str(len(article_titles_list))+' articles loaded')

List of 10001 articles loaded


In [13]:
# Trial loading and printing article
article_title = 'chemistry'
if check_wiki_article(article_title):
    article_content = load_wiki_article(article_title)
else:
    print('not found')
    # Keep article_content defined (and not stale from an earlier run) so later
    # cells behave the same on a fresh kernel; an empty list matches what
    # load_wiki_article returns for a missing article.
    article_content = []

Loaded: Chemistry, Sentences: 260


In [None]:
# Load plain text of wiki articles from the list and save sentences to file
# NOTE: only the first five titles are processed here; widen the slice for a full run.
# NOTE: save_sentences opens the file in append mode, so re-running this cell adds
# the same sentences to full_corpus.txt again.
# path_base already ends with a separator, so the folder name is appended directly
# using the same separator style as the rest of the path.
save_path = path_base+'Corpus Data\\'
for article_title in article_titles_list[0:5]:
    sentences_list = load_wiki_article(article_title)
    save_sentences(sentences_list, 'full_corpus', save_path)