In [1]:
import wikipedia
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import pandas as pd

In [2]:
# This block of code builds my dataset from articles listed as "Vital Articles" on Wikipedia.
# I chose this as my dataset because I can get both a long block of text and an official summary for building 
#     a Natural Language Processor focused on long text Summarization.


# To ensure I am scraping my data from high quality wikipedia pages, 
#     I am scraping the topics from the Vital Articles wiki page.
def topicfinder():
    data = requests.get("https://en.wikipedia.org/wiki/Wikipedia:Vital_articles")
    soup = BeautifulSoup(data.text,'html.parser')

    content = soup.find_all(['tbody'])
    
    topics=[]
    for subject in content:
        for rawtop in subject.find_all('li'):
            topic = rawtop.text
            topic = re.sub(r'^[ \t]+','',topic)
            topics.append(topic)
            
    # Unfortunately, the now constructed list of topics is far from perfect, and needs futher tedious cleaning.
    # So, this is all to clean up the messy topic list.
    for i in range(len(topics)):
        topics[i] = re.sub(r'\([^)]*\)', '', topics[i])
        if re.search('\n', topics[i])!=None:
            topics[i] = topics[i].split('\n ')
    for i in range(len(topics)):
        if type(topics[i])==list:
            popper = topics.pop(i)
            for i in range(len(popper)):
                topics.append(popper.pop())
    for i in range(len(topics)):
        if type(topics[i])!=list:
            topics[i] = re.sub(r'\([^)]*\)', '', topics[i])
            if re.search('\n', topics[i])!=None:
                topics[i] = topics[i].split('\n ')
    for i in range(len(topics)):
        if type(topics[i])==list:
            popper = topics.pop(i)
            for i in range(len(popper)):
                topics.append(popper.pop())
    for i in range(len(topics)):
        if type(topics[i])!=list:
            topics[i] = re.sub(r'\([^)]*\)', '', topics[i])
    
    # It's not the cleanest... dropping anything remaing that isn't a string
    def listDeleter(topics):
        x = 0
        for i in topics:
            if type(i)!=str:
                x+=1
        while x>0:
            for i in topics:
                if type(i)!=str:
                    topics.remove(i)
                    x-=1
    listDeleter(topics)
    
    # Return the nice and neat list of topics.
    return topics


# This function takes a single topic, scrapes the associated Wikipedia page, 
#      and returns a list of the headings and associated paragraps in order.
def scrapeAndClean(topic):
    
    # Turn the topic into a url for scraping
    choice = re.sub(r' ','_',topic)
    url = 'https://en.wikipedia.org/wiki/'+choice

    # Fetch and scrape url
    data = requests.get(url)
    soup = BeautifulSoup(data.text,'html.parser')

    # Extract the text content from paragraphs and headers
    textcontainer=[]
    for head in soup.find_all('span', attrs={'mw-headline'}):
        head.name = 'titular'
    content = soup.find_all(['titular','p'])
    
    # Interlay the paragraphs and titles in order
    for paragraph in content:
        if paragraph.name != 'p':
            textblock = [paragraph.text]
            textcontainer.append(textblock)   
        else:
            textcontainer.append(paragraph.text)
            
    # Clean up the text in the list of soup, return list of paragraphs
    paras = []
    txt =""""""
    for textblock in textcontainer:
        if type(textblock)==str:
            textblock = re.sub(r'\[.*?\]', '', textblock)
            textblock = re.sub(r'\n',' ',textblock)
            txt = txt+textblock
        else:
            paras.append(txt)
            txt=""""""
            paras.append(textblock)
    return paras


# Smooth the list of paragraphs into a single text block.
# I want to be able to analyze the text in ordered chunks and as a whole text blob.
def toText(paras):
    text = ''''''
    for i in paras:
        if type(i) == str:
            text = text+i
        else:
            text = text + i[:][0]+'.'
    return text


# This function excecutes the above functions, and builds a dataframe accordingly.
# Topic column holds the input topic.
# TextSmooth column holds the scraped text as a single string.
# TextBroken column holds the scraped text as a list of paragraphs.
# WikiSummary column holds the official, fetched, wikipedia summary of the page -- this will be my Y variable.
def dfBuilder(topics):
    df = pd.DataFrame(columns=['Topic','TextSmooth','TextBroken','WikiSummary'])
    for topic in topics:
        try:
            wikipedia.summary(topic, auto_suggest=False)
        except:
            continue
        else:
            summary = scrapeAndClean(topic)
            dfrow = pd.DataFrame({'Topic': [topic],
                                  'TextSmooth': [toText(summary)],
                                  'TextBroken': [summary],
                                  'WikiSummary': [wikipedia.summary(topic, auto_suggest=False)]
                                 })
            df = pd.concat([pd.DataFrame(dfrow),df.loc[:]], ignore_index=True)
    return df

In [3]:
# Run the topicfinder function and save the resulting list in the 'topics' variable.
topics=topicfinder()

In [4]:
# Run the dfBuilder and save the returned dataframe as df.
df = dfBuilder(topics)



  lis = BeautifulSoup(html).find_all('li')


In [7]:
df.to_csv('wikiSumDF.csv', index=False)

In [6]:
df.head(10)

Unnamed: 0,Topic,TextSmooth,TextBroken,WikiSummary
0,Angle,"In Euclidean geometry, an angle is the figure ...","[In Euclidean geometry, an angle is the figure...","In Euclidean geometry, an angle is the figure ..."
1,Trigonometry,Trigonometry (from Ancient Greek τρίγωνον (t...,[ Trigonometry (from Ancient Greek τρίγωνον (...,Trigonometry (from Ancient Greek τρίγωνον (tr...
2,Real number,"In mathematics, a real number is a number that...","[In mathematics, a real number is a number tha...","In mathematics, a real number is a number that..."
3,e,"E, or e, is the fifth letter and the second v...","[ E, or e, is the fifth letter and the second ...","E, or e, is the fifth letter and the second vo..."
4,Fraction,"A fraction (from Latin: fractus, ""broken"") re...","[ A fraction (from Latin: fractus, ""broken"") r...","A fraction (from Latin: fractus, ""broken"") rep..."
5,Integer,"An integer is the number zero (0), a positive...","[ An integer is the number zero (0), a positiv...","An integer is the number zero (0), a positive ..."
6,0,0 (zero) is a number representing an empty q...,[ 0 (zero) is a number representing an empty ...,0 (zero) is a number representing an empty qua...
7,Natural number,"In mathematics, the natural numbers are those...","[ In mathematics, the natural numbers are thos...","In mathematics, the natural numbers are those ..."
8,Prime number,A prime number (or a prime) is a natural num...,[ A prime number (or a prime) is a natural nu...,A prime number (or a prime) is a natural numbe...
9,Alloy,An alloy is a mixture of chemical elements of ...,[An alloy is a mixture of chemical elements of...,An alloy is a mixture of chemical elements of ...
