# 1.4 - External Data Collection - web-scraping interest ontology from wikipedia.org

In [2]:
from urllib.parse import unquote

import bs4, requests, wikipediaapi
from bs4 import BeautifulSoup
import pandas as pd

## 1. Download Outline - Academic Disciplines

- #### 1.1 Store in flat table, STOP after 5th Level

    *Hack Approach*:
    - Pre-set levels (1-Category (h2), 2-Sub-category (h3), 3-Topic (h4), 4-Sub-Topic, 5-Sub-Topic, Page)
    - One row for each page (including header rows)
    - If a list item (ul, li) contains a nested list, go one level deeper
    

- #### 1.2 Clean unnecessary cols

In [3]:
#Digest Wikipedia Academic Disciplines Outline Script ####################################################################################################

#HELPER FX  #####################################################################
def get_header_name_page(el, blacklist=()):
    """Extract the display name and article link(s) of a section header.

    Parameters:
        el: header element (h2/h3/h4 tag) that is searched for a
            ``span`` with class ``mw-headline``.
        blacklist: headline texts to skip (non-category headers).

    Returns:
        A ``(name, page_href)`` tuple.
        - ``('', '')`` when the header has no headline span or its text is
          blacklisted.
        - ``page_href`` is the ``href`` string of the link inside the headline
          when there is one.
        - Otherwise ``page_href`` is a list with the hrefs of every link found
          in the next ``div`` after the headline (empty list when there is no
          such div or it holds no usable links).
    """
    name = ''
    page_href = ''

    span = el.find('span', class_='mw-headline')
    if span is None or span.text in blacklist:  # skip non-category headers
        return name, page_href

    name = span.text
    a = span.find('a')
    if a is not None:
        # the headline itself links to the article
        page_href = a.get('href', '')
    else:
        # grab article ref(s) from the div that follows the headline
        # NOTE(review): presumably this div is the "main article" note; if a
        # header has no such note the next div may be something else - confirm.
        following_div = span.find_next('div')
        candidate_articles = following_div.find_all('a') if following_div is not None else []
        # anchors without an href (e.g. named anchors) carry no page reference
        page_href = [article['href'] for article in candidate_articles if article.get('href')]
    return name, page_href

def get_page_href(li):
    """Return the article link(s) of one bulleted outline item.

    Parameters:
        li: list-item element whose anchors are inspected.

    Returns:
        - a list of hrefs when the item has more than one anchor as a direct
          child (e.g. an article link plus its "(outline)" link);
        - otherwise the href string of the first anchor found anywhere inside
          the item;
        - ``''`` when the item contains no anchor at all (previously this
          raised ``TypeError``).
    """
    direct_links = li.find_all('a', recursive=False)
    if len(direct_links) > 1:
        # skip anchors that carry no href instead of raising KeyError
        return [article['href'] for article in direct_links if article.get('href')]

    first_link = li.find('a')
    if first_link is None:
        # plain-text bullet without any link
        return ''
    return first_link.get('href', '')
        
def get_page_text(li):
    """Return the display text of one bulleted outline item.

    Parameters:
        li: list-item element whose text/anchors are inspected.

    Returns:
        - when the item has more than one anchor as a direct child, the first
          line of the item's full text with the literal ``(outline)`` removed;
        - otherwise the text of the first anchor found inside the item;
        - when the item contains no anchor at all, the first line of the
          item's own text (previously this raised ``AttributeError``).
    """
    direct_links = li.find_all('a', recursive=False)
    first_link = li.find('a')

    if len(direct_links) > 1 or first_link is None:
        # li.text also includes nested sub-lists on following lines; keep only
        # the first line, which belongs to this item
        return li.text.replace('(outline)', '').split('\n')[0]

    return first_link.text

#################################################################################

wikiPagePaths = []

# Section headers that are not part of the discipline outline
SKIPPED_SECTIONS = ['See also', 'References', 'External links']

#0. Get outline
url = "https://en.wikipedia.org/wiki/Outline_of_academic_disciplines"
r = requests.get(url, timeout=30)
r.raise_for_status()  # fail loudly instead of silently parsing an error page
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and to avoid the "no parser specified" warning).
soup = BeautifulSoup(r.content, 'html.parser')

#1 - start unpacking at first category (h2)
# The class filter has to be part of the CSS selector; a class_ keyword passed
# to select() does not restrict the match.
# NOTE(review): this relies on headings being rendered as <h2><span class="mw-headline">;
# confirm the live page still uses that markup.
first_headline = soup.select_one("h2 span.mw-headline")
if first_headline is None:
    raise RuntimeError("No 'h2 span.mw-headline' element found - the page layout is not the expected one")
start_of_outline = first_headline.find_parent('h2')
outline = [start_of_outline]
outline.extend(list(start_of_outline.next_siblings))

category_h2_1 = sub_category_h3_2 = topic_3 = subtopic_4 = subtopic_5 = ''


def _record_path(page_href):
    """Print and store one outline row built from the current level names.

    Reads the module-level level variables (category_h2_1 ... subtopic_5) at
    call time, so it must be called after they have been updated.
    """
    row = [category_h2_1, sub_category_h3_2, topic_3, subtopic_4, subtopic_5, page_href]
    print(*row)
    wikiPagePaths.append(row)


print('####Downloading Outline of Academic Disciplines####','\n Key = 1-Category (h2), 2-Sub-category (h3), 3-Topic, 4-Sub-Topic, 5-Sub-Topic, Page \n' )

#2 - grab key outline elements (h2, h3, h4, and bulleted subtopics)
for el in outline:
    # next_siblings also yields text nodes between tags; only tags are relevant
    if not isinstance(el, bs4.element.Tag):
        continue

    #2.1 - grab header elements (h2, h3, h4)
    if el.name == 'h2':
        #skip blacklisted elements (and h2 tags without a headline span)
        headline = el.find('span', class_='mw-headline')
        if headline is None or headline.text in SKIPPED_SECTIONS:
            continue

        category_h2_1, page_href = get_header_name_page(el)        #get category metadata
        sub_category_h3_2 = topic_3 = subtopic_4 = subtopic_5 = '' #clear deeper levels from the previous category
        _record_path(page_href)

    elif el.name == 'h3':
        sub_category_h3_2, page_href = get_header_name_page(el) #get subcategory metadata
        topic_3 = subtopic_4 = subtopic_5 = ''                  #clear deeper levels from the previous subcategory
        _record_path(page_href)

    elif el.name == 'h4':
        topic_3, page_href = get_header_name_page(el) #get topic metadata
        subtopic_4 = subtopic_5 = ''                  #clear deeper levels from the previous topic
        _record_path(page_href)

    #2.2 - grab nested bulleted lists of topics and subtopics
    # el.get('class') is None for a div without a class attribute
    elif el.name == 'div' and 'div-col' in (el.get('class') or []):
        top_level_list = el.find('ul')
        if top_level_list is None:
            continue

        for item in top_level_list:
            # children of a <ul> include whitespace text nodes; skip them
            if not isinstance(item, bs4.element.Tag):
                continue

            subtopic_4 = get_page_text(item)
            subtopic_5 = ''                  #clear sub-subtopic from the previous item
            _record_path(get_page_href(item))

            # STOP after the 5th level: only one nested list is unpacked
            nested_list = item.find('ul')
            if nested_list is None:
                continue
            for sub_item in nested_list:
                if isinstance(sub_item, bs4.element.Tag):
                    subtopic_5 = get_page_text(sub_item)
                    _record_path(get_page_href(sub_item))

wikiPagePathsDF = pd.DataFrame(wikiPagePaths, columns=['category_h2_1', 'sub_category_h3_2', 'topic_3', 'subtopic_4', 'subtopic_5', 'page_href'])

####Downloading Outline of Academic Disciplines#### 
 Key = 1-Category (h2), 2-Sub-category (h3), 3-Topic, 4-Sub-Topic, 5-Sub-Topic, Page 

Humanities     /wiki/Humanities
Humanities Arts    /wiki/The_arts
Humanities Arts Performing arts   /wiki/Performing_arts
Humanities Arts Performing arts Music   ['/wiki/Music', '/wiki/Outline_of_music']
Humanities Arts Performing arts Music  Accompanying /wiki/Accompaniment
Humanities Arts Performing arts Music  Chamber music /wiki/Chamber_music
Humanities Arts Performing arts Music  Church music /wiki/Church_music
Humanities Arts Performing arts Music  Conducting /wiki/Conducting
Humanities Arts Performing arts Music  Early music /wiki/Early_music
Humanities Arts Performing arts Music  Jazz studies  ['/wiki/Jazz', '/wiki/Outline_of_jazz']
Humanities Arts Performing arts Music  Musical composition /wiki/Musical_composition
Humanities Arts Performing arts Music  Music education /wiki/Music_education
Humanities Arts Performing arts Music  Music hist

In [4]:
#Inspect output - display the last rows of the scraped outline table as a sanity check
wikiPagePathsDF.tail()

Unnamed: 0,category_h2_1,sub_category_h3_2,topic_3,subtopic_4,subtopic_5,page_href
980,Applied Sciences,Medicine and health,,Surgery,Traumatology,/wiki/Traumatology
981,Applied Sciences,Medicine and health,,Traditional medicine,,/wiki/Traditional_medicine
982,Applied Sciences,Medicine and health,,Urology,,/wiki/Urology
983,Applied Sciences,Medicine and health,,Urology,Andrology,/wiki/Andrology
984,Applied Sciences,Medicine and health,,Veterinary medicine,,/wiki/Veterinary_medicine


## 2. Crawl Outline and Download Wiki articles

- ### 2.1 Download article text from outline

In [5]:
#2.1.0 - Setup scraper through API
# NOTE(review): some wikipediaapi releases expect a user_agent argument here -
# confirm against the installed version.
wiki_wiki = wikipediaapi.Wikipedia(
            language='en',
            extract_format=wikipediaapi.ExtractFormat.WIKI
    )


def _href_to_title(href):
    """Turn a scraped href such as '/wiki/Women%27s_studies#History' into a page title.

    Drops any '#fragment', removes the leading '/wiki/' and decodes
    percent-escapes, because hrefs taken from HTML may be URL-encoded while the
    API is queried by page title.
    """
    title = href.split('#', 1)[0]
    if title.startswith('/wiki/'):
        title = title[len('/wiki/'):]
    return unquote(title)


def _fetch_article_text(href):
    """Return the article text for one href, or '' when the href yields no title."""
    title = _href_to_title(href)
    if not title:
        # nothing to look up (e.g. header row without a link) - skip the API call
        return ''
    return wiki_wiki.page(title).text


print('####Downloading Article Text ####')

# For article in wikiPagePathsDF
page_texts = []
for article in wikiPagePathsDF.itertuples():
    print(article.page_href)

    #2.1.1 Request page(s) - a row may reference a single href or a list of hrefs
    hrefs = article.page_href if isinstance(article.page_href, list) else [article.page_href]

    parts = []
    for href in hrefs:
        part = _fetch_article_text(href)
        print(part[0:100])
        parts.append(part)

    # texts of multiple pages are concatenated without a separator
    page_texts.append(''.join(parts))

#2.1.2 add as a column in wikiPagePathsDF - keep everything, pre-process in eda stage
# (assigned once after the loop instead of cell by cell while iterating)
wikiPagePathsDF['page_text'] = page_texts

####Downloading Article Text ####
/wiki/Humanities
Humanities are academic disciplines that study aspects of human society and culture. In the Renaissa
/wiki/The_arts
The arts refers to the theory and physical expression of creativity found in human societies and cul
/wiki/Performing_arts
Performing arts are a form of art in which artists use their voices, bodies or inanimate objects to 
['/wiki/Music', '/wiki/Outline_of_music']
Music is an art form and cultural activity whose medium is sound organized in time. General definiti
The following outline is provided as an overview of and topical guide to music:
Music – human expres
/wiki/Accompaniment
Accompaniment is the musical part which provides the rhythmic and/or harmonic support for the melody
/wiki/Chamber_music
Chamber music is a form of classical music that is composed for a small group of instruments—traditi
/wiki/Church_music
Church music is music written for performance in church, or any musical setting of ecclesiastical li
/wi

## 3. Write wikiPagePathsDF to CSV

In [6]:
# Display the last rows of the outline table to check its current contents
wikiPagePathsDF.tail()

Unnamed: 0,category_h2_1,sub_category_h3_2,topic_3,subtopic_4,subtopic_5,page_href,page_text
980,Applied Sciences,Medicine and health,,Surgery,Traumatology,/wiki/Traumatology,"In medicine, traumatology (from Greek trauma, ..."
981,Applied Sciences,Medicine and health,,Traditional medicine,,/wiki/Traditional_medicine,Traditional medicine (also known as indigenous...
982,Applied Sciences,Medicine and health,,Urology,,/wiki/Urology,"Urology (from Greek οὖρον ouron ""urine"" and -λ..."
983,Applied Sciences,Medicine and health,,Urology,Andrology,/wiki/Andrology,"Andrology (from Ancient Greek: ἀνήρ, anēr, gen..."
984,Applied Sciences,Medicine and health,,Veterinary medicine,,/wiki/Veterinary_medicine,Veterinary medicine is the branch of medicine ...


In [7]:
#Fix no topic 3 issue
# Rows with an empty topic_3 get their deeper levels shifted one level up:
# subtopic_4 -> topic_3, subtopic_5 -> subtopic_4, and subtopic_5 is cleared.
# Done with boolean masks on a copy instead of growing a frame row by row.
missing_topic = wikiPagePathsDF['topic_3'] == ''

tempDF = wikiPagePathsDF.copy()
tempDF.loc[missing_topic, 'topic_3'] = wikiPagePathsDF.loc[missing_topic, 'subtopic_4']
tempDF.loc[missing_topic, 'subtopic_4'] = wikiPagePathsDF.loc[missing_topic, 'subtopic_5']
tempDF.loc[missing_topic, 'subtopic_5'] = ''


def _strip_if_str(value):
    """Strip surrounding whitespace from strings; return any other value unchanged."""
    return value.strip() if isinstance(value, str) else value


# Strip element-wise rather than with `.str.strip()`: the .str accessor returns
# NaN for every non-string cell, which would wipe the list-valued page_href
# entries (rows that reference several pages).
tempDF = tempDF.apply(lambda column: column.map(_strip_if_str))

wikiPagePathsDF = tempDF

In [8]:
# Persist the outline table as CSV: the index is not written and missing values
# are written as empty strings. The target directory is given relative to the
# notebook's working directory.
wikiPagePathsDF.to_csv('../../data/raw/articles_wikipedia/academic_outline_wikipedia_pages.csv', index=False, na_rep='')