## Web Scraping Example: Optimized Learning with Knowledge Graphs
- finds and orders prerequisite knowledge for a given source
- Collects linked references from given Wiki page with web scraping

In [1]:
# get libraries for html get request, bs4 for parsing web to json, 
# pandas for df's, and re for regexes (see below for more regex details)
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [2]:
# choose any wiki page as the start node: let's use a relevant topic of the sort of thing 
# we're trying to emulate, get data from that URL
gkg_page = requests.get('https://en.wikipedia.org/wiki/Google_Knowledge_Graph')
# create a soup object with the html parser
gkg_soup = BeautifulSoup(gkg_page.content, 'html.parser')

![after inspecting a wiki page, I found that the tag mw-content-text is the main text area w/ links](wiki_main_scrape.png)

In [3]:
# from the soup, get the first div from the main body with id 'bodyContent'
# from bodyContent, get nested div with id 'mw-content-text'
gkg_body = gkg_soup.find('div', {'id': 'bodyContent'}).find('div', {'id': 'mw-content-text'})
# IMPORTANT FOR TASK: find all 'a' tags from this page, these are our links
gkg_fa = gkg_body.find_all('a', href=True)
# use a list comprehension (fancy one-liner loop syntax) to get the link (the 'href') from each a tag
gkg_links = [a['href'] for a in gkg_fa]
gkg_links[:15]

['/wiki/Knowledge_graph',
 '/wiki/File:Google_Knowledge_Panel.png',
 '/wiki/File:Google_Knowledge_Panel.png',
 '/wiki/Thomas_Jefferson',
 '/wiki/Google_Search',
 '/wiki/Knowledge_base',
 '/wiki/Google',
 '/wiki/Google_Search',
 '/wiki/Infobox',
 '/wiki/Search_engine_results_page',
 '#cite_note-:0-1',
 '#cite_note-2',
 '#cite_note-3',
 '#cite_note-4',
 '#cite_note-5']

In [4]:
# note that our links has a bunch of things that we don't want, like external links and others
# filter out the links that don't go to other wiki pages, match regex for wiki endpoint 
gkg_labels_all = list(filter(re.compile('^/wiki/').match, gkg_links))
# exclude files, specials, templates, etc Thing:resource
gkg_labels = list(set(gkg_labels_all) - set(filter(re.compile('.*:.*').match, gkg_labels_all)))
gkg_labels

['/wiki/Google_Get_Your_Business_Online',
 '/wiki/WebM',
 '/wiki/PageRank_algorithm_in_biochemistry',
 '/wiki/Wolfram_Alpha',
 '/wiki/Google_Web_Designer',
 '/wiki/Android_Beam',
 '/wiki/Google_The_Thinking_Factory',
 '/wiki/Area_120',
 '/wiki/Googlefight',
 '/wiki/Google%2B',
 '/wiki/Waze',
 '/wiki/Pixel_3',
 '/wiki/Google_Store',
 '/wiki/AV1',
 '/wiki/Axel_Springer_SE',
 '/wiki/Google_Analytics',
 '/wiki/Urchin_(software)',
 '/wiki/The_Verge',
 '/wiki/Tensor_Processing_Unit',
 '/wiki/Computability',
 '/wiki/Google_Catalogs',
 '/wiki/Kaggle',
 '/wiki/Android_Studio',
 '/wiki/Google_WiFi',
 '/wiki/Google_Helpouts',
 '/wiki/Nexus_4',
 '/wiki/Google_Search_Appliance',
 '/wiki/Business_Insider',
 '/wiki/DeGoogle',
 '/wiki/AngularJS',
 '/wiki/Google_Brain',
 '/wiki/Google_Developer_Day',
 '/wiki/Google_Chart_API',
 '/wiki/GYP_(software)',
 '/wiki/Neotonic_Software',
 '/wiki/Google_Fit',
 '/wiki/Pixel_4',
 '/wiki/Where_on_Google_Earth_is_Carmen_Sandiego%3F',
 '/wiki/Google_Drawings',
 '/wik

In [5]:
def get_wiki_links(partial_url, nodes_neighbors, search_depth):
    """
    Collects links from the given Wikipedia page, stores them as a list of outgoing edges for this page's node.
    TODO Recursively calls itself on each page found, decrementing search depth each round until depth is zero.
    
    Parameters:
    -----------
    partial_url (str) : string of the form /wiki/Name_of_article for the target article
    search_depth (int) : number of search rounds remaining in recursion
    nodes_neighbors (dict) : existing dictionary of {page: [list, of, pages, linked]) to which we add
    
    Returns:
    --------
    nodes_neighbors (dict) : dict including edges for this page, used for directed graph in neo4j
    
    """
    # get data from specified wiki page and use bs4 to parse it
    page = requests.get(f'https://en.wikipedia.org{partial_url}')
    soup = BeautifulSoup(page.content, 'html.parser')
    # get main body text content from nested mw-content-text div
    body = soup.find("div", {"id": "bodyContent"}).find("div", {"id": "mw-content-text"})
    # get all a tags and extract link from each
    fa = body.find_all('a', href=True)
    links = [a['href'] for a in fa]
    # filter out the links that don't go to other wiki pages, match regex for wiki endpoint 
    labels_all = list(filter(re.compile("^/wiki/").match, links))
    # exclude files, specials, templates, etc. subtracts strings of format Type:resource
    labels = list(set(labels_all) - set(filter(re.compile(".*:.*").match, labels_all)))
    # add labels to dict entry for this node
    nodes_neighbors[partial_url] = labels

    return nodes_neighbors
    

In [6]:
gkg_dict = get_wiki_links('/wiki/Agalychnis_callidryas', dict(), 1)
gkg_dict

{'/wiki/Agalychnis_callidryas': ['/wiki/ISSN_(identifier)',
  '/wiki/CITES',
  '/wiki/IUCN_Red_List',
  '/wiki/Agalychnis',
  '/wiki/S2CID_(identifier)',
  '/wiki/Carrion',
  '/wiki/Cercyonis',
  '/wiki/Colombia',
  '/wiki/Edward_Drinker_Cope',
  '/wiki/Tadpole',
  '/wiki/Carnivore',
  '/wiki/Metamorphosis',
  '/wiki/Bromelia',
  '/wiki/Arboreal',
  '/wiki/Ranoidea_chloris',
  '/wiki/Least_Concern',
  '/wiki/IUCN',
  '/wiki/Doi_(identifier)',
  '/wiki/Scale_(map)',
  '/wiki/Wikispecies',
  '/wiki/Phyllomedusinae',
  '/wiki/Rainforest',
  '/wiki/Binomial_nomenclature',
  '/wiki/INaturalist',
  '/wiki/Mexico',
  '/wiki/ISBN_(identifier)',
  '/wiki/Defensive_adaptation',
  '/wiki/Barcode_of_Life_Data_System',
  '/wiki/Central_America',
  '/wiki/Camouflage',
  '/wiki/Amplexus',
  '/wiki/Wikidata',
  '/wiki/Amphibian_Species_of_the_World',
  '/wiki/Deimatic_behaviour',
  '/wiki/Phenotypic_plasticity',
  '/wiki/Phyllomedusidae',
  '/wiki/Epiphyte',
  '/wiki/Animal',
  '/wiki/Chordate',
  '/w

## Generating a graph

In [8]:
# loop over valid wiki page links and call some function on them that collects information IFF they aren't already in the list of added 

# make the above into a function, pass a /wiki/Page_name_from_url and a number of search depth remaining
# that gets decremented for each recursive call

TODO: shape our data so that we can have a json dict of nodes and edges that is interpretable by neo4j!