## Notes for myself

In [1]:
# Our jupyter/datascience-notebook Docker container comes with 
# BeautifulSoup4 and requests, both popular libraries!
from bs4 import BeautifulSoup
import requests

In [2]:
# Page that parse_bricks is pointed at further down in this notebook.
START_URL = 'https://brickset.com/sets/year-2016'

In [38]:
# page = requests.get(START_URL)

# In order to work with web data, we’re going to want to access the text-based content of web files. 
# We can read the content of the server’s response with page.text 
# (or page.content if we would like to access the response in bytes).

In [14]:
# page.text

# In the next section, we can leverage the Beautiful Soup module to work with this textual data in a more 
# human-friendly manner.

In [91]:
# soup = BeautifulSoup(page.text, 'html.parser')

# Doing this gives us a parse tree for the page, built by running Python’s 
# built-in html.parser over the HTML.

In [None]:
# print(soup.prettify())

# Shows one tag per line; the tags are nested because of the tree structure used by Beautiful Soup.

## Solutions

In [3]:
# titles = []
# for i in range(0, len(chunk)):
#     if i % 2 == 0:
#         titles.append(soup.find_all('h1')[i].get_text())
# print(titles) 

In [27]:
# metas = soup.findAll("div",class_="meta")
# titles = []
# for meta in metas:
#     titles.append(meta.h1.text) 
# print(titles)

## Function

In [5]:
def get_titles(soup):
    """Return the list of titles found on a parsed page.

    Args:
        soup: The parsed document, as returned by Beautiful Soup when it
            parses HTML.

    Returns:
        A list holding the text of the <h1> inside every <div class="meta">,
        in the order ``find_all`` yields them. The text is returned exactly
        as it appears in the page (no whitespace normalisation). A "meta"
        div without an <h1> is skipped instead of raising AttributeError.
    """
    # find_all is the current method name; findAll is only kept by
    # Beautiful Soup as a deprecated alias.
    metas = soup.find_all("div", class_="meta")

    # meta.h1 is None when the div has no <h1> descendant.
    return [meta.h1.text for meta in metas if meta.h1 is not None]
        
    # the "soup" parameter is of the type that is
    # returned by Beautiful Soup when it parses HTML.
    # The function should then use the object to
    # extract a list of titles (of the lego sets)
    #
    # Look up the documentation for Beautiful Soup.
    # Figure out how to select the text of the title
    # of each Lego set. A title should look like: 
    # "10252: Volkswagen Beetle"

In [21]:
def parse_bricks(url, timeout=10):
    """Download the page at ``url`` and extract its titles.

    Args:
        url: Address of the page to fetch, as a string.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list of titles that ``get_titles`` extracts from the parsed page.
        The same list is also printed as a side effect.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx or
            5xx status code.
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    # Without an explicit timeout, requests can wait on a silent server forever.
    page = requests.get(url, timeout=timeout)

    # Fail loudly on an error response instead of parsing an error page and
    # returning a misleading result.
    page.raise_for_status()

    soup = BeautifulSoup(page.text, 'html.parser')
    titles = get_titles(soup)
    print(titles)
    return titles
    
    # Look up the documentation for the "requests" library
    #
    # Use requests to make a get request to the
    # url given in the argument "url" (which is a string)
    # and get the raw HTML body of the response
    #
    # Use "BeautifulSoup" to parse this HTML. 
    #
    # Use the "get_titles" function to extract the
    # titles from the BeautifulSoup object.
    #
    # Return the titles

In [22]:
# Fetch and parse the start page. parse_bricks also prints the titles it
# returns, which is where the list shown below this cell comes from.
bricks = parse_bricks(START_URL)

['10251:  Brick Bank', '10252:  Volkswagen Beetle', '10253:  Big Ben', '10254:  Winter Holiday Train', '10654:  XL Creative Brick Box', '10702:  Creative Building Set', '10705:  Creative Building Basket', '10720:  Police Helicopter Chase', '10721:  Iron Man vs. Loki', '10722:  Snake Showdown', "10723:  Ariel's Dolphin Carriage", '10724:  Batman & Superman vs. Lex Luthor', '10725:  Lost Temple', "10726:  Stephanie's Horse Carriage", "10727:  Emma's Ice Cream Truck", "10728:  Mia's Vet Clinic", "10729:  Cinderella's Carriage", '10801:  Baby Animals', '10802:  Savanna', '10803:  Arctic', '10804:  Jungle', '10805:  Around the World', '10806:  Horses', '10807:  Horse Trailer', '10808:  Little Plane']


In [23]:
# Spot-check two entries of the scraped list. The expected strings keep the
# double space after the colon, exactly as it appears in the printed output.
# assert is a statement, not a function: written as assert(cond, msg) it
# would test a non-empty tuple and always pass, so no call-style parentheses.
assert bricks[0] == '10251:  Brick Bank', bricks[0]
assert bricks[9] == '10722:  Snake Showdown', bricks[9]