<img src="http://i67.tinypic.com/2jcbwcw.png" align="left"></img><br><br><br><br>


## Breakout Lecture 8: Web scraping & crawling

**Author List**: Alexander Fred Ojala

**Original Sources**: https://www.crummy.com/software/BeautifulSoup/bs4/doc/

**License**: Feel free to do whatever you want with this code

# Note: Run this Notebook in your Python 3 Virtual Environment

### Setup and load packages

In [None]:
# Stretch Jupyter code cells to fit the full width of the screen.
# `IPython.core.display` is a deprecated import location; the public API lives in `IPython.display`.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:

from __future__ import division, print_function

In [None]:
import bs4 as bs # Beautiful Soup is a Python library for pulling data out of HTML and XML files.
import urllib.request # The urllib.request module defines functions and classes which help in opening URLs (mostly HTTP)

## Scrape the Data-X Syllabus site

In [None]:
# Always use requests or urllib to open a URL and read in the source code before
# using Beautifulsoup to extract the information you want.

source = urllib.request.urlopen('https://data-x.blog/syllabus-data-x/').read() # open the URL and read in the content as a string of HTML code
print(source) # this is the source code for the site, looks very unstructured right now

In [None]:
# Create your Beautifulsoup object

soup = bs.BeautifulSoup(source,features='lxml') # interact with this object, looks like source in browser.

# source should be a string of HTML code
# features = 'lxml' is the parser. Sometimes it might be better to use 'html.parser'

In [None]:
# look at the printed source above. Title of the website is <title>Syllabus: Data-X &#8211; Data-X</title>. 
# Let's try to parse that with beautifulsoup

print(soup.title)

In [None]:
# or

print(soup.find('title'))

In [None]:
print(soup.title.name) # name of the tag for soup.title (should be 'title' of course)

In [None]:
# .string on a Tag type object returns a NavigableString type object.

print(soup.title.string)

In [None]:
# On the other hand, .text gets all the child strings and returns them concatenated using the given separator.
# The return type of .text is a regular string (str in Python 3).

print(soup.title.text)

### Example when there is a difference in child strings

In [None]:
print(soup.tr)

In [None]:
print(soup.tr.string) # returns None, because no string type object in between <tr>

In [None]:
print(soup.tr.text) # returns all child strings within the <td> tags as well

In [None]:
print(soup.p) #finds first p tag in the HTML code

In [None]:
soup.find_all('p') #finds all p tags

In [None]:
soup.find_all('p').text # cannot use method on the full ResultSet, place in a loop instead

In [None]:

# Strip the HTML tags and keep only the text.
# This could be done with a regex, but here we use BeautifulSoup's built-in functionality.
for paragraph in soup.find_all('p'):
    print(paragraph.text) # .text returns a regular string, whereas .string returns a NavigableString
    # child tags (span etc.) are removed as well; only their text is kept
    
# as we can see, the table is not nicely structured

In [None]:
print(soup.get_text()) # Get all child strings, concatenated, will print all javascript, wordpress coding text in a nice fashion

In [None]:
# Find first url

first_url = soup.find('a')
print('first_url:', first_url,'\n')

print('Type:',type(first_url))
print('Text: ',first_url.text)
print('Attributes:',first_url.attrs)

In [None]:
for url in soup.find_all('a'):
    print(url.get('href')) # .get() returns the value of a specific tag attribute (here: href)

In [None]:
# Imagine we only want to extract http links and write them to a file
# called data-x-urls.txt, one link per line.

# find all url links on the page
links = list()
for url in soup.find_all('a'):
    link = url.get('href')
    # <a> tags without an href attribute give None from .get(); testing
    # `'http' in None` would raise a TypeError, so skip those anchors.
    if link and 'http' in link:
        print(link)
        links.append(link+'\n')

# create/open a txt file in write mode; overwrites any existing data-x-urls.txt
with open('data-x-urls.txt', 'w') as url_file:
    url_file.writelines(links)

In [None]:
# Only find URL's in the navigation bar (tag nav)
nav=soup.nav
nav

In [None]:
type(nav)

In [None]:
for url in nav.find_all('a'):
    print(url.get('href')) # only links in navigation bar

In [None]:
body = soup.body #get content within the <body> tag of the HTML code

In [None]:
# Print all body text
for paragraph in body.find_all('p'):
    print(paragraph.text) #might be two body tags. 
    # Just text from the body
    # Scraping for content

In [None]:
# Find all text within div sections, also child tags - or specific div section
for div in soup.find_all('div'):
    print(div.text) # a lot

In [None]:
# prints both mobile and html version

for div in soup.find_all('div', class_='site-content'):
    print(div.text)

In [None]:
## Only get tables, scraping tables and xml documents

table = soup.table
table = soup.find('table')

In [None]:
table # shows the html code of the table

In [None]:
table_rows = table.find_all('tr') #table.tr or table.find('tr') would only find one

In [None]:
for tr in table_rows:
    td = tr.find_all('td') # find all table data
    row = [i.text for i in td]
    print(row) # get all the table data

In [None]:
# pandas version of grabbing tables, better

import pandas as pd

# requires html5lib: 
#!conda install --yes html5lib
dfs = pd.read_html('https://data-x.blog/syllabus-data-x/',header=0)
# header = 0, indicates that first row is header
# find all tables and parse them to several data frames



In [None]:
print(type(dfs))
print(len(dfs))
df = dfs[0]

In [None]:
# Looks great, but we might want the dates to be the indices and in datetimeformat
df.head()

In [None]:
df.iloc[:,0] # dates are stored in the first column

In [None]:
# Better formatted dates: split each date string, drop the leading lecture
# number, keep Month plus Day and prepend the year.
dates = ['2017 ' + ' '.join(date.split()[1:]) for date in df.iloc[:,0]]

In [None]:
dates

In [None]:
df.index=pd.to_datetime(dates) # convert dates to datetime objects and set them as the index
df.index.name='Date' #rename the index column to be "Date"

In [None]:
df.drop('Lec #',axis=1,inplace=True) # Drop the first column, with the old dates

In [None]:
df.head()

In [None]:
# Show full cell contents instead of truncating them with "...".
# None means "no limit"; the old sentinel value -1 is deprecated in pandas.
pd.set_option('display.max_colwidth', None)

df.head()

In [None]:
df.to_html('data-x-sched.html')

In [None]:
pd.options.display.max_colwidth=50 #change back to default max col_width

# Scrape images

In [None]:
print(soup.find_all('img'))

In [None]:
os.path.basename?

In [None]:
import os
import urllib.request # import the submodule explicitly; a bare `import urllib` does not guarantee it is loaded

# Download every jpg image referenced on the page into the working directory
for link in soup.find_all('img'):
    img_url=link.get('src')

    # <img> tags without a src attribute give None from .get(); guard
    # against that before the substring test, which would raise a TypeError.
    if img_url and 'jpg' in img_url: #only check for jpg images
        print(img_url)
        print(os.path.splitext(os.path.basename(img_url))) # returns final component of pathname and extension as a tuple
        filename = os.path.splitext(os.path.basename(img_url))[0] + '.jpg'
        urllib.request.urlretrieve(img_url,filename) # urllib requests a file and then writes it to disk
    else:
        print('EXCLUDED:',img_url)

In [None]:
# XML documents - site maps list all the URLs of a site, with the data simply placed between tags.
# XML is both human and machine readable.
# To find the newest links on a site: FIND THE SITE MAP!
# News websites often have separate sitemaps per section (e.g. politics);
# bots that constantly track the news do so by tracking these sitemaps.

In [None]:
source = urllib.request.urlopen('https://data-x.blog/sitemap.xml').read()
soup = bs.BeautifulSoup(source,'xml') # parsed with the 'xml' parser; interact with this object, looks like the source in a browser

In [None]:
soup.find_all('loc')

# Scrape Bloomberg for news

In [None]:
source = urllib.request.urlopen('https://www.bloomberg.com/feeds/bpol/sitemap_news.xml').read()
soup = bs.BeautifulSoup(source,'xml')

In [None]:
# prettify is a method: it has to be called to get the indented markup
# (without the parentheses only the bound-method object is displayed)
print(soup.prettify())

In [None]:
for news in soup.find_all({'news'}):
    print(news.title.text)
    print(news.publication_date.text)
    print(news.keywords.text)
    print('\n')

In [None]:
# example from https://www.ayima.com/guides/how-to-visualize-an-xml-sitemap-using-python.html

# Visualize XML sitemap with categories!
import requests
from bs4 import BeautifulSoup

# Sitemap index to crawl.
# Another sitemap to try instead: 'https://www.sportchek.ca/sitemap.xml'
url = 'https://www.bloomberg.com/feeds/bpol/sitemap_index.xml'
page = requests.get(url)
print('Loaded page with: %s' % page)

sitemap_index = BeautifulSoup(page.content, 'html.parser')
print('Created %s object' % type(sitemap_index))

In [None]:
# Collect the text of every <loc> tag in the sitemap index.
# find_all is the current method name (findAll is a legacy alias).
urls = [element.text for element in sitemap_index.find_all('loc')]
print(urls)

In [None]:
def extract_links(url):
    ''' Open an XML sitemap and find content wrapped in loc tags.
    url : str
        Address of the sitemap to download.
    Returns a list with the text of every loc tag found.
    '''

    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    # find_all is the current method name (findAll is a legacy alias) and
    # matches the spelling used for the other lookups in this notebook.
    return [element.text for element in soup.find_all('loc')]

# Visit every sitemap listed in the index and pool all the page links
sitemap_urls = []
for url in urls:
    sitemap_urls.extend(extract_links(url))

print('Found {:,} URLs in the sitemap'.format(len(sitemap_urls)))

In [None]:
# Save the collected URLs to disk, one URL per line
with open('sitemap_urls.dat', 'w') as out_file:
    out_file.writelines(url + '\n' for url in sitemap_urls)

In [None]:
'''
Categorize a list of URLs by site path.
The file containing the URLs should exist in the working directory and be
named sitemap_urls.dat. It should contain one URL per line.
Categorization depth can be specified by executing a call like this in the
terminal (where we set the granularity depth level to 5):
    python categorize_urls.py --depth 5
The same result can be achieved by setting the categorization_depth variable
manually at the head of this file and running the script with:
    python categorize_urls.py
'''
from __future__ import print_function


categorization_depth=3



# Main script functions


def _url_levels(url):
    ''' Split a URL into [host, 1st path segment, 2nd path segment, ...]. '''
    scheme, sep, rest = url.partition('://')
    if not sep or '/' in scheme:
        # No leading scheme: any '://' found belongs to the path or query
        rest = url
    if rest.startswith('//'):
        # Protocol-relative URL such as //example.com/page
        rest = rest[2:]
    return rest.split('/')


def peel_layers(urls, layers=3):
    ''' Builds a dataframe containing all unique page identifiers up
    to a specified depth and counts the number of sub-pages for each.
    Prints results to a CSV file (sitemap_layers.csv in the working
    directory).
    urls : list
        List of page URLs.
    layers : int
        Depth of automated URL search. Large values for this parameter
        may cause long runtimes depending on the number of URLs.
    Returns the dataframe, with string column names '0' ... str(layers)
    plus 'counts'. Levels that a URL does not reach are stored as ''.
    '''

    # Split every URL exactly once. Only the scheme at the very start is
    # stripped, so a '//' or a repeated host name further along the URL
    # (e.g. inside a query string) cannot shift the detected host/levels.
    levels = [_url_levels(url) for url in urls]

    # Store results in a dataframe: column 0 is the base (host), column n
    # is the n-th path segment, or '' when there is nothing that deep
    sitemap_layers = pd.DataFrame({
        depth: [parts[depth] if depth < len(parts) else '' for parts in levels]
        for depth in range(layers + 1)
    })

    # Count and drop duplicate rows + sort: ascending by every layer except
    # the deepest one, then by descending count inside each of those groups.
    # A single stable sort keeps the row order deterministic.
    layer_cols = list(range(layers + 1))
    sitemap_layers = sitemap_layers.groupby(layer_cols).size()\
                     .rename('counts').reset_index()\
                     .sort_values(layer_cols[:-1] + ['counts'],
                                  ascending=[True] * layers + [False],
                                  kind='mergesort')\
                     .reset_index(drop=True)

    # Convert column names to string types and export
    sitemap_layers.columns = [str(col) for col in sitemap_layers.columns]
    sitemap_layers.to_csv('sitemap_layers.csv', index=False)

    # Return the dataframe
    return sitemap_layers




# Load the URLs from sitemap_urls.dat (one per line); the with-block
# makes sure the file handle is closed again after reading
with open('sitemap_urls.dat', 'r') as url_file:
    sitemap_urls = url_file.read().splitlines()
print('Loaded {:,} URLs'.format(len(sitemap_urls)))

print('Categorizing up to a depth of %d' % categorization_depth)
sitemap_layers = peel_layers(urls=sitemap_urls,
                             layers=categorization_depth)
print('Printed {:,} rows of data to sitemap_layers.csv'.format(len(sitemap_layers)))


In [None]:
'''
Visualize a list of URLs by site path.
This script reads in the sitemap_layers.csv file created by the
categorize_urls.py script and builds a graph visualization using Graphviz.
Graph depth can be specified by executing a call like this in the
terminal:
    python visualize_urls.py --depth 4 --limit 10 --title "My Sitemap" --style "dark" --size "40"
The same result can be achieved by setting the variables manually at the head
of this file and running the script with:
    python visualize_urls.py
'''
from __future__ import print_function


# Set global variables

graph_depth = 3  # Number of layers deep to plot categorization
limit = 3       # Maximum number of nodes for a branch
title = ''       # Graph title
style = 'light'  # Graph style, can be "light" or "dark"
size = '8,5'     # Size of rendered PDF graph


# Import external library dependencies

import pandas as pd
import graphviz



# Main script functions

def make_sitemap_graph(df, layers=3, limit=50, size='8,5'):
    ''' Make a sitemap graph up to a specified layer depth.
    df : DataFrame
        The dataframe created by the peel_layers function
        containing sitemap information: layer columns named
        '0', '1', ... plus a 'counts' column.
    layers : int
        Maximum depth to plot.
    limit : int
        The maximum number node edge connections. Good to set this
        low for visualizing deep into site maps.
    size : str
        Value written to the graph's size attribute.
    Returns the graphviz Digraph that was built.
    '''


    # Check to make sure we are not trying to plot too many layers.
    # The available depth is given by the number of layer columns
    # (column '0' is the base), not by the number of rows in df.
    available = sum(str(col).isdigit() for col in df.columns) - 1
    if layers > available:
        layers = available
        print('There are only %d layers available to plot, setting layers=%d'
              % (layers, layers))


    # Initialize graph
    f = graphviz.Digraph('sitemap', filename='sitemap_graph_%d_layer' % layers)
    f.body.extend(['rankdir=LR', 'size="%s"' % size])

    # Names of all nodes created so far. Tracking them here avoids
    # re-parsing the generated DOT source, which depends on how each
    # name happens to be quoted.
    node_names = set()


    def add_branch(f, names, vals, limit, connect_to=''):
        ''' Adds a set of nodes and edges to nodes on the previous layer. '''

        # Only add a new branch if it will connect to a previously created node
        if not connect_to or connect_to not in node_names:
            return

        for name, val in list(zip(names, vals))[:limit]:
            child = '%s-%s' % (connect_to, name)
            f.node(name=child, label=name)
            f.edge(connect_to, child, label='{:,}'.format(val))
            node_names.add(child)


    f.attr('node', shape='rectangle') # Plot nodes as rectangles

    # Add the first layer of nodes
    for name, counts in df.groupby(['0'])['counts'].sum().reset_index()\
                          .sort_values(['counts'], ascending=False).values:
        f.node(name=name, label='{} ({:,})'.format(name, counts))
        node_names.add(name)

    if layers == 0:
        return f

    f.attr('node', shape='oval') # Plot nodes as ovals

    # Loop over each layer adding nodes and edges to prior nodes
    for i in range(1, layers+1):
        cols = [str(i_) for i_ in range(i)]
        nodes = df[cols].drop_duplicates().values
        for j, k in enumerate(nodes):

            # Compute the mask to select correct data
            mask = True
            for j_, ki in enumerate(k):
                mask &= df[str(j_)] == ki

            # Select the data then count branch size, sort, and truncate
            data = df[mask].groupby([str(i)])['counts'].sum()\
                    .reset_index().sort_values(['counts'], ascending=False)

            # Add to the graph; the parent node name is the '-'-joined path
            add_branch(f,
                       names=data[str(i)].values,
                       vals=data['counts'].values,
                       limit=limit,
                       connect_to='-'.join(['%s']*i) % tuple(k))

            # Progress line, rewritten in place via the carriage return
            print(('Built graph up to node %d / %d in layer %d' % (j, len(nodes), i))\
                    .ljust(50), end='\r')

    return f


def apply_style(f, style, title=''):
    ''' Apply the style and add a title if desired. More styling options are
    documented here: http://www.graphviz.org/doc/info/attrs.html#d:style
    f : graphviz.dot.Digraph
        The graph object as created by graphviz.
    style : str
        Available styles: 'light', 'dark'
    title : str
        Optional title placed at the bottom of the graph.
    Returns the same graph object, with its graph_attr, node_attr and
    edge_attr replaced by the chosen style.
    Raises ValueError if style is not one of the available styles.
    '''

    styles = {
        'dark': {
            'graph': {
                'label': title,
                'bgcolor': '#3a3a3a',
                'fontname': 'Helvetica',
                'fontsize': '18',
                'fontcolor': 'white',
            },
            'nodes': {
                'style': 'filled',
                'color': 'white',
                'fillcolor': 'black',
                'fontname': 'Helvetica',
                'fontsize': '14',
                'fontcolor': 'white',
            },
            'edges': {
                'color': 'white',
                'arrowhead': 'open',
                'fontname': 'Helvetica',
                'fontsize': '12',
                'fontcolor': 'white',
            },
        },
        'light': {
            'graph': {
                'label': title,
                'fontname': 'Helvetica',
                'fontsize': '18',
                'fontcolor': 'black',
            },
            'nodes': {
                'style': 'filled',
                'color': 'black',
                'fillcolor': '#dbdddd',
                'fontname': 'Helvetica',
                'fontsize': '14',
                'fontcolor': 'black',
            },
            'edges': {
                'color': 'black',
                'arrowhead': 'open',
                'fontname': 'Helvetica',
                'fontsize': '12',
                'fontcolor': 'black',
            },
        },
    }

    # Fail with a clear message instead of an UnboundLocalError further down
    if style not in styles:
        raise ValueError("Unknown style %r; available styles: 'light', 'dark'"
                         % (style,))

    chosen = styles[style]
    f.graph_attr = chosen['graph']
    f.node_attr = chosen['nodes']
    f.edge_attr = chosen['edges']

    return f




# Read in categorized data
sitemap_layers = pd.read_csv('sitemap_layers.csv', dtype=str)
# Convert numerical column to integer
sitemap_layers.counts = sitemap_layers.counts.apply(int)
print('Loaded {:,} rows of categorized data from sitemap_layers.csv'\
        .format(len(sitemap_layers)))

print('Building %d layer deep sitemap graph' % graph_depth)
f = make_sitemap_graph(sitemap_layers, layers=graph_depth,
                       limit=limit, size=size)
f = apply_style(f, style=style, title=title)

# render() returns the path of the file it wrote. Report that path rather
# than rebuilding it from graph_depth: make_sitemap_graph may lower the
# depth, and the file name is built from the depth actually used.
rendered_path = f.render(cleanup=True)
print('Exported graph to %s' % rendered_path)


