# Simple HTML Web Scraping with Beautiful Soup

## Environment setup

Libraries

 pip install BeautifulSoup4

pip install pandas

## Web scraping

In [1]:
# import libraries
from bs4 import BeautifulSoup
import urllib.request
import csv

In [3]:
# specify the url
# I will obtain data from University of Waterloo's CS courses
# prerequisite chart.
urlpage =  'https://cs.uwaterloo.ca/current-undergraduate-students/majors/prerequisite-chain-computer-science-major-courses/cs-prerequisite-chart'

In [4]:
# query the website to return the html and store
page_html = urllib.request.urlopen(urlpage)

In [5]:
# parse html with beautiful soup and store
soup = BeautifulSoup(page_html, 'html.parser')

In [6]:
# test
# If the ouput is empty or is an error, 
# further debugging is required.
soup

<!DOCTYPE html>
<html dir="ltr" lang="en" prefix="og: http://ogp.me/ns# article: http://ogp.me/ns/article# book: http://ogp.me/ns/book# profile: http://ogp.me/ns/profile# video: http://ogp.me/ns/video# product: http://ogp.me/ns/product# content: http://purl.org/rss/1.0/modules/content/ dc: http://purl.org/dc/terms/ foaf: http://xmlns.com/foaf/0.1/ rdfs: http://www.w3.org/2000/01/rdf-schema# sioc: http://rdfs.org/sioc/ns# sioct: http://rdfs.org/sioc/types# skos: http://www.w3.org/2004/02/skos/core# xsd: http://www.w3.org/2001/XMLSchema# schema: http://schema.org/ v: http://www.w3.org/2006/vcard/ns#"><head><meta content="text/html; charset=utf-8" http-equiv="Content-Type"/><script type="text/x-mathjax-config">MathJax.Hub.Config({
  extensions: ['tex2jax.js'],
  jax: ['input/TeX'],
  MMLorHTML: { prefer: { Firefox: "MML" }},
  tex2jax: {
    displayMath: [ ['\\[','\\]'] ],
    inlineMath: [ ['\\(','\\)'] ],
    processEscapes: true,
    processClass: 'page',
    ignoreClass: 'html'
  },
 

In [19]:
# find results 'table'
table = soup.find('tbody')

In [22]:
results = table.findAll('tr')
print('Number of results', len(results))

Number of results 75


In [36]:
results

[<tr id="CS100">
 <th><a id="CS100" name="CS100">CS 100</a></th>
 <td>Introduction to Computing through Applications</td>
 <td>None</td>
 <td>None</td>
 <td><a href="#CS200">CS 200</a></td>
 <td>Fall/winter/spring</td>
 <td>Yes</td>
 </tr>, <tr>
 <th><a id="CS105" name="CS105">CS 105</a></th>
 <td>Introduction to Computer Programming 1</td>
 <td>None</td>
 <td>None</td>
 <td>
 <a href="#CS106">CS 106</a>,  <a href="#CS330">CS 330</a>
 </td>
 <td>Fall</td>
 <td>Yes</td>
 </tr>, <tr>
 <th><a id="CS106" name="CS106">CS 106</a></th>
 <td>Introduction to Computer Programming 2</td>
 <td><a href="#CS105">CS 105</a></td>
 <td>None</td>
 <td><a href="#CS330">CS 330</a></td>
 <td>Winter</td>
 <td>Yes</td>
 </tr>, <tr id="CS115">
 <th><a id="CS115" name="CS115">CS 115</a></th>
 <td>Introduction to Computer Science 1</td>
 <td>None</td>
 <td>None</td>
 <td><a href="#CS116">CS 116</a></td>
 <td>Fall/winter/spring</td>
 <td>Yes</td>
 </tr>, <tr id="CS116">
 <th><a id="CS116" name="CS116">CS 116</a>

In [108]:
rows = []
rows.append(['Course', 'Title', 'Prereqs', 'Coreqs', 'Successors', 'Terms offered', 'Open to non-CS majors'])

In [109]:
# loop over results
for res in results:
    # get course name written between th tags
    # remove unwanted spaces to make course name uniform
    course = res.find('th').get_text()
    course = course.strip('\n').replace(" ", "")
    
    # test
#     print(course)
    
    # obtain other data by columns
    data = res.find_all('td')
    
    # write columns to variables
    title = data[0].getText()
    prereqs = data[1].getText()
    coreqs = data[2].getText()
    succ = data[3].getText()
    terms = data[4].getText()
    non_cs = data[5].getText()
    
    # remove newline
    prereqs = prereqs.strip('\n')
    coreqs = coreqs.strip('\n')
    succ = succ.strip('\n')
    
    # test
#     print('--------------------')
#     print(title)
#     print('--------------------')
#     print(prereqs)
#     print('--------------------')
#     print(coreqs)
#     print('--------------------')
#     print(succ)
#     print('--------------------')
#     print(terms)
#     print('--------------------')
#     print(non_cs)
#     print('====================')

    rows.append([course, title, prereqs, coreqs, succ, terms, non_cs])

print(rows)

[['Course', 'Title', 'Prereqs', 'Coreqs', 'Successors', 'Terms offered', 'Open to non-CS majors'], ['CS100', 'Introduction to Computing through Applications', 'None', 'None', 'CS 200', 'Fall/winter/spring', 'Yes'], ['CS105', 'Introduction to Computer Programming 1', 'None', 'None', 'CS 106,\xa0 CS 330', 'Fall', 'Yes'], ['CS106', 'Introduction to Computer Programming 2', 'CS 105', 'None', 'CS 330', 'Winter', 'Yes'], ['CS115', 'Introduction to Computer Science 1', 'None', 'None', 'CS 116', 'Fall/winter/spring', 'Yes'], ['CS116', 'Introduction to Computer Science 2', 'CS 115 or\xa0\xa0CS 135\xa0or CS 145', 'None', 'CS 136,\xa0 CS 230,\xa0 CS 234,\xa0 CS 330,\xa0 CS 335,\xa0 CS 371', 'Fall/winter/spring', 'Yes'], ['CS135', 'Designing Functional Programs', 'None', 'None', 'CS 116,\xa0 CS 136, CS 146', 'Fall/winter', 'Yes'], ['CS136', 'Elementary Algorithm Design and Data Abstraction', '90%+ in CS 115 or 70%+ in CS 116 or 60%+ in CS 135 or CS 145.', 'None', 'CS 230,\xa0 CS 231,\xa0 CS 234,\x

In [104]:
# create CSV and write to output file
# w : writing
with open('courses.csv','w', newline='') as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerows(rows)