##### EPSY 5122: Programming Fundamentals for Social Science Researchers
## Fall 2022 Week 13
### Web Scraping with Python

In [1]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
# "from X import Y" loads only the names listed (e.g. get, closing, BeautifulSoup)
# and attaches them directly to our environment,
# in other words, no need to prefix them with the package name

In [2]:
# some functions adapted from RealPython demo:
# https://realpython.com/python-web-scraping-practical-introduction/

def simple_get(url, timeout=30):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML (as judged by
    `is_good_response`), return the raw response body as bytes (it is
    NOT decoded to text), otherwise return None.

    `timeout` is the number of seconds to wait on the server before
    giving up; it is passed straight to `requests.get`. Without it, a
    server that never answers would make this call hang forever.

    If the request itself fails (connection problem, timeout, ...), the
    error is reported through `log_error` and None is returned.
    """
    try: # make an attempt to do this, but if there's an ERROR, jump to the "except"
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp): # calls function defined below
                return resp.content # bytes, not str
            return None
    except RequestException as e: # only runs if a requests error (incl. a timeout) occurs above
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response counts as "good" when its status code is 200 and its
    Content-Type header contains 'html' (compared case-insensitively).
    A response with no Content-Type header at all is reported as
    False rather than raising a KeyError.
    """
    # .get() returns None when the header is absent; indexing with [...] would raise
    content_type = resp.headers.get('Content-Type')
    if content_type is None: # nothing to inspect, so it cannot be HTML
        return False
    return (resp.status_code == 200 # remember that 200 means all OK!
            and 'html' in content_type.lower())


def log_error(e): # reports a problem that was caught elsewhere
    """
    Report an error by printing it.

    Logging errors is always worthwhile. Printing is the simplest
    form of logging; this function is the one place to change if
    you want errors to go somewhere else instead.
    """
    message = str(e) # print() would perform this same conversion itself
    print(message)

In [3]:
# download the page; simple_get returns None if the request fails or the response is not HTML
raw = simple_get("https://research.umn.edu/inquiry/posts")

In [4]:
# display what came back: the page's HTML source as a bytes literal (note the b'...' prefix)
raw

b'<!DOCTYPE html>\n<!--[if IEMobile 7]><html class="iem7"  lang="en" dir="ltr"><![endif]-->\n<!--[if lte IE 6]><html class="lt-ie9 lt-ie8 lt-ie7"  lang="en" dir="ltr"><![endif]-->\n<!--[if (IE 7)&(!IEMobile)]><html class="lt-ie9 lt-ie8"  lang="en" dir="ltr"><![endif]-->\n<!--[if IE 8]><html class="lt-ie9"  lang="en" dir="ltr"><![endif]-->\n<!--[if (gte IE 9)|(gt IEMobile 7)]><!--><html  lang="en" dir="ltr" prefix="fb: http://ogp.me/ns/fb# og: http://ogp.me/ns#"><!--<![endif]-->\n\n<head>\n  <meta charset="utf-8" />\n<link rel="shortcut icon" href="https://research.umn.edu/sites/research.umn.edu/themes/umn_ovpr_subtheme/favicon.ico" type="image/vnd.microsoft.icon" />\n<meta name="generator" content="Drupal 7 (https://www.drupal.org)" />\n<link rel="canonical" href="https://research.umn.edu/inquiry/posts" />\n<link rel="shortlink" href="https://research.umn.edu/inquiry/posts" />\n<meta property="og:site_name" content="Office of the Vice President for Research" />\n<meta property="og:type

In [5]:
# confirm the type of the downloaded content
type(raw)

bytes

In [7]:
# parse the downloaded bytes with the "html.parser" parser so we can search the page's structure
proc = BeautifulSoup(raw, "html.parser")
# display the parsed document
proc

<!DOCTYPE html>

<!--[if IEMobile 7]><html class="iem7"  lang="en" dir="ltr"><![endif]-->
<!--[if lte IE 6]><html class="lt-ie9 lt-ie8 lt-ie7"  lang="en" dir="ltr"><![endif]-->
<!--[if (IE 7)&(!IEMobile)]><html class="lt-ie9 lt-ie8"  lang="en" dir="ltr"><![endif]-->
<!--[if IE 8]><html class="lt-ie9"  lang="en" dir="ltr"><![endif]-->
<!--[if (gte IE 9)|(gt IEMobile 7)]><!--><html dir="ltr" lang="en" prefix="fb: http://ogp.me/ns/fb# og: http://ogp.me/ns#"><!--<![endif]-->
<head>
<meta charset="utf-8"/>
<link href="https://research.umn.edu/sites/research.umn.edu/themes/umn_ovpr_subtheme/favicon.ico" rel="shortcut icon" type="image/vnd.microsoft.icon"/>
<meta content="Drupal 7 (https://www.drupal.org)" name="generator"/>
<link href="https://research.umn.edu/inquiry/posts" rel="canonical"/>
<link href="https://research.umn.edu/inquiry/posts" rel="shortlink"/>
<meta content="Office of the Vice President for Research" property="og:site_name"/>
<meta content="article" property="og:type"/>
<me

In [8]:
# confirm that parsing produced a BeautifulSoup object
type(proc)

bs4.BeautifulSoup

In [9]:
# select every <div> element in the parsed page; the result is a list, and nested divs appear as their own entries
proc.select("div")

[<div class="admin-page-tools">
 </div>,
 <div class="printer"><div class="left"></div><div class="right">University of Minnesota<!-- br />http://twin-cities.umn.edu/<br />612-625-5000 --></div></div>,
 <div class="left"></div>,
 <div class="right">University of Minnesota<!-- br />http://twin-cities.umn.edu/<br />612-625-5000 --></div>,
 <div class="umnhf" id="umnhf-h-mast">
 <a class="umnhf" href="http://twin-cities.umn.edu/" id="umnhf-h-logo"><span>Go to the U of M home page</span></a>
 <ul class="umnhf" id="umnhf-h-ql">
 <li><a href="http://onestop.umn.edu/">One Stop</a></li>
 <li class="umnhf"><a href="https://www.myu.umn.edu/">MyU <span></span></a></li>
 </ul>
 <!-- Button below is for dropdown toggle, only visible on mobile screens. If using
                         a non-dropdown version you can delete this tag -->
 <button class="umnhf" id="umnhf-m-search">Search</button>
 <form action="//search.umn.edu/tc/" class="umnhf" id="umnhf-h-search" method="get" role="search" title="Se

In [10]:
from lxml import html