**Web-Scraping based Greg Reda's tutorial:** http://www.gregreda.com/2013/03/03/web-scraping-101-with-python/

In [1]:
# Installing: lxml (library for processing XML and HTML in Python)
! pip install lxml

[33mYou are using pip version 7.1.0, however version 7.1.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
# Loading BeautifulSoup class from bs4 library
from bs4 import BeautifulSoup
# Loading urlopen function from urllib2 library
from urllib2 import urlopen
# Loading sleep from time library
from time import sleep # Suspend execution of the current thread for the given number of seconds.
# Use this to prevent overwhelming the server between connections

In [3]:
# Website to scrape
url = 'http://www.chicagoreader.com/chicago/best-of-chicago-2011/BestOf?oid=4100483'

*Inspecting Element*
<img src='img-1.png' style='width:450px;'>
*The links that are relevant are within* <font color = 'blue'>&lt;dd&gt;</font> tags
<img src='img-dd.png' style='width:450px;'>

In [5]:
base_url = 'http://www.chicagoreader.com'

In [49]:
def get_category_links(section_url):
    html = urlopen(section_url)
    bsObj = BeautifulSoup(html)
    # Find <dl> tag with class = 'boccat', then find all <a> tags
    a_hrefs = bsObj.find('dl', 'boccat').findAll('a') 
    # Concatenate base_url with hrefs
    links = [base_url + a['href'] for a in a_hrefs]
    return links

In [47]:
# Testing
url = 'http://www.chicagoreader.com/chicago/best-of-chicago-2011-food-drink/BestOf?oid=4106228'
html = urlopen(url)
bsObj = BeautifulSoup(html)
a_hrefs = bsObj.find('dl', 'boccat').findAll('a')
[base_url + a['href'] for a in a_hrefs]

['http://www.chicagoreader.com/chicago/BestOf?category=1979894&year=2011',
 'http://www.chicagoreader.com/chicago/best-fancy-restaurant-in-chicago/BestOf?oid=4088017',
 'http://www.chicagoreader.com/chicago/best-bang-for-your-buck/BestOf?oid=4088018',
 'http://www.chicagoreader.com/chicago/best-chef/BestOf?oid=4088191',
 'http://www.chicagoreader.com/chicago/best-up-and-coming-chef/BestOf?oid=4088225',
 'http://www.chicagoreader.com/chicago/best-food-blog/BestOf?oid=4088227',
 'http://www.chicagoreader.com/chicago/best-ampersand-restaurant/BestOf?oid=4088228',
 'http://www.chicagoreader.com/chicago/best-restaurant-name/BestOf?oid=4088229',
 'http://www.chicagoreader.com/chicago/best-new-food-trend/BestOf?oid=4088231',
 'http://www.chicagoreader.com/chicago/best-cocktail-list/BestOf?oid=4088236',
 'http://www.chicagoreader.com/chicago/best-mixologist/BestOf?oid=4088461',
 'http://www.chicagoreader.com/chicago/best-wine-list/BestOf?oid=4088462',
 'http://www.chicagoreader.com/chicago/bes

In [50]:
get_category_links(url)

['http://www.chicagoreader.com/chicago/BestOf?category=1979894&year=2011',
 'http://www.chicagoreader.com/chicago/best-fancy-restaurant-in-chicago/BestOf?oid=4088017',
 'http://www.chicagoreader.com/chicago/best-bang-for-your-buck/BestOf?oid=4088018',
 'http://www.chicagoreader.com/chicago/best-chef/BestOf?oid=4088191',
 'http://www.chicagoreader.com/chicago/best-up-and-coming-chef/BestOf?oid=4088225',
 'http://www.chicagoreader.com/chicago/best-food-blog/BestOf?oid=4088227',
 'http://www.chicagoreader.com/chicago/best-ampersand-restaurant/BestOf?oid=4088228',
 'http://www.chicagoreader.com/chicago/best-restaurant-name/BestOf?oid=4088229',
 'http://www.chicagoreader.com/chicago/best-new-food-trend/BestOf?oid=4088231',
 'http://www.chicagoreader.com/chicago/best-cocktail-list/BestOf?oid=4088236',
 'http://www.chicagoreader.com/chicago/best-mixologist/BestOf?oid=4088461',
 'http://www.chicagoreader.com/chicago/best-wine-list/BestOf?oid=4088462',
 'http://www.chicagoreader.com/chicago/bes

*Best Fancy Restaurant category is in* <font color = 'blue'>&lt;h1 class="headline"&gt;</font> tag and winner and runner up in Best Fancy Restaurant category are in* <font color = 'blue'>&lt;h2 class="boc1"&gt;</font> and <font color = 'blue'>&lt;h2 class="boc2"&gt;</font> tags.
![](img-cat.png)

In [45]:
def get_category_winner(category_url):
    html = urlopen(category_url)
    bsObj = BeautifulSoup(html)
    category = bsObj.find('h1', 'headline').text
    best = [h2.text for h2 in bsObj.findAll('h2', 'boc1')] # using list comprehension because there can be multiple winners
    runners_up = [h2.text for h2 in bsObj.findAll('h2', 'boc2')] # using list comprehension because there can be multiple runners up
    return {'category': category, 
            'category_url': category_url, 
            'best': best, 
            'runners_up': runners_up}
    

In [44]:
# Testing
url = 'http://www.chicagoreader.com/chicago/best-fancy-restaurant-in-chicago/BestOf?oid=4088017'
html = urlopen(url)
bsObj = BeautifulSoup(html)
print bsObj.find('h1', 'headline').string + '---via .string'
print bsObj.find('h1', 'headline').text + '---via .text'
print '.string and .text returns same result'
print '--------------------------------------------------------'
best = [h2.text for h2 in bsObj.findAll('h2', 'boc1')]
runners_up = [h2.text for h2 in bsObj.findAll('h2', 'boc2')]
print 'Best: %s and Runners up: %s' %(best, runners_up)

Best fancy restaurant in Chicago ---via .string
Best fancy restaurant in Chicago ---via .text
.string and .text returns same result
--------------------------------------------------------
Best: [u'Alinea '] and Runners up: [u'Blackbird', u'Girl & the Goat', u'Green Zebra', u'The Publican']


In [46]:
get_category_winner(url)

{'best': [u'Alinea '],
 'category': u'Best fancy restaurant in Chicago\xa0',
 'category_url': 'http://www.chicagoreader.com/chicago/best-fancy-restaurant-in-chicago/BestOf?oid=4088017',
 'runners_up': [u'Blackbird',
  u'Girl & the Goat',
  u'Green Zebra',
  u'The Publican']}

In [80]:
# Loading json encoder and decoder
import json
# Loading BeautifulSoup class from bs4 library
from bs4 import BeautifulSoup
# Loading urlopen function from urllib2 library
from urllib2 import urlopen
# Loading sleep from time library
from time import sleep # Suspend execution of the current thread for the given 
# number of seconds. Use this to prevent overwhelming the server between connections


base_url = 'http://www.chicagoreader.com'

# Function to get all the category links
def get_category_links(section_url):
    html = urlopen(section_url)
    bsObj = BeautifulSoup(html)
    # Find <dl> tag with class = 'boccat', then find all <a> tags
    a_hrefs = bsObj.find('dl', 'boccat').findAll('a') 
    # Concatenate base_url with hrefs
    links = [base_url + a['href'] for a in a_hrefs]
    return links

# Function to get category winner and runners up
def get_category_winner(category_url):
    html = urlopen(category_url)
    bsObj = BeautifulSoup(html)
    category = bsObj.find('h1', 'headline').text
    best = [h2.text for h2 in bsObj.findAll('h2', 'boc1')] # using list comprehension because there can be multiple winners
    runners_up = [h2.text for h2 in bsObj.findAll('h2', 'boc2')] # using list comprehension because there can be multiple runners up
    return {'category': category, 
            'category_url': category_url, 
            'best': best, 
            'runners_up': runners_up}
    

music_n_nightlife = 'http://www.chicagoreader.com/chicago/best-of-chicago-2011-music-nightlife/BestOf?oid=4106223'

# Get category links
categories = get_category_links(music_n_nightlife)

data = []
for c in categories:
    temp = get_category_winner(c)
    data.append(temp)
    sleep(1) # suspend execution by 1 seconds

# Python: dump list/dict to json file
with open('result.json', 'wb') as outfile:
    json.dump(data, outfile)

# Pretty printing: use json.dumps
print json.dumps(data, indent = 4)
print '----------------------------------------------------------'  
print data

[
    {
        "category": "Best local band that's been around forever\u00a0", 
        "runners_up": [
            "Wilco", 
            "Funkadesi", 
            "\u201cAndrew Bird (please don\u2019t move to NY)\u201d"
        ], 
        "best": [
            "The Lawrence Arms"
        ], 
        "category_url": "http://www.chicagoreader.com/chicago/best-local-band-thats-been-around-forever/BestOf?oid=4108110"
    }, 
    {
        "category": "Best new local band\u00a0", 
        "runners_up": [
            "Fort Frances", 
            "Smith Westerns"
        ], 
        "best": [
            "The Sore Subjects"
        ], 
        "category_url": "http://www.chicagoreader.com/chicago/best-new-local-band/BestOf?oid=4093314"
    }, 
    {
        "category": "Best local rock band\u00a0", 
        "runners_up": [
            "Disappears", 
            "Milano"
        ], 
        "best": [
            "The Lawrence Arms"
        ], 
        "category_url": "http://www.chicagoread

In [79]:
# Python: dump list/dict to json file
import json
with open('result.json', 'wb') as outfile:
    json.dump(data, outfile)

# Reading a json file
with open('result.json') as infile:
    temp = json.load(infile)

print temp
print '--------------------------------------------------------'

# Pretty printing: use json.dumps
print json.dumps(data, indent = 4)

[{u'category': u'Best ultimate art institution\xa0', u'runners_up': [u'Chicago Cultural Center', u'The Field Museum'], u'category_url': u'http://www.chicagoreader.com/chicago/best-ultimate-art-institution/BestOf?oid=4070647', u'best': [u'The Art Institute of Chicago']}, {u'category': u'Best long-running play\xa0', u'runners_up': [u'Million Dollar Quartet at the Apollo Theater', u'Wicked at the Oriental Theatre '], u'category_url': u'http://www.chicagoreader.com/chicago/best-long-running-play/BestOf?oid=4070615', u'best': [u'Too Much Light Makes the Baby Go Blind, the Neo-Futurists \n']}, {u'category': u'Best new play\xa0', u'runners_up': [u'Sex With Strangers, Steppenwolf', u'There is a Happiness That Morning Is, Theatre Oobleck'], u'category_url': u'http://www.chicagoreader.com/chicago/best-new-play/BestOf?oid=4070650', u'best': [u'Musical of the Living Dead, Cowardly Scarecrow Theatre Company']}, {u'category': u'Best touring play\xa0', u'runners_up': [u'Black Watch, National Theatre 

**Need to use Exception handling later**

In [87]:
# Goods & Services: OK
# Music & Nightlife: OK
# Sports & Recreation: OK
# Food & Drink: OK
# Arts & Culture: Error! Need to use exception handling
# City Life: Error! Need to use exception handling

url = 'http://www.chicagoreader.com/chicago/best-of-chicago-2011-arts-culture/BestOf?oid=4106230'
categories = get_category_links(url)
for c in categories:
    print c

http://www.chicagoreader.com/chicago/best-ultimate-art-institution/BestOf?oid=4070647
http://www.chicagoreader.com/chicago/best-long-running-play/BestOf?oid=4070615
http://www.chicagoreader.com/chicago/best-new-play/BestOf?oid=4070650
http://www.chicagoreader.com/chicago/best-touring-play/BestOf?oid=4070995
http://www.chicagoreader.com/chicago/best-theater-company/BestOf?oid=4071022
http://www.chicagoreader.com/chicago/best-off-loop-theater-company/BestOf?oid=4071089
http://www.chicagoreader.com/chicago/best-stage-director/BestOf?oid=4071285
http://www.chicagoreader.com/chicago/best-local-actor/BestOf?oid=4071315
http://www.chicagoreader.com/chicago/best-local-actress/BestOf?oid=4071335
http://www.chicagoreader.com/chicago/best-stand-up/BestOf?oid=4071339
http://www.chicagoreader.com/chicago/best-sketchimprov-troupe/BestOf?oid=4071476
http://www.chicagoreader.com/chicago/best-venue-for-stand-up/BestOf?oid=4071696
http://www.chicagoreader.com/chicago/best-dance-troupe/BestOf?oid=4076750