In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from urllib import request, parse
import time

In [2]:
def get_soup(country):
    """Fetch a country's page from pintprice.com and parse it.

    Args:
        country: Country name; it is percent-encoded with ``parse.quote``
            before being inserted into the page URL.

    Returns:
        A ``BeautifulSoup`` document built with the ``'html.parser'`` backend,
        or ``None`` when the response status is not 200 or the request fails.
    """
    safe = parse.quote(country)
    url = 'http://www.pintprice.com/region.php?/{}/USD.htm'.format(safe)
    try:
        # The context manager closes the connection on every path, including
        # the non-200 early return.
        with request.urlopen(url) as response:
            if response.status != 200:
                return None
            return BeautifulSoup(response.read(), 'html.parser')
    except Exception:
        # Best-effort scrape: any fetch or parse failure means "no data" for
        # this country. Unlike a bare `except:`, this still lets
        # KeyboardInterrupt and SystemExit through, so a long crawl can be
        # stopped by the user.
        return None

In [3]:
def get_prices(country):
    """Return the table of cities and prices from a country's pintprice.com page.

    Args:
        country: Country name, passed straight through to ``get_soup``.

    Returns:
        A list of two-item lists, one per consecutive pair of ``<td>`` cells
        (whitespace-stripped text, in document order), or ``None`` when
        ``get_soup`` returned ``None``. If the page has an odd number of
        ``<td>`` cells, the unpaired trailing cell is dropped.
    """
    soup = get_soup(country)
    if soup is None:
        return None
    cells = [td.get_text().strip() for td in soup.find_all("td")]
    # zip() stops at the shorter slice, so an odd cell count no longer raises
    # IndexError (which would abort a whole crawl over one odd page).
    return [[first, second] for first, second in zip(cells[0::2], cells[1::2])]

In [6]:
# Peek at the raw bytes of the response currently held in `html`.
# NOTE(review): reading here presumably leaves nothing for a later
# `html.read()` on the same response object — confirm the intended cell order.
html.read()

b'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\r\n<html xmlns="http://www.w3.org/1999/xhtml">\r\n<head>\r\n<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1" />\r\n<meta http-equiv="X-UA-Compatible" content="IE=EmulateIE7"/>\r\n<title> United Kingdom - Pintprice, compare beer prices from around the world</title>\r\n<link href="css/styles.css" rel="stylesheet" type="text/css" />\r\n<!-- TradeDoubler site verification 1723893 -->\r\n<script type=\'text/javascript\'>\r\nvar googletag = googletag || {};\r\ngoogletag.cmd = googletag.cmd || [];\r\n(function() {\r\nvar gads = document.createElement(\'script\');\r\ngads.async = true;\r\ngads.type = \'text/javascript\';\r\nvar useSSL = \'https:\' == document.location.protocol;\r\ngads.src = (useSSL ? \'https:\' : \'http:\') + \r\n\'//www.googletagservices.com/tag/js/gpt.js\';\r\nvar node = document.getElementsByTagName(\'script\')[0];\r\nnode.paren

In [4]:
# One page is enough to discover every country: fetch the United Kingdom page
# and keep the open response in `html` so its drop-down can be parsed later.
region_url_template = 'http://www.pintprice.com/region.php?/{}/USD.htm'
uk = parse.quote("United Kingdom")
url = region_url_template.format(uk)
html = request.urlopen(url)

In [19]:
# Parse the fetched page and collect the text of every <option> element.
soup = BeautifulSoup(html.read(), 'html.parser')
option_labels = [option.get_text() for option in soup.find_all("option")]
# The last five options are discarded — presumably they are not countries;
# TODO confirm against the page's drop-downs.
countries = option_labels[:-5]

In [12]:
# Load the locally saved copy of the Congo page and parse it.
# NOTE(review): the parsed document's <meta> tag reports charset=utf-8 while
# the file is decoded as windows-1252 here — confirm which encoding is right.
with open('congo.htm', encoding="windows-1252") as f:
    congo_markup = f.read()
html3 = BeautifulSoup(congo_markup, 'html.parser')

In [15]:
# Render the parsed Congo document back to a string to inspect its markup.
str(html3)

'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n\n<!-- saved from url=(0050)http://www.pintprice.com/region.php?/Congo/USD.htm -->\n<html xmlns="http://www.w3.org/1999/xhtml"><head><meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>\n<meta content="IE=EmulateIE7" http-equiv="X-UA-Compatible"/>\n<title> Congo - Pintprice, compare beer prices from around the world</title>\n<link href="./Congo - Pintprice, compare beer prices from around the world_files/styles.css" rel="stylesheet" type="text/css"/>\n<!-- TradeDoubler site verification 1723893 -->\n<script async="" crossorigin="anonymous" src="./Congo - Pintprice, compare beer prices from around the world_files/all.js"></script><script async="" src="./Congo - Pintprice, compare beer prices from around the world_files/gpt.js" type="text/javascript"></script><script type="text/javascript">\nvar googletag = googletag || {};\ngoogletag.cmd = googletag

In [28]:
import pickle

# Read the body before opening the output file, so a failed read cannot leave
# an empty congo.pkl behind.
# NOTE(review): `html` must be a response that has not been read yet — confirm
# which cell last assigned it.
page_bytes = html.read()
# `with` guarantees the file is flushed and closed once the dump finishes.
with open("congo.pkl", 'wb') as pkl_file:
    pickle.dump(page_bytes, pkl_file)

In [29]:
# Reload the pickled page bytes. Only unpickle files this notebook wrote
# itself: pickle.load can execute arbitrary code from untrusted data.
with open("congo.pkl", 'rb') as pkl_file:
    h2 = pickle.load(pkl_file)

In [31]:
# Display the value loaded from congo.pkl to check the pickle round trip.
h2

b'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\r\n<html xmlns="http://www.w3.org/1999/xhtml">\r\n<head>\r\n<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1" />\r\n<meta http-equiv="X-UA-Compatible" content="IE=EmulateIE7"/>\r\n<title> Congo - Pintprice, compare beer prices from around the world</title>\r\n<link href="css/styles.css" rel="stylesheet" type="text/css" />\r\n<!-- TradeDoubler site verification 1723893 -->\r\n<script type=\'text/javascript\'>\r\nvar googletag = googletag || {};\r\ngoogletag.cmd = googletag.cmd || [];\r\n(function() {\r\nvar gads = document.createElement(\'script\');\r\ngads.async = true;\r\ngads.type = \'text/javascript\';\r\nvar useSSL = \'https:\' == document.location.protocol;\r\ngads.src = (useSSL ? \'https:\' : \'http:\') + \r\n\'//www.googletagservices.com/tag/js/gpt.js\';\r\nvar node = document.getElementsByTagName(\'script\')[0];\r\nnode.parentNode.ins

In [12]:
# The UK page carries one extra leading <td>, so drop the first cell before
# pairing the remaining cells up into two-item rows.
pint_price = {}
uk_tds = [td.get_text().strip() for td in soup.find_all("td")[1:]]
uk_table = []
for x in range(0, len(uk_tds), 2):
    first_cell, second_cell = uk_tds[x], uk_tds[x+1]
    uk_table.append([first_cell, second_cell])
pint_price["United Kingdom"] = uk_table

In [13]:
# Scrape every remaining country; the United Kingdom table was built by hand
# above, so it is skipped here.
for c in countries:
    if c == "United Kingdom":
        continue
    pint_price[c] = get_prices(c)
    # Short pause between requests.
    time.sleep(0.1)

In [17]:
# Move Cost/Price to the column headings
pint_price_df = pd.DataFrame(pint_price['United Kingdom'])
pint_price_df.columns = pint_price_df.iloc[0]
pint_price_df = pint_price_df.drop(pint_price_df.index[0])

# set the Country column and clean up the Price column
pint_price_df["Country"] = "United Kingdom"
pint_price_df['Price'] = pint_price_df['Price'].str.strip('$ USD').astype(float)

In [24]:
# Sanity check: the cleaned Price column should now have dtype float64.
pint_price_df["Price"].dtype == "float64"

True

In [7]:
# Build one frame per remaining country the same way as the United Kingdom
# frame, then append them all to pint_price_df with a single concat.
country_frames = [pint_price_df]
for country, table in pint_price.items():
    if country == "United Kingdom":
        continue
    # get_prices() returns None when a page could not be fetched, and a page
    # without <td> cells gives an empty list; either would make `.iloc[0]`
    # below raise IndexError, so such countries are skipped.
    if not table:
        continue
    pp = pd.DataFrame(table)
    pp.columns = pp.iloc[0]
    pp = pp.drop(pp.index[0])

    pp['Country'] = country
    # errors="ignore" keeps the stripped text as-is when a value in the
    # column cannot be converted to float.
    pp['Price'] = pp['Price'].str.strip("$ USD").astype(float, errors="ignore")
    country_frames.append(pp)

# Concatenating once avoids re-copying the accumulated frame on every pass.
pint_price_df = pd.concat(country_frames)

In [8]:
# Give the three scraped columns their final names (positionally), then put
# them in the order wanted for the output.
scraped_column_names = ['city_ascii', 'beer_pub', 'country']
output_column_order = ['city_ascii', 'country', 'beer_pub']
pint_price_df.columns = scraped_column_names
pint_price_df = pint_price_df.reindex(columns=output_column_order)

In [9]:
# Drop rows whose price is the literal "npriced" — presumably what is left of
# "Unpriced" after the earlier str.strip("$ USD") removed the leading "U".
has_price = pint_price_df['beer_pub'] != "npriced"
# Take an explicit copy so the column assignment below writes to an
# independent frame instead of a filtered slice (avoids SettingWithCopyWarning).
pint_price_df = pint_price_df[has_price].copy()
pint_price_df['city_ascii'] = pint_price_df['city_ascii'].str.title()

In [10]:
# Write the combined table to pintprice.csv in the working directory.
# No `index` argument is passed, so pandas' default applies and the frame's
# index is written as the first CSV column.
pint_price_df.to_csv("pintprice.csv")