# Scrape BeerAdvocate for beer ratings
This notebook scrapes beer ratings from BeerAdvocate so that we can later fit a linear regression of the ratings on the other information available for each beer. We start from the style index page, because it links to every style and each style page makes it easy to list all beers of that style.

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns
from fake_useragent import UserAgent
from collections import defaultdict

%matplotlib inline

## Useful helper classes and functions

In [2]:
# Download the BeerAdvocate style index once and cache it under ./data so
# that re-running the notebook reads the local copy instead of hitting the site.
start_url = "https://www.beeradvocate.com/beer/style/"

styles = os.path.join(os.path.curdir, "data", "styles.html")

if not os.path.exists(styles):
    # exist_ok: the data directory may already exist (e.g. holding other
    # cached pages) even though styles.html itself is missing.
    os.makedirs(os.path.dirname(styles), exist_ok=True)
    r = requests.get(start_url, timeout=30)
    # Fail loudly instead of caching an HTTP error page as styles.html.
    r.raise_for_status()
    page = r.text
    # Explicit UTF-8 so the page round-trips regardless of the platform's
    # default text encoding.
    with open(styles, 'w', encoding='utf-8') as file:
        file.write(page)
else:
    with open(styles, 'r', encoding='utf-8') as file:
        page = file.read()

In [3]:
# Parse the style index HTML held in `page` using the lxml parser.
soup = BeautifulSoup(page, features="lxml")

In [4]:
# Collect the tables nested inside the first table of the style index;
# each nested table is expected to hold one beer type
# (Type = Ale, Lager, or Hybrid).
outer_table = soup.find('table')
if outer_table is None:
    # Without this check the failure surfaces as an opaque AttributeError on None.
    raise ValueError(
        "No <table> found in the style index page; "
        "the cached HTML may not be the expected page."
    )
# find_all is the current Beautiful Soup name; findAll is the legacy alias.
type_tables = outer_table.find_all('table')

In [5]:
# Build a nested mapping: beer type -> style name -> list of style page URLs.
style_link_dict = dict()
for type_table in type_tables:
    # The first word of the table's first <span> text is used as the type key.
    beer_type = type_table.find('span').text.split(" ")[0]
    style_link_dict[beer_type] = defaultdict(list)
    # href=True skips anchors that carry no link target, which would
    # otherwise raise KeyError on style['href'].
    for style in type_table.find_all('a', href=True):
        # Strip all whitespace so e.g. "American IPA" becomes "AmericanIPA".
        style_name = "".join(style.text.split())
        # Use https to match start_url rather than relying on a redirect.
        style_link_dict[beer_type][style_name].append(
            "https://www.beeradvocate.com" + style['href']
        )
        

In [6]:
# Fetch one style page (American IPA) as a test case, caching only its first
# <table> on disk so later runs work offline.
test_style_url = style_link_dict['Ale']['AmericanIPA'][0]
test_style = os.path.join(os.path.curdir, "data", "test_style.html")

if not os.path.exists(test_style):
    r = requests.get(test_style_url, timeout=30)
    # Fail loudly instead of caching an HTTP error page.
    r.raise_for_status()
    fetched_table = BeautifulSoup(r.text, "lxml").find('table')
    if fetched_table is None:
        # Otherwise the literal string "None" would be written to the cache.
        raise ValueError("No <table> found at " + test_style_url)
    os.makedirs(os.path.dirname(test_style), exist_ok=True)
    # Explicit UTF-8: the page contains non-ASCII characters (e.g. arrows).
    with open(test_style, 'w', encoding='utf-8') as file:
        file.write(str(fetched_table))

# Always parse from the cached file so that `page`, `soup` and `main_content`
# are the same whether or not the page had to be downloaded on this run.
with open(test_style, 'r', encoding='utf-8') as file:
    page = file.read()
soup = BeautifulSoup(page, "lxml")
main_content = soup.find('table')

In [10]:
# Inspect the parsed markup: prettify() returns the indented HTML as a single
# string, which the notebook displays as this cell's output.
soup.prettify()

'<html>\n <body>\n  <table border="0" cellpadding="2" cellspacing="0" width="100%">\n   <tr>\n    <td align="left" bgcolor="#000000" colspan="6" valign="top" width="100%">\n     <span style="color: #FFFFFF">\n      <b>\n       Style Examples - 1 to 50 (out of 22070) - Ordered by # Reviews\n      </b>\n     </span>\n    </td>\n   </tr>\n   <tr>\n    <td align="right" colspan="6" valign="top" width="100%">\n     <span style="font-weight:bold;">\n      <span class="muted">\n       first ← prev\n      </span>\n      |\n      <b>\n       1-50\n      </b>\n      |\n      <a href="/beer/style/116/?sort=revsD&amp;start=50">\n       51-100\n      </a>\n      |\n      <a href="/beer/style/116/?sort=revsD&amp;start=100">\n       101-150\n      </a>\n      |\n      <a href="/beer/style/116/?sort=revsD&amp;start=50">\n       next\n      </a>\n      →\n      <a href="/beer/style/116/?sort=revsD&amp;start=22050">\n       last\n      </a>\n     </span>\n    </td>\n   </tr>\n   <tr>\n    <td bgcolor="#