# Scrape BeerAdvocate for beer ratings
This notebook scrapes BeerAdvocate for ratings of beer to subsequently perform linear regression on the ratings in terms of other available information on the beer. We start with the style page since we can use this to obtain a list of all beers of that style easily from there.

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import random
import os
import re
import time
import matplotlib.pyplot as plt
import seaborn as sns
from fake_useragent import UserAgent
from collections import defaultdict

%matplotlib inline

## Useful constants and functions

In [25]:
# Root of the site and the style index page the scrape starts from
base_url = "https://www.beeradvocate.com"
start_url = "{}/beer/style/".format(base_url)

# Supplies the User-agent strings sent with page requests
ua = UserAgent()

def download_parse_ba(style_file, url, timeout=30):
    """
    Download a BeerAdvocate page and store its main content locally.

    style_file: file to be written (including subdirectory)
    url: url to read from
    timeout: seconds to wait for the server before giving up

    Reads a url from BeerAdvocate.com and dumps the element with
    id 'ba-content' into a local HTML file, then pauses for 1-3 seconds
    so that consecutive calls are spaced out.

    Raises requests.HTTPError if the server answers with an error status
    and ValueError if the page has no 'ba-content' element. In both cases
    nothing is written, so a caller that skips files already on disk will
    retry the download on its next run instead of keeping a useless file.
    """

    # A new random User-agent string is drawn for every request
    user_agent = {'User-agent': ua.random}

    # Without a timeout a stalled connection would block the scrape forever
    r = requests.get(url, headers=user_agent, timeout=timeout)
    r.raise_for_status()

    soup = BeautifulSoup(r.text, "lxml")
    main_content = soup.find(id='ba-content')
    if main_content is None:
        # str(None) would otherwise be written out as the literal text "None"
        raise ValueError("no 'ba-content' element found at " + url)

    with open(style_file, 'w') as file:
        file.write(str(main_content))

    # Random pause between requests
    time.sleep(random.uniform(1, 3))

## Grab first style file to start

In [16]:


# Local copy of the style index page; downloaded once and reused afterwards
styles = os.path.join(os.path.curdir, "data", "styles.html")

if not os.path.exists(styles):
    # exist_ok: the data directory may already exist even though the
    # index page has not been saved yet (e.g. after an aborted run)
    os.makedirs(os.path.dirname(styles), exist_ok=True)
    r = requests.get(start_url)
    # Fail here rather than saving an error page that every later run would reuse
    r.raise_for_status()
    page = r.text
    with open(styles, 'w') as file:
        file.write(page)
else:
    with open(styles, 'r') as file:
        page = file.read()
        


In [17]:
# Parse the style index HTML held in `page` with the lxml parser
soup = BeautifulSoup(markup=page, features="lxml")

In [18]:
# Type = Ale, Lager, or Hybrid
# Every table nested inside the first table of the page is collected;
# find_all is the current name of the deprecated findAll alias.
type_tables = soup.find('table').find_all('table')

In [40]:
# Single-character substitutions applied to a style name:
# "/" is dropped, "&" is spelled out and accented letters lose their accent.
_STYLE_NAME_TABLE = str.maketrans({
    "/": None,
    "&": "And",
    "è": "e",
    "ö": "o",
    "ä": "a",
})


def _clean_style_name(raw_name):
    """
    Turn the link text of a style into the name used for its directory and files.

    All whitespace is removed, everything from the first "(" onwards is cut
    off, and the substitutions in _STYLE_NAME_TABLE are applied.
    """
    name = "".join(raw_name.split()).split("(")[0]
    return name.translate(_STYLE_NAME_TABLE)


# beer type -> {cleaned style name -> absolute url of that style's page}
style_link_dict = dict()
for type_table in type_tables:
    # The beer type is the first word of the table's first span
    beer_type = type_table.find('span').text.split(" ")[0]
    style_link_dict[beer_type] = {
        _clean_style_name(style.text): base_url + style['href']
        for style in type_table.find_all('a')
    }
        
        

In [41]:
# Spot-check the style links collected for one beer type
style_link_dict['Hybrid']

{'FruitVegetableBeer': 'https://www.beeradvocate.com/beer/style/9/',
 'HerbedSpicedBeer': 'https://www.beeradvocate.com/beer/style/8/',
 'SmokedBeer': 'https://www.beeradvocate.com/beer/style/11/'}

## Grab first page of each style

In [24]:
# Download the first page of every style to
# data/<beer type>/<style>/<style>.html; files already on disk are kept.
# The inner dict is called style_links so that it does not overwrite the
# `styles` path variable defined earlier in the notebook.
for beer_type, style_links in style_link_dict.items():

    beer_type_dir = os.path.join(os.path.curdir, "data", beer_type)
    os.makedirs(beer_type_dir, exist_ok=True)

    for style, url in style_links.items():
        style_type_dir = os.path.join(beer_type_dir, style)
        os.makedirs(style_type_dir, exist_ok=True)

        style_file = os.path.join(style_type_dir, style+".html")

        # Only hit the site for pages that have not been saved yet
        if not os.path.exists(style_file):
            download_parse_ba(style_file, url)

In [37]:

    
    

# Try the beer-count extraction on a single saved style page
test_page = os.path.join(os.path.curdir, "data", "Ale", "AmericanIPA", "AmericanIPA.html")

with open(test_page, 'r') as file:
    page = file.read()

soup = BeautifulSoup(page,"lxml")

# Bold text inside the first span of the first table
beer_num = soup.find('table').find('span').find('b').text

# Raw string, so that \w and \) are regex escapes rather than (invalid)
# Python string escapes. The group captures the word characters directly
# in front of a closing parenthesis.
criteria = re.compile(r'(\w+)\)')

found = re.search(criteria, beer_num)
int(found.group(1))

22083

## Grab all pages that link to beer reviews based on first page

In [62]:
# Step between the `start` values of consecutive pages of a style
BEERS_PER_PAGE = 50

# Compiled once for all styles. Raw string, so that \w and \) are regex
# escapes rather than (invalid) Python string escapes. The group captures
# the word characters directly in front of a closing parenthesis.
beer_count_pattern = re.compile(r'(\w+)\)')

for beer_type, style_links in style_link_dict.items():

    beer_type_dir = os.path.join(os.path.curdir, "data", beer_type)

    for style, url in style_links.items():

        style_type_dir = os.path.join(beer_type_dir, style)

        # The first page of the style must already be on disk
        first_style_file = os.path.join(style_type_dir, style+".html")

        with open(first_style_file, 'r') as file:
            page = file.read()

        soup = BeautifulSoup(page,"lxml")

        # Find number of beers for the style
        beer_num_tag = soup.find('table').find('span').find('b').text
        found = beer_count_pattern.search(beer_num_tag)
        beer_count = int(found.group(1))

        # Index of the last page, with the already saved first page as
        # page 0. Subtracting 1 before dividing avoids requesting an empty
        # extra page when the count is an exact multiple of BEERS_PER_PAGE.
        last_page = (beer_count - 1) // BEERS_PER_PAGE

        # Get all pages that link to reviews of beer
        # NOTE(review): the first page was downloaded without the sort
        # parameter -- confirm that the site's default ordering matches
        # sort=revsD, otherwise beers may be duplicated or missed.
        for i in range(1, last_page+1):

            style_file = os.path.join(style_type_dir, style+str(i)+".html")
            url_params = url + "?sort=revsD&start="+str(i*BEERS_PER_PAGE)

            # Skip pages saved by an earlier (possibly interrupted) run
            if not os.path.exists(style_file):
                download_parse_ba(style_file, url_params)
        

KeyboardInterrupt: 