# Scrape BeerAdvocate for beer ratings
This notebook scrapes BeerAdvocate for beer ratings so that a linear regression can later be fitted to the ratings in terms of the other information available for each beer. We start from the style index page, since it links to every style page, and each style page lists all beers of that style.

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import random
import os
import re
import time
import matplotlib.pyplot as plt
import seaborn as sns
from fake_useragent import UserAgent
from collections import defaultdict

%matplotlib inline

## Useful constants and functions

In [2]:
# Root of the BeerAdvocate site; hrefs scraped from its pages are appended to it.
base_url = "https://www.beeradvocate.com"
# Style index page, the entry point of the scrape.
start_url = base_url+"/beer/style/"

# Provider of random User-agent strings (read via ua.random when downloading).
ua = UserAgent()

def download_parse_ba(style_file, url):
    """
    Download a BeerAdvocate page and cache its main content locally.

    style_file: file to be written (including subdirectory)
    url: url to read from

    Requests the url with a randomly chosen User-agent header, extracts
    the element with id 'ba-content' and writes it to style_file as HTML.
    Afterwards sleeps for 1 to 3 seconds so that consecutive calls are
    spaced out.

    Raises requests.HTTPError if the server answers with an error status
    and ValueError if the page has no 'ba-content' element. In both cases
    nothing is written, so a broken page is never left on disk where the
    callers' "file already exists" checks would treat it as a valid cache.
    Returns None.
    """

    user_agent = {'User-agent': ua.random}

    # The timeout keeps a stalled connection from hanging the whole scrape.
    r = requests.get(url, headers=user_agent, timeout=30)
    r.raise_for_status()

    soup = BeautifulSoup(r.text, "lxml")
    main_content = soup.find(id='ba-content')

    # Without this guard str(None) == "None" would be written to the file.
    if main_content is None:
        raise ValueError("No 'ba-content' element found at " + url)

    with open(style_file, 'w') as file:
        file.write(str(main_content))

    time.sleep(random.uniform(1, 3))

    return

def find_style_num(style_file):
    """
    style_file: html file in style list
    Returns the total number of beers in the style category

    The count is taken from the text of the first <b> inside the first
    <span> of the first <table> in the file: it is the number located
    directly to the left of a closing parenthesis. Thousands separators
    (",") inside the number are accepted and ignored.
    Raises ValueError if that text contains no such number.
    """
    with open(style_file, 'r') as file:
        page = file.read()

    soup = BeautifulSoup(page, "lxml")

    # Text that carries the number of beers for the style
    beer_num_tag = soup.find('table').find('span').find('b').text

    # Digits (optionally grouped by commas) directly to the left of a
    # parenthesis. A raw string is used so that \d reaches the regex engine
    # unchanged instead of being treated as a string escape.
    criteria = re.compile(r'(\d[\d,]*)\)')

    found = criteria.search(beer_num_tag)
    if found is None:
        raise ValueError(
            "No beer count found in " + repr(beer_num_tag)
            + " (file: " + str(style_file) + ")"
        )

    return int(found.group(1).replace(",", ""))

def parse_beers_links(style_file, beer_dict):
    """
    Collect the beers listed in one cached style page.

    style_file: html file in style list
    beer_dict: dictionary of beers, updated in place and keyed by beer name

    Every processed table row adds (or overwrites) the entry
    beer_dict[name] = [link, brewery, abv, score, ratings]
    where ratings is an int and the remaining values are kept as text.

    Returns the tuple (beer_dict, stop_flag). stop_flag is True if a beer
    on the page had less than 10 ratings and False otherwise.
    """
    with open(style_file, 'r') as file:
        markup = file.read()

    rows = BeautifulSoup(markup, "lxml").find("table").findAll("tr")

    # Turns True as soon as one beer with fewer than 10 ratings is seen.
    # The remaining rows of the page are still recorded.
    stop_flag = False

    # The first three rows and the last row of the table are skipped
    for row in rows[3:-1]:
        cells = row.findAll("td")

        anchor = cells[0].find("a")
        link, name = anchor['href'], anchor.text
        brewery = cells[1].find("a").text
        abv = cells[2].find("span").text
        score = cells[3].find("b").text
        # Drop thousands separators before converting the rating count
        ratings = int(cells[4].find("b").text.replace(",", ""))

        stop_flag = stop_flag or ratings < 10

        beer_dict[name] = [link, brewery, abv, score, ratings]

    return beer_dict, stop_flag

## Grab first style file to start

In [3]:


# Local cache of the style index page
styles = os.path.join(os.path.curdir, "data", "styles.html")

if not os.path.exists(styles):
    # exist_ok: the data directory may already exist even though the
    # cached index is missing; a plain makedirs would raise in that case.
    os.makedirs(os.path.dirname(styles), exist_ok=True)
    r = requests.get(start_url)
    # Do not cache an error response as if it were the style index
    r.raise_for_status()
    page = r.text
    with open(styles, 'w') as file:
        file.write(page)
else:
    # Reuse the cached copy instead of hitting the site again
    with open(styles, 'r') as file:
        page = file.read()
        


In [4]:
# Parse the style index HTML held in `page` with the lxml parser.
soup = BeautifulSoup(page,"lxml")

In [5]:
# Type = Ale, Lager, or Hybrid
# All tables nested inside the first table of the page; the loop that
# builds style_link_dict reads one beer type and its style links from each.
type_tables = soup.find('table').findAll('table')

In [6]:
# Maps beer type -> {normalised style name -> absolute url of the style page}
style_link_dict = {}

for type_table in type_tables:
    # The first word of the table's first <span> names the beer type
    beer_type = type_table.find('span').text.split(" ")[0]
    links_for_type = style_link_dict[beer_type] = {}

    for anchor in type_table.findAll('a'):
        # Build a name usable as a directory/file name: strip all
        # whitespace, drop "/", spell out "&", cut any "(...)" suffix
        # and replace the accented letters that occur in style names.
        name = "".join(anchor.text.split())
        name = name.replace("/", "").replace("&", "And")
        name = name.split("(")[0]
        for accented, plain in (("è", "e"), ("ö", "o"), ("ä", "a")):
            name = name.replace(accented, plain)

        links_for_type[name] = base_url + anchor['href']
        
        

In [7]:
# Inspect the style name -> url mapping collected for the Hybrid type.
style_link_dict['Hybrid']

{'FruitVegetableBeer': 'https://www.beeradvocate.com/beer/style/9/',
 'HerbedSpicedBeer': 'https://www.beeradvocate.com/beer/style/8/',
 'SmokedBeer': 'https://www.beeradvocate.com/beer/style/11/'}

## Grab first page of each style

In [8]:
# Cache the first listing page of every style as
# data/<beer type>/<style>/<style>.html; files that exist are left untouched.
for beer_type, styles in style_link_dict.items():

    beer_type_dir = os.path.join(os.path.curdir, "data", beer_type)
    if not os.path.exists(beer_type_dir):
        os.makedirs(beer_type_dir)

    for style, url in styles.items():

        style_type_dir = os.path.join(beer_type_dir, style)
        if not os.path.exists(style_type_dir):
            os.makedirs(style_type_dir)

        style_file = os.path.join(style_type_dir, "{}.html".format(style))
        if os.path.exists(style_file):
            # Already downloaded, nothing to do for this style
            continue

        download_parse_ba(style_file, url)

## Grab all pages that link to beer reviews based on first page

In [15]:
# Grab just ales. The type is selected by name instead of stopping after
# the first entry of style_link_dict, so the result does not depend on the
# iteration order of the dictionary.
beer_type = "Ale"
styles = style_link_dict[beer_type]

beer_type_dir = os.path.join(os.path.curdir, "data", beer_type)

for style, url in styles.items():

    style_type_dir = os.path.join(beer_type_dir, style)

    first_style_file = os.path.join(style_type_dir, style+".html")

    beer_count = find_style_num(first_style_file)

    # Further pages are requested in steps of 50 (start=50, 100, ...).
    # NOTE(review): when beer_count is an exact multiple of 50 the last
    # start offset equals beer_count, so that page may list no beers -
    # confirm. The formula is kept because the parsing cell further down
    # derives the file names it opens from the same page count.
    page_num = beer_count // 50

    print(page_num)

    # Get all pages that link to reviews of beer
    for i in range(1, page_num+1):

        style_file = os.path.join(style_type_dir, style+str(i)+".html")
        url_params = url + "?sort=revsD&start="+str(i*50)

        # Pages cached by an earlier run are not downloaded again
        if not os.path.exists(style_file):
            download_parse_ba(style_file, url_params)

    print("Completed "+style)
        

122
Completed AmericanAmberRedAle
27
Completed AmericanBarleywine
46
Completed AmericanBlackAle
81
Completed AmericanBlondeAle
78
Completed AmericanBrownAle
3
Completed AmericanDarkWheatAle
171
Completed AmericanDoubleImperialIPA
87
Completed AmericanDoubleImperialStout
441
Completed AmericanIPA
243
Completed AmericanPaleAle
59
Completed AmericanPaleWheatAle
118
Completed AmericanPorter
96
Completed AmericanStout
19
Completed AmericanStrongAle
97
Completed AmericanWildAle
1
Completed BlackAndTan
7
Completed ChileBeer
28
Completed CreamAle
27
Completed PumpkinAle
29
Completed RyeBeer
4
Completed Wheatwine
14
Completed BelgianDarkAle
24
Completed BelgianIPA
58
Completed BelgianPaleAle
25
Completed BelgianStrongDarkAle
28
Completed BelgianStrongPaleAle
0
Completed BieredeChampagneBiereBrut
12
Completed BieredeGarde
25
Completed Dubbel
0
Completed Faro
5
Completed FlandersOudBruin
6
Completed FlandersRedAle
2
Completed Gueuze
8
Completed Lambic-Fruit
1
Completed Lambic-Unblended
15
Complet

## Just use ales for now: they already provide plenty of beer examples

In [35]:
# Maps style name -> {beer name -> [link, brewery, abv, score, ratings]}
ale_dict = {}

for style in style_link_dict["Ale"]:

    style_dir = os.path.join("data", "Ale", style)
    first_style_file = os.path.join(style_dir, style+".html")

    # Number of additional cached pages that follow the first one
    page_count = find_style_num(first_style_file) // 50

    # Files in reading order: the first page, then pages 1..page_count
    page_files = [first_style_file] + [
        os.path.join(style_dir, style+str(i)+".html")
        for i in range(1, page_count+1)
    ]

    beers = ale_dict[style] = {}

    for style_file in page_files:
        # parse_beers_links fills `beers` in place and reports whether the
        # page contained a beer with fewer than 10 ratings
        _, stop_reading = parse_beers_links(style_file, beers)

        # Such a page is the last one read for this style
        if stop_reading:
            break


In [36]:
# Number of distinct beer names collected for the AmericanIPA style.
len(ale_dict['AmericanIPA'])

5190

In [37]:
# Total number of beers collected over all ale styles
num_entries = sum(len(values) for values in ale_dict.values())

num_entries

35762

In [75]:
# One frame per style: rows are beers (indexed by beer name), columns are
# the positional fields stored for each beer plus the name of the style.
ale_df_list = [
    pd.DataFrame.from_dict(data, orient='index').assign(style=style)
    for style, data in ale_dict.items()
]

# Stack all styles and give the positional columns readable names
ale_df = pd.concat(ale_df_list).rename(
    columns={0: 'link', 1: 'brewery', 2: 'abv', 3:'score', 4:'ratings'}
)

# Drop beers whose abv is the placeholder ' ? ', then those whose score is '-'
filtered_ale_df = ale_df.loc[ale_df['abv'].ne(' ? ')]
filtered_ale_df2 = filtered_ale_df.loc[filtered_ale_df['score'].ne('-')]

filtered_ale_df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 34086 entries, Fat Tire Amber Ale to Grewit Barrel-aged Old World Ale
Data columns (total 6 columns):
link       34086 non-null object
brewery    34086 non-null object
abv        34086 non-null object
score      34086 non-null object
ratings    34086 non-null int64
style      34086 non-null object
dtypes: int64(1), object(5)
memory usage: 1.8+ MB


In [76]:
# The 'link' column of the filtered beers, indexed like filtered_ale_df2.
links = filtered_ale_df2.loc[:, 'link']


In [77]:
data_dir = os.path.join(os.path.curdir, "data")

# Write the filtered beer table and the link column to CSV files in data_dir
for frame, filename in ((filtered_ale_df2, 'initial_data.csv'),
                        (links, 'links.csv')):
    frame.to_csv(os.path.join(data_dir, filename))