# Scrape BeerAdvocate for beer ratings
This notebook scrapes BeerAdvocate for beer ratings so that we can later fit a linear regression of the ratings on the other information available for each beer. We start from the style index page, because each style's page makes it easy to list all beers of that style.

In [None]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import random
import os
import re
import time
import matplotlib.pyplot as plt
import seaborn as sns
from fake_useragent import UserAgent
from collections import defaultdict

%matplotlib inline

## Useful constants and functions

In [None]:
# Root of the site; relative links scraped from pages are appended to it.
base_url = "https://www.beeradvocate.com"
# Index page listing every beer style.
start_url = "{}/beer/style/".format(base_url)

# Supplies the random User-agent strings used by download_parse_ba.
ua = UserAgent()

def download_parse_ba(style_file, url):
    """
    style_file: file to be written (including subdirectory)
    url: url to read from
    Downloads a page from BeerAdvocate.com using a randomly chosen
    User-agent header, keeps only the element with id 'ba-content'
    and writes it, as text, to style_file.
    Sleeps for a random 2 to 4 seconds before returning None.
    """
    response = requests.get(url, headers={'User-agent': ua.random})

    parsed_page = BeautifulSoup(response.text, "lxml")
    content_tag = parsed_page.find(id='ba-content')

    with open(style_file, 'w') as out_file:
        out_file.write(str(content_tag))

    # Random delay after every request (presumably to avoid hammering the site).
    pause_seconds = random.uniform(2, 4)
    time.sleep(pause_seconds)

def find_style_num(style_file):
    """
    style_file: html file in style list
    Returns the total number of beers in the style category, read from
    the number that directly precedes a closing parenthesis in the bold
    text of the first span of the first table on the page.
    Raises ValueError if no such number is present.
    """
    with open(style_file, 'r') as file:
        soup = BeautifulSoup(file.read(), "lxml")

    # Text holding the number of beers for the style
    beer_num_text = soup.find('table').find('span').find('b').text

    # Digits (optionally with thousands separators, which the site uses for
    # rating counts elsewhere) directly to the left of a closing parenthesis.
    # Raw string: '\d' and '\)' are not valid escapes in a normal literal.
    found = re.search(r'(\d[\d,]*)\)', beer_num_text)

    if found is None:
        raise ValueError(
            "No beer count found in " + repr(beer_num_text)
            + " (" + str(style_file) + ")"
        )

    return int(found.group(1).replace(',', ''))

def parse_beers_links(style_file, beer_dict):
    """
    style_file: html file in style list
    beer_dict: dictionary of beers, updated in place
    Adds one entry per beer row of the page's first table, keyed by
    '<beer name>-<brewery>' (with '/' and '.' removed from both parts)
    and holding [link, brewery, abv, score, ratings]. abv and score are
    kept as the strings found on the page; ratings is converted to int.
    Beers with less than 10 ratings are still added.
    Returns a tuple (beer_dict, stop_flag) where stop_flag is True if a
    beer on the page had less than 10 ratings and False otherwise
    """

    with open(style_file, 'r') as file:
        soup = BeautifulSoup(file.read(), "lxml")

    beer_rows = soup.find("table").findAll("tr")

    # Signals the caller that this page reached beers with fewer than
    # 10 ratings
    stop_flag = False

    # The first three rows and the last row are skipped
    # (presumably headers and page navigation rather than beers).
    for beer_row in beer_rows[3:-1]:
        table_entries = beer_row.findAll("td")
        namelink = table_entries[0].find("a")

        beer_link = namelink['href']

        # '/' and '.' are stripped because the key is later used to build
        # a file name.
        beer_name = namelink.text.replace("/", "").replace(".", "")

        brewery = table_entries[1].find("a").text
        brewery = brewery.replace("/", "").replace(".", "")

        abv = table_entries[2].find("span").text

        score = table_entries[3].find("b").text

        # Rating counts use ',' as thousands separator
        ratings = int(table_entries[4].find("b").text.replace(",", ""))

        if ratings < 10:
            stop_flag = True

        beer_dict[beer_name+'-'+brewery] = [beer_link, brewery, abv, score, ratings]

    return beer_dict, stop_flag

def _mean_or_nan(values):
    """Returns the arithmetic mean of values, or NaN for an empty list"""
    return sum(values)/len(values) if values else np.nan


def parse_beer_reviews(beer_dict):
    """
    beer_dict: dictionary keyed by beer; only the keys are used, each one
    naming the saved page data/beers/<key>.html
    Returns a dictionary mapping each beer to a dictionary with its
    'ba_score' (int, or NaN when the page shows '-'), the page texts for
    'ratings', 'reviews', 'perc_dev' (whitespace removed), 'wants' and
    'gots', and the averages of the 'look', 'smell', 'taste' and 'feel'
    sub-scores over the reviews found on the page (NaN when none found).
    Beers whose page has no BA score element are printed and skipped.
    """
    beer_dir = os.path.join(os.path.curdir, "data", "beers")

    # One sub-score followed by whitespace and '|', e.g. "4.25 |".
    # Compiled once; the group captures only the number.
    criteria = re.compile(r"([1-5]\.?\d*)\s\|")

    beer_rating_dict = dict()

    for beer in beer_dict:

        beer_path = os.path.join(beer_dir, beer+".html")

        with open(beer_path, 'r') as file:
            soup = BeautifulSoup(file.read(), "lxml")

        score_tag = soup.find(class_="BAscore_big ba-score")
        if score_tag is None:
            # Page without a score element: report the file and move on
            print(beer+".html")
            continue

        ba_score = score_tag.text
        if ba_score == '-':
            ba_score = np.nan
        else:
            ba_score = int(ba_score)

        beer_stats = soup.find(id="item_stats")
        ratings = beer_stats.find(class_="ba-ratings").text
        reviews = beer_stats.find(class_="ba-reviews").text
        perc_dev = beer_stats.find(class_="ba-pdev").text
        perc_dev = re.sub(r'\s+', '', perc_dev)
        wants = beer_stats.find(class_="ba-wants").text
        gots = beer_stats.find(class_="ba-gots").text

        look = []
        smell = []
        taste = []
        feel = []

        # Kept under its own name so that the 'ratings' count read above
        # is what ends up in the result.
        review_tags = soup.findAll(id="rating_fullview_content_2")
        for review_tag in review_tags:
            breakdown = review_tag.find(class_="muted")
            if breakdown is None:
                continue
            found = criteria.findall(breakdown.text)
            # A usable breakdown has at least look, smell, taste and feel
            if len(found) >= 4:
                look.append(float(found[0]))
                smell.append(float(found[1]))
                taste.append(float(found[2]))
                feel.append(float(found[3]))

        beer_rating_dict[beer] = {
            'ba_score': ba_score,
            'ratings': ratings,
            'reviews': reviews,
            'perc_dev': perc_dev,
            'wants': wants,
            'gots': gots,
            'look': _mean_or_nan(look),
            'smell': _mean_or_nan(smell),
            'taste': _mean_or_nan(taste),
            'feel': _mean_or_nan(feel),
        }
    return beer_rating_dict

## Grab first style file to start

In [None]:


# Local copy of the style index page; downloaded once and reused afterwards.
styles = os.path.join(os.path.curdir, "data", "styles.html")

if not os.path.exists(styles):
    # exist_ok: the data directory may already be there even though the
    # cached page is not (e.g. after deleting only styles.html).
    os.makedirs(os.path.dirname(styles), exist_ok=True)
    r = requests.get(start_url)
    page = r.text
    with open(styles, 'w') as file:
        file.write(page)
else:
    with open(styles, 'r') as file:
        page = file.read()
        


In [None]:
# Parse the style index HTML held in `page` with the lxml parser.
soup = BeautifulSoup(page,"lxml")

In [None]:
# Type = Ale, Lager, or Hybrid
# Every table nested inside the page's first table is handled as one beer
# type by the next cell.
type_tables = soup.find('table').findAll('table')

In [None]:
# Characters replaced in style names so they can serve as directory and
# file names.
name_replacements = (("/", ""), ("&", "And"), ("è", "e"), ("ö", "o"), ("ä", "a"))

# beer type -> {cleaned style name -> absolute URL of the style page}
style_link_dict = dict()
for type_table in type_tables:
    # First word of the text in the table's first span
    beer_type = type_table.find('span').text.split(" ")[0]
    type_links = style_link_dict[beer_type] = dict()
    for style in type_table.findAll('a'):
        # Drop all whitespace, then everything from the first "(" onwards
        style_name = "".join(style.text.split()).split("(")[0]
        for old_text, new_text in name_replacements:
            style_name = style_name.replace(old_text, new_text)
        type_links[style_name] = base_url+style['href']
        
        

In [None]:
# Show the style name -> URL mapping collected for the 'Hybrid' type.
style_link_dict['Hybrid']

## Grab first page of each style

In [None]:
# Save the first page of every style as data/<type>/<style>/<style>.html,
# skipping pages that were already downloaded.
for beer_type, styles in style_link_dict.items():
    beer_type_dir = os.path.join(os.path.curdir, "data", beer_type)
    if not os.path.exists(beer_type_dir):
        os.makedirs(beer_type_dir)

    for style, url in styles.items():
        style_type_dir = os.path.join(beer_type_dir, style)
        style_file = os.path.join(style_type_dir, style+".html")

        if os.path.exists(style_file):
            # Already cached, so its directory exists as well
            continue

        if not os.path.exists(style_type_dir):
            os.makedirs(style_type_dir)
        download_parse_ba(style_file, url)

## Grab all pages that link to beer reviews based on first page

In [None]:
# Only ales are scraped. They are selected by key, as the parsing cell
# below does, instead of stopping after whichever type is iterated first.
beer_type = "Ale"
beer_type_dir = os.path.join(os.path.curdir, "data", beer_type)

for style, url in style_link_dict[beer_type].items():

    style_type_dir = os.path.join(beer_type_dir, style)

    first_style_file = os.path.join(style_type_dir, style+".html")

    beer_count = find_style_num(first_style_file)

    # Number of follow-up pages; the start offset advances by 50 per page.
    # NOTE(review): when beer_count is an exact multiple of 50 this requests
    # one page past the end; kept because the parsing cell computes the same
    # page count and expects these files to exist.
    page_num = beer_count // 50

    print(page_num)

    # Get all pages that link to reviews of beer
    for i in range(1, page_num+1):

        style_file = os.path.join(style_type_dir, style+str(i)+".html")
        url_params = url + "?sort=revsD&start="+str(i*50)

        # Pages already on disk are not downloaded again
        if not os.path.exists(style_file):
            download_parse_ba(style_file, url_params)

    print("Completed "+style)
        

## Get data from pages on ales

In [None]:
# style name -> {beer key -> [link, brewery, abv, score, ratings]}
ale_dict = dict()

for style in style_link_dict["Ale"]:
    style_dir = os.path.join("data", "Ale", style)
    first_style_file = os.path.join(style_dir, style+".html")

    # Follow-up pages are numbered 1..page_count
    page_count = find_style_num(first_style_file) // 50

    ale_dict[style] = dict()
    ale_dict[style], stop_reading = parse_beers_links(first_style_file, ale_dict[style])

    # Keep reading follow-up pages until one reports a beer with fewer than
    # 10 ratings or the pages run out.
    page_index = 1
    while not stop_reading and page_index <= page_count:
        style_file = os.path.join(style_dir, style+str(page_index)+".html")
        ale_dict[style], stop_reading = parse_beers_links(style_file, ale_dict[style])
        page_index += 1


In [None]:
# Number of beers collected for the 'AmericanIPA' style.
len(ale_dict['AmericanIPA'])

In [None]:
# Total number of beers collected across all ale styles
num_entries = sum(len(values) for values in ale_dict.values())

num_entries

In [None]:
# One frame per style (rows indexed by beer key), tagged with the style name
ale_df_list = [
    pd.DataFrame.from_dict(data, orient='index').assign(style=style)
    for style, data in ale_dict.items()
]

# Positional columns follow the order of the lists stored per beer
column_names = {0: 'link', 1: 'brewery', 2: 'abv', 3:'score', 4:'ratings'}
ale_df = pd.concat(ale_df_list).rename(columns=column_names)

# Drop beers whose ABV is shown as ' ? ', then those whose score is '-'
has_abv = ale_df.abv != ' ? '
filtered_ale_df = ale_df[has_abv]
has_score = filtered_ale_df.score != '-'
filtered_ale_df2 = filtered_ale_df[has_score]

filtered_ale_df2.info()

In [None]:
# Series of beer page links, indexed like filtered_ale_df2 (by beer key).
links = filtered_ale_df2.loc[:, 'link']


In [None]:
# Persist the filtered table and the links next to the downloaded pages
data_dir = os.path.join(os.path.curdir, "data")

initial_data_csv = os.path.join(data_dir, 'initial_data.csv')
links_csv = os.path.join(data_dir, 'links.csv')

filtered_ale_df2.to_csv(initial_data_csv)
links.to_csv(links_csv)

## Get links for first 25 reviews

In [None]:
# Mapping from index label (beer key) to link.
# NOTE(review): if the same beer key occurs under several styles, only one
# link per key survives the conversion to a dict - confirm this is intended.
links_dict = links.to_dict()



In [None]:
# Directory holding one saved page per beer
beer_path = os.path.join("data", "beers")

os.makedirs(beer_path, exist_ok=True)

for beer, link in links_dict.items():
    beer_file = os.path.join(beer_path, beer+".html")

    # Pages already on disk are not downloaded again, so the cell can be
    # re-run after an interruption. download_parse_ba sleeps after every
    # request, so no extra throttling is done here.
    if not os.path.exists(beer_file):
        download_parse_ba(beer_file, base_url+link)
    

## Parse downloaded pages for beer to get more numerical data

In [None]:
# Extract per-beer statistics from the saved pages; only the keys of
# links_dict are used, to locate each beer's file.
beer_reviews_dict = parse_beer_reviews(links_dict)