In [18]:
# HTML parsing (BeautifulSoup, aliased `bs`) and HTTP access (requests, aliased `r`).
from bs4 import BeautifulSoup as bs
import requests as r
import re
import json
import csv
import sys
# Put ./scripts on the module search path so `scrapebrew` below can be imported.
sys.path.append("./scripts")
import warnings
# NOTE(review): this silences every warning for the rest of the kernel session,
# including deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')
import scrapebrew as sb

# Run the scraper entry point provided by the scrapebrew module.
# NOTE(review): presumably this produces the CSV data read by later cells -- confirm.
sb.scrape_brewer_friends()

In [142]:
import pandas as pd

# Read the small scrape sample; keep_default_na=False leaves empty fields as
# empty strings rather than converting them to NaN.
beer_recipes = pd.read_csv("./data/brewer_scrape_test_small.csv", keep_default_na=False)

# Take the first five rows once and reuse the slice for both look-ups.
test = beer_recipes.iloc[:5]
print(test["brewed_count"][0])

# Last expression of the cell: shown as the cell's output.
test["title_url"][0]



373


[]

In [162]:
def scrape_recipes_details(title_url_list):
    """
        scrape_recipes_details drives the detail scrape: it visits every beer recipe url it is given,
        collects the extra per-recipe information (mash pH, rating, review count, view count) together
        with the reviews left on each recipe, and writes both collections out to CSV once all urls
        have been processed.
        Args:
            title_url_list (iterable of string): The recipe page urls to scrape. Any iterable works
            (list, tuple, pandas Series); the values are consumed in iteration order.
    """
    general, reviews = [], []
    # Iterate over the values themselves. Indexing with title_url_list[i] is a *label* lookup on a
    # pandas Series, so it fails as soon as the Series does not carry a 0..n-1 index (for example
    # after filtering rows).
    for title_url in title_url_list:
        page_details, page_reviews = scrape_detail_page(title_url)
        print("Completed scraping beer recipes details from: " + title_url)
        general.append(page_details)
        # extend() grows the list in place instead of copying it on every page.
        reviews.extend(page_reviews)
    write_detail_recipe_to_csv(general)
    print("Completed writing detailed beer recipes to CSV")
    write_recipe_reviews_to_csv(reviews)
    print("Completed writing beer recipes reviews to CSV")

    

def write_detail_recipe_to_csv(recipes_list, output_path="data/brewer_scrape_detail_test.csv"):
    """
        write_detail_recipe_to_csv takes in a recipes_list array and writes the data out into a CSV
        with the columns title_url, mash_ph, rating_value, rating_count and view_count.
        Args:
            recipes_list (array): An array of dicts, one per beer recipe, holding the fields that are
            only available on a recipe's detail page.
            output_path (string): Where the CSV is written. Defaults to the location this notebook
            has always used. The parent directory must already exist.
        Raises:
            ValueError: If a row contains a key that is not one of the columns above.
    """
    field_names = ["title_url", "mash_ph", "rating_value", "rating_count", "view_count"]
    # newline='' is what the csv module asks for: without it the writer's own "\r\n" line endings
    # are translated a second time on Windows, producing a blank line after every row.
    with open(output_path, mode='w', encoding='utf-8', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=field_names)
        writer.writeheader()
        writer.writerows(recipes_list)
            
def write_recipe_reviews_to_csv(reviews_list, output_path="data/brewer_scrape_reviews_test.csv"):
    """
        write_recipe_reviews_to_csv takes in a reviews_list array and writes the data out into a CSV
        with the columns title_url, reviewer_url, reviewer_name, review_description, review_rating,
        reviewer_date and reviewer_time.
        Args:
            reviews_list (array): An array of dicts, one per review left on a beer recipe.
            output_path (string): Where the CSV is written. Defaults to the location this notebook
            has always used. The parent directory must already exist.
        Raises:
            ValueError: If a row contains a key that is not one of the columns above.
    """
    field_names = ["title_url", "reviewer_url", "reviewer_name", "review_description",
                   "review_rating", "reviewer_date", "reviewer_time"]
    # newline='' is what the csv module asks for: without it the writer's own "\r\n" line endings
    # are translated a second time on Windows, producing a blank line after every row. It also
    # keeps line breaks inside quoted review text intact.
    with open(output_path, mode='w', encoding='utf-8', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=field_names)
        writer.writeheader()
        writer.writerows(reviews_list)

def scrape_detail_page(page_url, timeout=30):
    """
        scrape_detail_page takes in a page_url string, downloads that beer recipe page and extracts
        both the general detail information and the reviews for that recipe.
        Args:
            page_url (string): A string that represents a beer recipe page url
            timeout (number): Seconds to wait for the server before giving up. Without a timeout
            requests waits indefinitely, so one stalled connection would freeze the whole scrape.
        Return:
            Returns two values. The first is a dict with additional general information about the
            beer recipe. The second is an array of the reviews left for that beer recipe.
        Raises:
            requests.exceptions.RequestException: If the request times out, fails, or the server
            answers with a 4xx/5xx status.
    """
    page = r.get(page_url, headers={'User-Agent':'Mozilla/5.0'}, timeout=timeout)
    # Fail loudly on error responses instead of parsing an error page as if it were a recipe.
    page.raise_for_status()
    soup = bs(page.content, "html.parser")

    page_general_details = extract_general_recipe_detail(soup, page_url)
    reviews_list = extract_recipe_reviews(soup, page_url)
    return page_general_details, reviews_list
    
    
def extract_general_recipe_detail(soup, title_url):
    """
        extract_general_recipe_detail takes in a BeautifulSoup object that represents the HTML makeup of the
        page at title_url for a particular beer recipe. This function extracts general information about the
        beer recipe so that it can be combined with an existing data set.

        Elements that are missing from the page do not abort the scrape; the matching field falls back to
        "N/A" (or 0 for rating_count), the same placeholders already used for recipes without ratings.
        Args:
            soup (object): A BeautifulSoup object that represents the HTML makeup of a page.
            title_url (string): A string that represents a beer recipe page url
        Return:
            Returns a dict with the keys title_url, mash_ph, rating_value, rating_count and view_count.
    """
    def stripped_text(node, default):
        # find() returns None when nothing matches; only read .text from a real node.
        return node.text.strip() if node is not None else default

    result = {}
    result["title_url"] = title_url
    result["mash_ph"] = stripped_text(soup.find("div", {"class": "phMin"}), "N/A")

    rating_value = "N/A"
    rating_count = 0
    rating_div = soup.find("div", {"class": "reviews"})
    if rating_div is not None:
        rating_value = stripped_text(rating_div.find("span", {"itemprop": "ratingValue"}), "N/A")
        rating_count = stripped_text(rating_div.find("span", {"itemprop": "reviewCount"}), 0)
    result["rating_value"] = rating_value
    result["rating_count"] = rating_count

    view_count = "N/A"
    view_count_div = soup.find("div", {"class": "lastupdated"})
    if view_count_div is not None:
        bold_tags = view_count_div.find_all("b")
        if bold_tags:
            # The first <b> inside the "lastupdated" div carries the "View Count: <n>" label.
            view_count = bold_tags[0].text.replace("View Count: ", "").strip()
    result["view_count"] = view_count

    return result

    
    
def extract_recipe_reviews(soup, title_url):
    """
        extract_recipe_reviews takes in a BeautifulSoup object that represents the HTML makeup of the page at
        title_url for a particular beer recipe. This function extracts the reviews left on the beer recipe so
        that they can be combined with an existing data set.

        The reviews are read from the last div with class "brewpart": every table nested inside the first
        cell of that div's table is treated as one review. Pieces that are missing from a review fall back
        to "N/A"; a page without that structure yields an empty array.
        Args:
            soup (object): A BeautifulSoup object that represents the HTML makeup of a page.
            title_url (string): A string that represents a beer recipe page url
        Return:
            Returns an array of review dicts with the keys title_url, reviewer_url, reviewer_name,
            reviewer_date, reviewer_time, review_description and review_rating.
    """
    brewpart_divs = soup.find_all("div", {"class": "brewpart"})
    if not brewpart_divs:
        return []
    outer_table = brewpart_divs[-1].find("table")
    outer_cell = outer_table.find("td") if outer_table is not None else None
    if outer_cell is None:
        return []

    reviews = []
    for review_table in outer_cell.find_all("table"):
        cells = review_table.find_all("td")
        if not cells:
            continue
        # The last cell of a review table holds the reviewer, the timestamp and the review text.
        review_td = cells[-1]
        fonts = review_td.find_all("font")

        review = {}
        review["title_url"] = title_url

        reviewer_url = "N/A"
        reviewer_name = "N/A"
        reviewer_link = review_td.find("a")
        if reviewer_link is not None:
            reviewer_url = reviewer_link.get("href", "N/A")
            reviewer_name = reviewer_link.text
        elif fonts:
            # No profile link: the name is plain text in the first <font> of the cell.
            reviewer_name = fonts[0].text
        review["reviewer_url"] = reviewer_url
        review["reviewer_name"] = reviewer_name

        review_date = "N/A"
        review_time = "N/A"
        if len(fonts) > 1:
            # Split on the standalone word "at" only, and only once. A bare split("at") also cuts
            # through any text that merely contains those letters (e.g. "Saturday").
            stamp_parts = re.split(r"\s+at\s+", fonts[1].text, maxsplit=1)
            review_date = stamp_parts[0].replace("•", "").strip()
            if len(stamp_parts) > 1:
                review_time = stamp_parts[1].strip()
        review["reviewer_date"] = review_date
        review["reviewer_time"] = review_time

        # The review text is carried by the last <font> of the cell.
        review["review_description"] = fonts[-1].text.strip() if fonts else "N/A"

        review_rating = "N/A"
        review_rating_span = review_table.find("span", {"class": "blue"})
        if review_rating_span is not None:
            review_rating = review_rating_span.text.replace("of 5", "").strip()
        review["review_rating"] = review_rating

        reviews.append(review)
    return reviews
    


In [163]:
# Reload the small scrape sample; empty fields stay empty strings (no NaN conversion).
beer_recipes = pd.read_csv("./data/brewer_scrape_test_small.csv", keep_default_na=False)

# Limit the trial run to the first five recipes and pull out their page urls.
testing_data = beer_recipes.iloc[:5]
testing_url = testing_data["title_url"]

# Scrape the detail pages for those urls and write both result CSVs.
scrape_recipes_details(testing_url)

Completed scraping beer recipes details from: https://www.brewersfriend.com/homebrew/recipe/view/28546/sierra-nevada-pale-ale-clone
Completed scraping beer recipes details from: https://www.brewersfriend.com/homebrew/recipe/view/363082/avg-perfect-northeast-ipa-neipa-
Completed scraping beer recipes details from: https://www.brewersfriend.com/homebrew/recipe/view/5916/zombie-dust-clone-all-grain
Completed scraping beer recipes details from: https://www.brewersfriend.com/homebrew/recipe/view/5920/zombie-dust-clone-extract
Completed scraping beer recipes details from: https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone
Completed writing detailed beer recipes to CSV
Completed writing beer recipes reviews to CSV
