<a href="https://colab.research.google.com/github/aeholbrook/2020_commencement_report_pipeline/blob/main/mash_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!sudo apt-get update -q
!sudo apt-get install iputils-ping -q

In [1]:
import json
from bs4 import BeautifulSoup
import requests
import pandas as pd
from tqdm import tqdm 
import numpy as np
import concurrent.futures
import time

In [124]:
def load_url(url, timeout):
    #time1 = time.time()
    ans = requests.get(url, timeout=timeout, headers={'User-Agent': 'Mozilla/5.0','Content-Range': '5499-5500'})
    #time2 = time.time()
    #print("request_time:", time2-time1)
    return (ans,url)

def parse_brew_data(response, href, url_base="https://www.brewersfriend.com"):

  soup_2 = BeautifulSoup(response.text, "html.parser")

  name = soup_2.find("h3",{"itemprop":"name"})
  name = (name.prettify().split("\n")[1][1:] if name != None else "NaN")

  style = soup_2.find("span",{"class":"viewStats"})
  style = (style.prettify().split("\n")[5][2:] if style != None else "NaN")

  abv = soup_2.find("div",{"class":"abvMin"})
  abv = (abv.prettify().split("\n")[1][1:-1] if abv != None else "NaN")

  ibu = soup_2.find("div",{"class":"ibuMin"})
  ibu = (ibu.prettify().split("\n")[1][1:] if ibu != None else "NaN")

  srm = soup_2.find("div",{"class":"srmMin"})
  srm = (srm.prettify().split("\n")[1][1:] if srm != None else "NaN")

  score = soup_2.find("div",{"class":"reviews"})
  score = (score.prettify().split("\n")[25][3:] if score != None else "NaN" )

  reviews = soup_2.find("div",{"class":"reviews"})
  reviews = (reviews.prettify().split("\n")[29][3:] if reviews != None else "NaN")
  
  try:
    df_list = pd.read_html(response.content) # this parses all the tables in webpages to a list
    fermentables = df_list[0][:-1].replace({'%':''}, regex=True)
    hops = df_list[1][:-1].replace({'%':''}, regex=True)
    other = df_list[3]
  except:
    fermentables=None
    hops = None
    other = None

  return { 
    "name" : name,
    "style" : style,
    "ABV" : abv,
    "IBU" : ibu,
    "SRM" : srm,
    "score" : score,
    "reviews" : reviews,
    "fermentables" : fermentables,
    "hops" : hops,
    "other" : other,
    "link" : href
    }

def get_brew_data(hrefs, url_base="https://www.brewersfriend.com"): 

  #desc = "Collecting data from page " + str(page_number)

  out = []
  hrefs_split = np.array_split(hrefs,4)

  for hrefs in hrefs_split:
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
      future_to_url = (executor.submit(load_url, url_base+href, 300) for href in hrefs)
      #time1 = time.time()
      out.extend([future.result() for future in concurrent.futures.as_completed(future_to_url)])
      #time2 = time.time()

  #print(f'\n Took {time2-time1:.2f} s')
  

  brew_data = [parse_brew_data(response[0], response[1], url_base) for response in out]

  return brew_data

def iterate_hrefs(start=1,stop=2):

  url_base = "https://www.brewersfriend.com"
  with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    future_to_url = (executor.submit(load_url, url_base+'/homebrew-recipes/page/'+str(page_number), 100) for page_number in range(start,stop+1))
    
    href_list = [[item.get('href') 
      for item in BeautifulSoup(data.result()[0].text, 
      "html.parser").find_all("a", {"class": "recipetitle"})] 
      for data in concurrent.futures.as_completed(future_to_url)]

  return np.concatenate([get_brew_data(hrefs) for hrefs in tqdm(href_list,desc=str("Reading pages "+str(start)+" to "+str(stop)),position=0)])
  

In [125]:
test = iterate_hrefs(1,1)

Reading pages 1 to 1: 100%|██████████| 1/1 [01:01<00:00, 61.83s/it]


In [76]:
print(([float(item[('IBU')]) for item in test_2]))

print([item['fermentables'].size for item in test_2])

[61.25, 10.77, 14.69, 47.13, 83.71, 18.92, 20.8, 59.97, 28.21, 14.81, 126.11, 33.46, 21.99, 23.49, 38.31, 27.34, 27.0, 56.68, 42.38, 23.57, 59.26, 62.42, 19.44, 39.79, 232.89, 70.18, 50.22, 21.27, 31.36, 98.09, 18.54, 63.62, 36.45, 41.11, 55.96, 45.32, 69.72, 71.54, 22.81, 64.26, 52.15, 72.32, 40.9, 51.93, 33.9, 54.51, 68.21, 33.0, 98.86, 24.28, 22.6, 35.43, 30.71, 34.83, 40.7, 38.07, 42.53, 77.09, 69.23, 23.57, 10.12, 34.21, 72.7, 28.56, 19.6, 26.69, 34.91, 12.18, 29.03, 123.51, 105.67, 67.25, 32.49, 24.96, 70.13, 26.45, 32.02, 28.69, 18.9, 89.8, 36.24, 16.26, 60.4, 33.02, 23.21, 62.68, 56.83, 93.0, 10.24, 15.61, 11.99, 14.7, 27.74, 27.41, 43.53, 12.35, 35.03, 113.05, 16.6, 69.25, 16.79, 24.66, 34.94, 26.46, 0.0, 49.47, 77.71, 17.39, 20.0, 87.48, 41.97, 52.44, 22.25, 38.2, 22.93, 31.75, 25.55, 73.29, 35.01, 33.67, 75.05, 28.22, 46.15, 25.48, 36.65, 33.4, 93.98, 67.07, 26.8, 33.99, 36.3, 54.32, 5.04, 22.58, 105.36, 60.95, 33.48, 20.26, 74.52, 54.52, 59.62, 34.0, 19.2, 81.59, 13.95, 24.

In [264]:
asdf = test_2[np.where([thing["reviews"]!="NaN" for thing in test_2])]
print(len(test_2[np.where([int(test["reviews"])>3 for test in asdf])]))

asdf2 = test_2[np.where(["" in thing["hops"] for thing in test_2])]
print(len(asdf2))

24
0


In [476]:
nom = np.unique([item["score"] for item in filter(lambda i: i["score"] != "NaN",test_2)])
print(nom)

['1.00' '2.00' '3.00' '3.75' '3.80' '4.00' '4.33' '4.50' '4.67' '4.69'
 '4.70' '4.71' '4.76' '4.79' '4.80' '4.82' '4.88' '5.00']


In [466]:
asdfg = np.unique(np.concatenate([item['fermentables'][item['fermentables'].columns[1]] for item in test_2]))
asdfg_old = len(asdfg)
fermentables_pipeline = [
  lambda x: string.capwords(x),
  lambda x: x.replace(" - "," "),
  lambda x: "Caramel / Crystal" if "Caramel / Crystal" in x else x,
  lambda x: "Caramel / Crystal" if "/crystal" in x else x,
  lambda x: "Crystal" if "Crystal " in x else x,
  lambda x: "Carapils" if "Carapils" in x else x,
  lambda x: "Candi" if "Candi" in x else x,
  lambda x: "Munich" if "Munich" in x else x,
  lambda x: "Milk Sugar" if "Lactose" in x else x,
  lambda x: "Sucrose" if "Sucrose" in x else x,
  lambda x: "Apple" if "Apple" in x else x,
  lambda x: "Caramunich" if "Caramunich" in x else x,
  lambda x: "Pale Malt" if "Pale Malt" in x else x,
  lambda x: "Pilsner" if "Pilsner" in x else x,
  lambda x: "Roasted Barley" if "Roasted Barley" in x else x,
  lambda x: "Wheat" if "Wheat" in x else x,
  lambda x: "Black Patent Malt" if "atent" in x else x,
  lambda x: "Dextrose" if "extrose" in x else x,
  lambda x: "Brown Sugar" if "Brown Sugar" in x else x,
  lambda x: "Turbinado" if "Turbinado" in x else x,
  lambda x: "Maris Otter" if "Maris Otter" in x else x,
  lambda x: "Carafa" if "Carafa" in x else x,
  lambda x: x[:-7] if " (U.K.)" in x else x,
  lambda x: x.replace("American ",""),
  lambda x: x.replace("German ",""),
  lambda x: x.replace(" (german)",""),
  lambda x: x.replace("(late Addition)",""),
  lambda x: x.replace("(Late Addition)",""),
  lambda x: x.replace("(last 15 Min of Boil)",""),
  lambda x: x.replace("(late Addition)",""),
  lambda x: x.replace("Canadian ",""),
  lambda x: x.replace("Thomas Fawcett ",""),
  lambda x: x.replace("United Kingdom ",""),
  lambda x: x.replace("New Zealand ",""),
  lambda x: x.replace("Belgian ",""),
  lambda x: x.replace("Ireland ",""),
  lambda x: x.replace("Proximity ",""),
  lambda x: x.replace("Weyermann ",""),
  lambda x: x.replace("N/a ",""),
  lambda x: x.replace("Us ",""),
  lambda x: x.replace("Briess ",""),
]
#ok now let's apply each of those lambda functions to the data list thing
for x in fermentables_pipeline:
  asdfg = list(map(x,asdfg))

print(len(np.unique(asdfg)) / asdfg_old)

dummies_1 = 

['Crisp Malting - Chocolate Malt', 'Muntons Chocolate Malt', 'Thomas Fawcett - Chocolate Malt', 'Thomas Fawcett Chocolate Malt']
0.46959459459459457


In [358]:
import string
asdf = np.concatenate([item['hops'][item['hops'].columns[1]] for item in test_2])

hops_pipeline = [
  lambda x: string.capwords(x),
  lambda x: "Caramel / Crystal" if "Caramel / Crystal" in x else x,
  lambda x: "Dr. Rudi" if "Dr Rudi" in x else x,
  lambda x: "Fuggle" if "Fuggle" in x else x,
  lambda x: "Hopshot" if "Hopshot" in x else x,
  lambda x: "Golding" if "Golding" in x else x,
  lambda x: "Warrior" if "Warrior" in x else x,
  lambda x: "Simcoe" if "Simcoe" in x else x,
  lambda x: "Carapils" if "Carapils" in x else x,
  lambda x: "Simcoe" if "Simcoe" in x else x,
  lambda x: "Hallertau" if "Hallertau" in x else x,
  lambda x: "Cascade" if "Cascade" in x else x,
  lambda x: "Columbus" if "Columbus" in x else x,
  lambda x: "Amarillo" if "Amarillo" in x else x,
  lambda x: "Dark Crystal" if "Dark Crystal" in x else x,
  lambda x: "Tettnang" if "Tetnang" in x else x,
  lambda x: "Hop Extract" if "Hop Extract" in x else x,
  lambda x: "Centennial" if "Centennial" in x else x,
  lambda x: "Ctz" if "Ctz" in x else x,
  lambda x: "Saaz" if "Saaz" in x else x,
  lambda x: "Chinook" if "Chinook" in x else x,
  lambda x: "Tetnang" if "Tettnang" in x else x,
  lambda x: "Munich Light" if "Light 10l" in x else x,
  lambda x: "Northern Brewer" if "Northern Brewer" in x else x,
  lambda x: x[:-7] if " (U.K.)" in x else x,
  lambda x: x[21:] if "Yakima Valley Hops" in x else x,
  lambda x: x[11:] if "American" in x else x,
  lambda x: x[9:] if "German - " in x else x,
  lambda x: x[:-9] if "(german)" in x else x,
  lambda x: x[11:] if "Canadian" in x else x,
  lambda x: x[17:] if "United Kingdom" in x else x,
  lambda x: x[3:] if "Us " in x else x,
  lambda x: x[10:] if "Belgian" in x else x
  ]

  

arr = asdf
for x in hops_pipeline:
  arr = list(map(x,arr))
print(np.unique(arr))


['Admiral' 'Ahtanum' 'Amarillo' 'Apollo' 'Aromatic' 'Azacca' 'Biscuit'
 'Black Malt' 'Black Patent' 'Bramling Cross' 'Bravo' "Brewer's Gold"
 'Cacsade' 'Calypso' 'Campden Tablet' 'Carafoam' 'Caramel / Crystal'
 'Caramel Wheat' 'Carapils' 'Cascade' 'Centennial' 'Challenger' 'Chinook'
 'Chocolate' 'Citra' 'Cluster' 'Columbus' 'Comet'
 'Crosby Hop Farm - Denali' 'Crystal' 'Crystal 90l' 'Ctz' 'Dark Crystal'
 'De-bittered Black' 'Dr. Rudi' 'El Dorado' 'Flaked Barley' 'Flaked Corn'
 'Flaked Oats' 'Fuggle' 'Galaxy' 'German Select' 'Glacier' 'Golding'
 'Hallertau' 'Hbc342' 'Hersbrucker' 'Honey Malt' 'Hop Extract' 'Hopshot'
 'Horizon' 'Lemon Drop' 'Liberty' 'Magnum' 'Mandarina Bavaria'
 'Melanoidin' 'Mosaic' 'Motueka' 'Mount Hood' 'Munich Light'
 'Nelson Sauvin' 'Northdown' 'Northern Brewer' 'Nugget' 'Opal'
 'Pacific Gem' 'Pale 2-row' 'Pale Chocolate' 'Pearle' 'Pectic Enzyme'
 'Perle' 'Phoenix' 'Pride Of Ringwood' 'Rakau' 'Roasted Barley' 'Rye'
 'Saaz' 'Saphir' 'Simcoe' 'Smoked Malt' 'Sorachi A

In [262]:
dtest_2[5]["hops"]

SyntaxError: ignored