<a href="https://colab.research.google.com/github/aeholbrook/mash_scraper/blob/main/mash_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!sudo apt-get update -q
!sudo apt-get install iputils-ping -q

In [1]:
import json
from bs4 import BeautifulSoup
import requests
import pandas as pd
from tqdm import tqdm 
import numpy as np
import concurrent.futures
import time

In [124]:
def load_url(url, timeout):
    #time1 = time.time()
    ans = requests.get(url, timeout=timeout, headers={'User-Agent': 'Mozilla/5.0','Content-Range': '5499-5500'})
    #time2 = time.time()
    #print("request_time:", time2-time1)
    return (ans,url)

def parse_brew_data(response, href, url_base="https://www.brewersfriend.com"):

  soup_2 = BeautifulSoup(response.text, "html.parser")

  name = soup_2.find("h3",{"itemprop":"name"})
  name = (name.prettify().split("\n")[1][1:] if name != None else "NaN")

  style = soup_2.find("span",{"class":"viewStats"})
  style = (style.prettify().split("\n")[5][2:] if style != None else "NaN")

  abv = soup_2.find("div",{"class":"abvMin"})
  abv = (abv.prettify().split("\n")[1][1:-1] if abv != None else "NaN")

  ibu = soup_2.find("div",{"class":"ibuMin"})
  ibu = (ibu.prettify().split("\n")[1][1:] if ibu != None else "NaN")

  srm = soup_2.find("div",{"class":"srmMin"})
  srm = (srm.prettify().split("\n")[1][1:] if srm != None else "NaN")

  score = soup_2.find("div",{"class":"reviews"})
  score = (score.prettify().split("\n")[25][3:] if score != None else "NaN" )

  reviews = soup_2.find("div",{"class":"reviews"})
  reviews = (reviews.prettify().split("\n")[29][3:] if reviews != None else "NaN")
  
  try:
    df_list = pd.read_html(response.content) # this parses all the tables in webpages to a list
    fermentables = df_list[0][:-1].replace({'%':''}, regex=True)
    hops = df_list[1][:-1].replace({'%':''}, regex=True)
    other = df_list[3]
  except:
    fermentables=None
    hops = None
    other = None

  return { 
    "name" : name,
    "style" : style,
    "ABV" : abv,
    "IBU" : ibu,
    "SRM" : srm,
    "score" : score,
    "reviews" : reviews,
    "fermentables" : fermentables,
    "hops" : hops,
    "other" : other,
    "link" : href
    }

def get_brew_data(hrefs, url_base="https://www.brewersfriend.com"): 

  #desc = "Collecting data from page " + str(page_number)

  out = []
  hrefs_split = np.array_split(hrefs,4)

  for hrefs in hrefs_split:
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
      future_to_url = (executor.submit(load_url, url_base+href, 300) for href in hrefs)
      #time1 = time.time()
      out.extend([future.result() for future in concurrent.futures.as_completed(future_to_url)])
      #time2 = time.time()

  #print(f'\n Took {time2-time1:.2f} s')
  

  brew_data = [parse_brew_data(response[0], response[1], url_base) for response in out]

  return brew_data

def iterate_hrefs(start=1,stop=2):

  url_base = "https://www.brewersfriend.com"
  with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    future_to_url = (executor.submit(load_url, url_base+'/homebrew-recipes/page/'+str(page_number), 100) for page_number in range(start,stop+1))
    
    href_list = [[item.get('href') 
      for item in BeautifulSoup(data.result()[0].text, 
      "html.parser").find_all("a", {"class": "recipetitle"})] 
      for data in concurrent.futures.as_completed(future_to_url)]

  return np.concatenate([get_brew_data(hrefs) for hrefs in tqdm(href_list,desc=str("Reading pages "+str(start)+" to "+str(stop)),position=0)])
  

In [125]:
test = iterate_hrefs(1,1)

Reading pages 1 to 1: 100%|██████████| 1/1 [01:01<00:00, 61.83s/it]


In [76]:
print(([float(item[('IBU')]) for item in test_2]))

print([item['fermentables'].size for item in test_2])

[61.25, 10.77, 14.69, 47.13, 83.71, 18.92, 20.8, 59.97, 28.21, 14.81, 126.11, 33.46, 21.99, 23.49, 38.31, 27.34, 27.0, 56.68, 42.38, 23.57, 59.26, 62.42, 19.44, 39.79, 232.89, 70.18, 50.22, 21.27, 31.36, 98.09, 18.54, 63.62, 36.45, 41.11, 55.96, 45.32, 69.72, 71.54, 22.81, 64.26, 52.15, 72.32, 40.9, 51.93, 33.9, 54.51, 68.21, 33.0, 98.86, 24.28, 22.6, 35.43, 30.71, 34.83, 40.7, 38.07, 42.53, 77.09, 69.23, 23.57, 10.12, 34.21, 72.7, 28.56, 19.6, 26.69, 34.91, 12.18, 29.03, 123.51, 105.67, 67.25, 32.49, 24.96, 70.13, 26.45, 32.02, 28.69, 18.9, 89.8, 36.24, 16.26, 60.4, 33.02, 23.21, 62.68, 56.83, 93.0, 10.24, 15.61, 11.99, 14.7, 27.74, 27.41, 43.53, 12.35, 35.03, 113.05, 16.6, 69.25, 16.79, 24.66, 34.94, 26.46, 0.0, 49.47, 77.71, 17.39, 20.0, 87.48, 41.97, 52.44, 22.25, 38.2, 22.93, 31.75, 25.55, 73.29, 35.01, 33.67, 75.05, 28.22, 46.15, 25.48, 36.65, 33.4, 93.98, 67.07, 26.8, 33.99, 36.3, 54.32, 5.04, 22.58, 105.36, 60.95, 33.48, 20.26, 74.52, 54.52, 59.62, 34.0, 19.2, 81.59, 13.95, 24.

In [85]:
asdf = test_2[np.where([thing["reviews"]!="NaN" for thing in test_2])]
print(len(test_2[np.where([int(test["reviews"])>3 for test in asdf])]))

asdf2 = test_2[np.where(["" in thing["hops"] for thing in test_2])]
print(len(asdf2))

24
0


In [261]:
import string
asdf = np.concatenate([item['hops'][item['hops'].columns[1]] for item in test_2])

dashes = [thing for thing in asdf[np.where([(" - ") in i for i in asdf])]]

print(len(dashes) / len(asdf))


cleaned = np.unique(list(map(lambda x: string.capwords(x), cleaned)))
cleaned = np.unique(list(map(lambda x: "Caramel / Crystal" if "Caramel / Crystal" in x else x, cleaned)))
cleaned = np.unique(list(map(lambda x: "Dr. Rudi" if "Dr Rudi" in x else x, cleaned)))
cleaned = np.unique(list(map(lambda x: "Fuggle" if "Fuggle" in x else x, cleaned)))
cleaned = np.unique(list(map(lambda x: "Hopshot" if "Hopshot" in x else x, cleaned)))
cleaned = np.unique(list(map(lambda x: "Golding" if "Golding" in x else x, cleaned)))
cleaned = np.unique(list(map(lambda x: "Warrior" if "Warrior" in x else x, cleaned)))
cleaned = np.unique(list(map(lambda x: "Simcoe" if "Simcoe" in x else x, cleaned)))
cleaned = np.unique(list(map(lambda x: "Carapils" if "Carapils" in x else x, cleaned)))
cleaned = np.unique(list(map(lambda x: "Simcoe" if "Simcoe" in x else x, cleaned)))
cleaned = np.unique(list(map(lambda x: "Hallertau" if "Hallertau" in x else x, cleaned)))
cleaned = np.unique(list(map(lambda x: "Cascade" if "Cascade" in x else x, cleaned)))
cleaned = np.unique(list(map(lambda x: "Columbus" if "Columbus" in x else x, cleaned)))
cleaned = np.unique(list(map(lambda x: "Amarillo" if "Amarillo" in x else x, cleaned)))
cleaned = np.unique(list(map(lambda x: "Dark Crystal" if "Ark Crystal" in x else x, cleaned)))
cleaned = np.unique(list(map(lambda x: "Tettnang" if "Tetnang" in x else x, cleaned)))
cleaned = np.unique(list(map(lambda x: "Hop Extract" if "Hop Extract" in x else x, cleaned)))
cleaned = np.unique(list(map(lambda x: "Centennial" if "Centennial" in x else x, cleaned)))
cleaned = np.unique(list(map(lambda x: "Ctz" if "Ctz" in x else x, cleaned)))
cleaned = np.unique(list(map(lambda x: "Chinook" if "Chinook" in x else x, cleaned)))
cleaned = np.unique(list(map(lambda x: "Tettnang" if "Tetnang" in x else x, cleaned)))
cleaned = np.unique(list(map(lambda x: "Munich Light" if "Light 10l" in x else x, cleaned)))
cleaned = np.unique(list(map(lambda x: x[:-7] if " (U.K.)" in x else x, cleaned)))
cleaned = np.unique(list(map(lambda x: x[21:] if "Yakima Valley Hops" in x else x, cleaned)))
cleaned = np.unique(list(map(lambda x: x[11:] if "American" in x else x, cleaned)))
cleaned = np.unique(list(map(lambda x: x[8:] if "German" in x else x, cleaned)))
cleaned = np.unique(list(map(lambda x: x[10:] if "Canadian" in x else x, cleaned)))
cleaned = np.unique(list(map(lambda x: x[18:] if "United Kingdom" in x else x, cleaned)))
cleaned = np.unique(list(map(lambda x: x[3:] if "US " in x else x, cleaned)))
cleaned = np.unique(list(map(lambda x: x[:-7] if " (U.K.)" in x else x, cleaned)))
cleaned = np.unique(list(map(lambda x: x[10:] if "Belgian" in x else x, cleaned)))

print(len(cleaned)/len(np.unique(asdf)))
print(cleaned)



0.05343007915567282
0.6580645161290323
['Admiral' 'Ahtanum' 'Ale Chocolate' 'Allertauer' 'Amarillo' 'Apollo'
 'Aromatic' 'Azacca' 'Biscuit' 'Black Malt' 'Bramling Cross' 'Bravo'
 "Brewer's Gold" 'Cacsade' 'Calypso' 'Campden Tablet' 'Carafoam'
 'Caramel / Crystal' 'Caramel Wheat' 'Carapils' 'Cascade' 'Centennial'
 'Challenger' 'Chinook' 'Chocolate' 'Citra' 'Cluster' 'Columbus' 'Comet'
 'Crosby Hop Farm - Denali' 'Crystal' 'Ctz' 'Dark Crystal'
 'De-bittered Black' 'Dr. Rudi' 'El Dorado' 'Elect' 'Er (german)'
 'Flaked Barley' 'Flaked Corn' 'Flaked Oats' 'Fuggle' 'Galaxy' 'Glacier'
 'Golding' 'Hallertau' 'Hbc342' 'Hersbrucker' 'Honey Malt' 'Hop Extract'
 'Hopshot' 'Horizon' 'Lack Patent' 'Lemon Drop' 'Liberty' 'Magnum'
 'Mandarina Bavaria' 'Melanoidin' 'Mosaic' 'Motueka' 'Mount Hood'
 'Munich Light' 'Nelson Sauvin' 'Northdown' 'Northern Brewer' 'Nugget'
 'Opal' 'Pacific Gem' 'Pale 2-row' 'Pearle' 'Pectic Enzyme' 'Perle'
 'Phoenix' 'Pride Of Ringwood' 'Rakau' 'Roasted Barley' 'Rye' 'Rystal 

In [121]:

cleaned = np.unique(list(map(lambda x: "Simcoe" if "Simcoe" in x else x, cleaned)))test_2[5]["hops"]

Unnamed: 0,Amount,Variety,Cost,Type,AA,Use,Time,IBU,Bill %
0,50 g,Saaz,,Pellet,2.3,Boil,60 min,13.72,41.7
1,30 g,Saaz,,Pellet,2.3,Boil,15 min,4.08,25.0
2,10 g,Amarillo,,Pellet,9.2,Boil,1 min,0.47,8.3
3,10 g,Mosaic,,Pellet,12.5,Boil,1 min,0.64,8.3
4,10 g,Amarillo,,Pellet,9.2,Dry Hop,7 days,,8.3
5,10 g,Mosaic,,Pellet,12.5,Dry Hop,7 days,,8.3
