<a href="https://colab.research.google.com/github/aeholbrook/mash_scraper/blob/main/mash_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import concurrent.futures
import io
import json
import threading
import time
import warnings

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [163]:
def load_url(url, timeout):
    """Fetch ``url`` with a GET request and return it paired with the URL.

    Args:
        url: Address to download.
        timeout: Value forwarded to ``requests.get`` as its ``timeout``.

    Returns:
        A ``(response, url)`` tuple, so callers that collect results out of
        order can still tell which URL each response belongs to.
    """
    request_headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=request_headers, timeout=timeout)
    return response, url

def _prettified_line(tag, line_index, start, stop=None, missing="NaN"):
  """Return ``tag.prettify().split("\\n")[line_index][start:stop]`` or ``missing``.

  ``missing`` is returned when ``tag`` is None (element not found) or when the
  prettified markup has fewer than ``line_index + 1`` lines.
  """
  if tag is None:
    return missing
  lines = tag.prettify().split("\n")
  if line_index >= len(lines):
    return missing
  return lines[line_index][start:stop]


def parse_brew_data(response, href, url_base="https://www.brewersfriend.com"):
  """Extract recipe fields and ingredient tables from a downloaded page.

  Args:
    response: Object whose ``.content`` attribute holds the page HTML.
    href: Link identifying the page; stored unchanged under ``"link"``.
    url_base: Unused; kept so existing callers keep working.

  Returns:
    A dict with the keys ``name``, ``style``, ``ABV``, ``IBU``, ``SRM``,
    ``score``, ``reviews``, ``fermentables``, ``hops``, ``other`` and ``link``.
    Scalar fields are strings, with ``"NaN"`` when the element is missing.
    The three table fields are DataFrames, or ``["NaN"]`` when the tables
    could not be read.
  """
  soup = BeautifulSoup(response.content, "html.parser")

  # Each field is read from a fixed line of the element's prettified markup;
  # the slice start drops that line's leading characters (presumably the
  # prettify indentation -- the offsets are tied to the site's current markup).
  name = _prettified_line(soup.find("h3", {"itemprop": "name"}), 1, 1)
  style = _prettified_line(soup.find("span", {"class": "viewStats"}), 5, 2)
  # The ABV slice also drops the last character (presumably a unit suffix).
  abv = _prettified_line(soup.find("div", {"class": "abvMin"}), 1, 1, -1)
  ibu = _prettified_line(soup.find("div", {"class": "ibuMin"}), 1, 1)
  srm = _prettified_line(soup.find("div", {"class": "srmMin"}), 1, 1)

  # Score and review count are two different lines of the same element.
  review_block = soup.find("div", {"class": "reviews"})
  score = _prettified_line(review_block, 25, 3)
  reviews = _prettified_line(review_block, 29, 3)

  try:
    # A file-like wrapper is accepted by every pandas version, whereas raw
    # markup passed directly is deprecated in recent releases.
    df_list = pd.read_html(io.BytesIO(response.content))
    # The last row of the first two tables is dropped and '%' signs removed.
    fermentables = df_list[0][:-1].replace({'%': ''}, regex=True)
    hops = df_list[1][:-1].replace({'%': ''}, regex=True)
    other = df_list[3]
  except Exception:
    # Best effort: pages with missing or unparsable tables get placeholders.
    # ``Exception`` (not a bare except) lets KeyboardInterrupt propagate.
    fermentables = ["NaN"]
    hops = ["NaN"]
    other = ["NaN"]

  return {
    "name": name,
    "style": style,
    "ABV": abv,
    "IBU": ibu,
    "SRM": srm,
    "score": score,
    "reviews": reviews,
    "fermentables": fermentables,
    "hops": hops,
    "other": other,
    "link": href,
  }

def get_brew_data(hrefs, url_base="https://www.brewersfriend.com"):
  """Download the given recipe pages concurrently and parse each one.

  Args:
    hrefs: Iterable of site-relative links; each is appended to ``url_base``.
    url_base: Prefix prepended to every href to build the request URL.

  Returns:
    A list with one ``parse_brew_data`` dict per page that downloaded
    successfully, in completion order. Pages whose download raised are
    skipped with a warning instead of aborting the whole batch.
  """
  downloaded = []

  with concurrent.futures.ThreadPoolExecutor(max_workers=21) as executor:
    # Map each future to its URL so a failure can be reported by address.
    future_to_url = {
      executor.submit(load_url, url_base + href, 100): url_base + href
      for href in hrefs
    }
    for future in concurrent.futures.as_completed(future_to_url):
      try:
        downloaded.append(future.result())
      except Exception as exc:
        warnings.warn(f"Skipping {future_to_url[future]}: {exc!r}")

  # load_url returns (response, url); the full URL is passed on as the link.
  return [parse_brew_data(response, url, url_base) for response, url in downloaded]

def iterate_hrefs(start=1,stop=2):
  """Scrape recipe listing pages ``start``..``stop`` and every recipe they link.

  Args:
    start: First listing page number (inclusive).
    stop: Last listing page number (inclusive).

  Returns:
    A 1-D NumPy object array holding one ``get_brew_data`` record per recipe.
    The array is empty when no listing page could be collected, for example
    when ``start > stop`` or every listing request failed.
  """
  url_base = "https://www.brewersfriend.com"
  href_list = []

  with concurrent.futures.ThreadPoolExecutor(max_workers=21) as executor:
    future_to_page = {
      executor.submit(load_url, url_base+'/homebrew-recipes/page/'+str(page_number), 100): page_number
      for page_number in range(start, stop+1)
    }
    for future in concurrent.futures.as_completed(future_to_page):
      try:
        response, _ = future.result()
      except Exception as exc:
        # One unreachable listing page should not discard all the others.
        warnings.warn(f"Skipping listing page {future_to_page[future]}: {exc!r}")
        continue
      anchors = BeautifulSoup(response.content, "html.parser").find_all("a", {"class": "recipetitle"})
      # Anchors without an href cannot be turned into a request URL.
      href_list.append([anchor.get('href') for anchor in anchors if anchor.get('href')])

  if not href_list:
    return np.empty(0, dtype=object)

  records = []
  for hrefs in tqdm(href_list,desc=str("Reading pages "+str(start)+" to "+str(stop)),position=0):
    records.extend(get_brew_data(hrefs))

  # Fill element by element so the result is always a flat object array,
  # whatever the records contain.
  result = np.empty(len(records), dtype=object)
  for index, record in enumerate(records):
    result[index] = record
  return result
  

In [165]:
# Scrape listing pages 1 through 50 (the recorded run took about 11.5 minutes).
test = iterate_hrefs(start=1, stop=50)

Reading pages 1 to 50: 100%|██████████| 50/50 [11:38<00:00, 13.96s/it]


In [166]:
# Number of recipe records collected by the scrape (860 in the recorded run).
test.shape

(860,)

In [156]:
# NOTE(review): RandomForestClassifier is not imported in any visible cell and
# this cell's execution count precedes the scraping cells, so it looks like a
# leftover from another session -- confirm it belongs in this notebook.
# max_features='sqrt' is what 'auto' meant for classifiers, and the
# min_impurity_split argument (previously passed as None, its default) is
# omitted; both old spellings were removed from current scikit-learn.
clf2 = RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=100,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

60
[{'name': 'Miller Lite Clone', 'style': 'All Grain', 'ABV': '4.41', 'IBU': '14.81', 'SRM': '2.48', 'score': None, 'reviews': None, 'fermentables':     Amount               Fermentable  Cost PPG   °L Bill %
0     3 lb     American - Pale 2-Row   NaN  37  1.8   39.7
1  2.25 lb     American - Pale 6-Row   NaN  35  1.8   29.8
2     2 lb               Flaked Corn   NaN  40  0.5   26.4
3     5 oz  German - Acidulated Malt   NaN  27  3.4    4.1, 'hops':     Amount  Variety  Cost    Type AA      Use     Time    IBU Bill %
0  0.50 oz  Cascade   NaN  Pellet  7     Boil   60 min  14.81     25
1  0.50 oz  Cascade   NaN  Pellet  7     Boil    0 min    NaN     25
2     1 oz  Cascade   NaN  Pellet  7  Dry Hop  14 days    NaN     50, 'other':                                 Amount  ...                                 Time
0                                  NaN  ...                               90 min
1  Starting Mash Thickness:  1.5 qt/lb  ...  Starting Mash Thickness:  1.5 qt/lb

[2 rows x 5 colu