In [177]:
import urllib2
import os
import numpy
from bs4 import BeautifulSoup
import collections
from urlparse import urljoin
import csv

In [11]:
# Scraped files are kept in a "data" directory that sits beside the current
# working directory (i.e. <parent of cwd>/data).
# os.path.dirname is used instead of splitting on '/' by hand: the manual split
# produced a *relative* "data" path when the parent of cwd was the filesystem
# root, and it did not understand non-'/' path separators.
dataDir = os.path.join(os.path.dirname(os.getcwd()), 'data')

/Users/hirschb/Personal/beer-learning/data


In [202]:
class Ingredient(object):
    """Contents of one ingredient ``<table>`` element.

    Attributes:
        headings: text of each ``<th>`` found in the table's ``thead.tr``,
            stored exactly as returned by ``get_text()``.
        data: one list per ``<tr>`` of the table's ``tbody``; each entry is
            the cleaned, ASCII-only text of one ``<td>``.
    """

    def __init__(self, ingredient_html):
        """Create the container and parse ``ingredient_html`` right away.

        Args:
            ingredient_html: a parsed table element exposing ``thead.tr`` and
                ``tbody``, each supporting ``find_all`` (for example a
                BeautifulSoup tag).
        """
        self.headings = []
        self.data = []

        self.parse(ingredient_html)

    @staticmethod
    def _clean_field(text):
        """Return ``text`` as a native ASCII string with tidy whitespace.

        Leading/trailing whitespace is removed and every internal run of
        whitespace (including newlines) becomes a single space. Characters
        that cannot be represented in ASCII are dropped.
        """
        # Normalise whitespace while the text is still unicode: a non-ASCII
        # space such as U+00A0 must turn into a plain ' ' here, otherwise the
        # ASCII encode below silently deletes it and glues two words together.
        spaced = u' '.join(text.split())
        ascii_text = spaced.encode('ascii', 'ignore').decode('ascii')
        # Dropping a symbol that stood between two spaces can leave a double
        # space behind, so collapse once more. str() yields the interpreter's
        # native string type.
        return str(' '.join(ascii_text.split()))

    def parse(self, html):
        """Fill ``headings`` and ``data`` from the table element ``html``.

        Results are appended, so calling this twice on the same instance
        accumulates rows.
        """
        for heading in html.thead.tr.find_all("th"):
            self.headings.append(heading.get_text())
        for row in html.tbody.find_all("tr"):
            self.data.append(
                [self._clean_field(cell.get_text()) for cell in row.find_all("td")]
            )

class Fermentables(Ingredient):
    """Ingredient table built from a recipe's fermentables markup."""

    def __init__(self, fermentables_html):
        # Adds no state of its own; the shared Ingredient parser does the work.
        Ingredient.__init__(self, ingredient_html=fermentables_html)
        
class Hops(Ingredient):
    """Ingredient table built from a recipe's hops markup."""

    def __init__(self, hops_html):
        # Adds no state of its own; the shared Ingredient parser does the work.
        Ingredient.__init__(self, ingredient_html=hops_html)

class Yeast(Ingredient):
    """Ingredient table built from a recipe's yeast markup."""

    def __init__(self, yeast_html):
        # Adds no state of its own; the shared Ingredient parser does the work.
        Ingredient.__init__(self, ingredient_html=yeast_html)
        
class Recipe(object):
    """A single recipe page: download it, parse it, and cache it as CSV.

    Attributes:
        url: absolute recipe URL (``site_url`` joined with ``relative_link``).
        name: last path component of ``relative_link``; used for file names.
        filename: path of the cached HTML under ``dataDir/recipes_html``.
        csvfilename: path of the CSV export under ``dataDir/recipes``.
        stats: dict mapping a stat label to its numeric value.
        recipe_html: raw HTML of the page once downloaded or read, else None.
        fermentables, hops, yeast: parsed ingredient tables, or None when the
            corresponding table has not been parsed.
    """

    # Stats parsed as floats; every other stat is parsed as an int.
    _FLOAT_STATS = ('OG', 'FG', 'ABV')

    def __init__(self, relative_link, site_url):
        """Derive the URL and cache paths for a recipe and print its name.

        Args:
            relative_link: recipe link as found on the index page.
            site_url: base URL the link is resolved against.
        """
        self.url = urljoin(site_url, relative_link)
        self.name = relative_link.split('/')[-1]
        self.filename = os.path.join(dataDir, 'recipes_html', self.name + '.html')
        self.csvfilename = os.path.join(dataDir, 'recipes', self.name + '.csv')

        self.stats = {}
        self.recipe_html = None
        # Ingredient sections stay None until the HTML has been parsed, so
        # writeCSV can tell "not parsed" apart from "parsed".
        self.fermentables = None
        self.hops = None
        self.yeast = None
        # True when stats came from the CSV cache rather than from the HTML.
        self._loaded_from_csv = False

        print(self.name)

    @classmethod
    def _convert_stat(cls, label, value):
        """Convert a stat's text to float (OG/FG/ABV) or int (anything else)."""
        if label in cls._FLOAT_STATS:
            return float(value)
        return int(value)

    def download(self):
        """Download the recipe page into the recipes_html directory.

        The fetched content is also kept in ``self.recipe_html`` so a forced
        re-download is what the next HTML parse sees.
        """
        response = urllib2.urlopen(self.url)
        try:
            recipe_html = response.read()
        finally:
            # Always release the connection, even if the read fails.
            response.close()
        # Binary mode: store the bytes exactly as they were received.
        with open(self.filename, 'wb') as f:
            f.write(recipe_html)
        self.recipe_html = recipe_html

    def parse(self, download=False, reparse=False):
        """Load the recipe, from the CSV cache when allowed, else from HTML.

        The HTML is taken from dataDir / recipes_html / <name>.html and is
        downloaded first if that file does not exist.

        Args:
            download: if True, force a re-download and a re-parse of the HTML.
            reparse: if True, ignore an existing CSV and parse the HTML again.
        """
        if download or not os.path.isfile(self.filename):
            self.download()

        if not download and not reparse and os.path.isfile(self.csvfilename):
            print('Parsing ' + self.name + ' with CSV')
            self.parseCSV()
            return

        if self.recipe_html is None:
            with open(self.filename) as f:
                self.recipe_html = f.read()

        recipesoup = BeautifulSoup(self.recipe_html, 'html.parser')
        self._loaded_from_csv = False

        # Parses the stats, OG/FG/etc
        stats_html = recipesoup.find_all("div", class_="recipe-show--stats")[0]
        for stat in stats_html.find_all("div", class_="horizontal-bar-graph"):
            label = stat.find("div", class_="label").get_text()
            value = stat.find("div", class_="value").get_text()
            if label == 'ABV':
                # Drop the trailing unit character (presumably '%') before
                # the numeric conversion.
                value = value[0:-1]
            self.stats[label] = self._convert_stat(label, value)

        # Forget sections from an earlier parse so stale tables cannot leak
        # into the next CSV export.
        self.fermentables = None
        self.hops = None
        self.yeast = None
        ingredients_html = recipesoup.find_all("div", class_="recipe-show--ingredients")[0]
        for ingredient_html in ingredients_html.find_all("table"):
            # .get() so a table without an id attribute is simply skipped.
            ingredient = ingredient_html.get('id')
            if ingredient == 'fermentables':
                self.fermentables = Fermentables(ingredient_html)
            elif ingredient == 'hops':
                self.hops = Hops(ingredient_html)
            elif ingredient == 'yeasts':
                self.yeast = Yeast(ingredient_html)

    def parseCSV(self):
        """Restore ``self.stats`` from the first two rows of the CSV export.

        Row one holds the stat labels and row two the matching values; the
        values are converted back to the numeric types used by ``parse``.
        Ingredient rows are not read back.
        """
        with open(self.csvfilename, 'r') as f:
            # The reader must wrap the open file, not the file name.
            recipe_csv = csv.reader(f)
            labels = next(recipe_csv, [])
            values = next(recipe_csv, [])
        self.stats = dict(
            (label, self._convert_stat(label, value))
            for label, value in zip(labels, values)
        )
        self._loaded_from_csv = True
        print(self.stats)

    def writeCSV(self):
        """Write stats and every parsed ingredient table to the CSV export.

        Layout: a row of stat labels, a row of stat values, then for each
        available ingredient section its headings row followed by its data
        rows. Nothing is written when the recipe was loaded from the CSV
        itself, because the ingredient rows were not read back and rewriting
        would discard them.
        """
        if self._loaded_from_csv:
            return

        labels = list(self.stats)
        rows = [labels, [str(self.stats[label]) for label in labels]]
        for ingredient in (self.fermentables, self.hops, self.yeast):
            if ingredient is None:
                continue
            # Each section is introduced by its *own* headings.
            rows.append(ingredient.headings)
            rows.extend(ingredient.data)

        with open(self.csvfilename, 'w') as f:
            # csv.writer quotes fields that contain commas (e.g.
            # 'Corn, Flaked'), which a plain ','.join would corrupt.
            csv.writer(f, lineterminator='\n').writerows(rows)


In [203]:
def download(url, recipe_link, download=False, reparse=False):
    """Fetch a recipe index page and process every recipe it links to.

    The index page is cached as ``dataDir/index.html``. Each anchor with the
    class ``recipe-link`` becomes a ``Recipe`` that is parsed and exported to
    CSV.

    Args:
        url: base site URL; also used to resolve each recipe's relative link.
        recipe_link: location of the index page relative to ``url``.
        download: if True, re-fetch the index page even when a cached copy
            exists, and force each recipe to be re-downloaded as well.
        reparse: passed through to ``Recipe.parse``.
    """
    # urljoin (the same helper Recipe uses for its own URL) copes with a
    # missing or doubled '/' between the two parts.
    index_url = urljoin(url, recipe_link)
    index_file = os.path.join(dataDir, 'index.html')
    # Honour the force flag for the index too; previously an existing cache
    # file always won, so download=True still read a stale index.
    if download or not os.path.isfile(index_file):
        response = urllib2.urlopen(index_url)
        try:
            contents = response.read()
        finally:
            # Always release the connection, even if the read fails.
            response.close()
        with open(index_file, 'wb') as f:
            f.write(contents)
    else:
        with open(index_file, 'r') as f:
            contents = f.read()

    soup = BeautifulSoup(contents, 'html.parser')
    recipe_links = [a['href'] for a in soup.find_all("a", class_="recipe-link")]

    for relative_link in recipe_links:
        recipe = Recipe(relative_link, url)
        recipe.parse(download=download, reparse=reparse)
        recipe.writeCSV()

# Process page 1 of the recipe index (sorted by rank). reparse=True makes each
# recipe be parsed from its HTML even if a CSV export already exists.
download(
    url="https://www.brewtoad.com/",
    recipe_link="recipes?page=1&sort=rank",
    reparse=True,
)
print('Done downloading')

firework-cream-ale
[['8.0 lb', 'American 2-Row', 'Rahr', 'Mash', '38', '1 L'], ['2.0 lb', 'Corn, Flaked', 'Any', 'Mash', '37', '1 L'], ['1.0 lb', 'Crystal 15', 'Great Western', 'Mash', '10', '15 L'], ['1.0 lb', 'CarapilsMalt', 'Briess', 'Mash', '34', '1 L']]
cascade-ipa-e15d96
[['11.5 lb', 'American 2-Row', 'Rahr', 'Mash', '38', '1 L'], ['0.5 lb', 'Caramel Malt 60L', 'Briess', 'Mash', '34', '60 L']]
3-floyds-zombie-dust-clone
[['12.75 lb', '2-Row (US)', 'Any', 'Mash', '37', '1 L'], ['1.25 lb', 'Munich Malt 10L', 'Briess', 'Mash', '35', '10 L'], ['0.5 lb', 'CARAPILS', 'Weyermann', 'Mash', '33', '1 L'], ['0.5 lb', 'Melanoidin Malt', 'Weyermann', 'Mash', '37', '26 L'], ['0.5 lb', 'Caramel Malt 60L', 'Briess', 'Mash', '34', '60 L']]
nut-brown-ale-102
[['8.0 lb', 'Maris Otter', 'Bairds', 'Mash', '35', '3 L'], ['1.0 lb', 'Oats, Flaked', 'Any', 'Mash', '37', '1 L'], ['1.0 lb', 'VictoryMalt', 'Briess', 'Mash', '34', '28 L'], ['1.0 lb', 'Caramel Malt 120L', 'Briess', 'Mash', '32', '120 L'], ['0