In [1]:
import urllib2
import os
import sys
import numpy
from bs4 import BeautifulSoup
import collections
from urlparse import urljoin
import csv
from beer.beer import Recipe, Ingredient, Yeasts, Fermentables, Hops

from lxml import etree

import numpy as np
import sklearn as skl
import matplotlib.pyplot as plt

In [2]:
# Directory holding the cached index pages; it is also passed to Recipe.
# It is the `data` folder next to the current working directory.
# os.path.dirname is used instead of splitting on '/' so the platform's
# separator is honoured and the result stays absolute even when the working
# directory sits directly under the filesystem root.
dataDir = os.path.join(os.path.dirname(os.getcwd()), 'data')

In [4]:
def download(url, recipe_range, download=False):
    """Collect parsed recipes from a range of paginated recipe-index pages.

    Each index page is fetched from ``url`` (or read back from the copy
    cached in ``dataDir``), scanned for ``recipe-link`` anchors, and every
    linked recipe is handed to ``Recipe`` for parsing.

    Args:
        url: Base URL of the site; the index-page links are resolved against
            it and it is also passed on to ``Recipe``.
        recipe_range: ``(first, last)`` pair of index page numbers.  ``first``
            is included and ``last`` is excluded, exactly as with ``range``.
        download: When true, re-fetch every index page even if a cached copy
            exists.  When false, only pages missing from the cache are
            fetched.

    Returns:
        A list of the ``Recipe`` objects whose ``parse()`` call returned a
        truthy value, in the order they were encountered.
    """
    recipes = []
    first_page, last_page = recipe_range[0], recipe_range[1]
    for num in range(first_page, last_page):
        if num % 10 == 0:
            print('Page %d' % num)
        recipe_link = urljoin(url, "recipes?page=" + str(num) + "&sort=rank")
        index_file = os.path.join(dataDir, 'page_' + str(num) + '.html')
        if download or not os.path.isfile(index_file):
            # urllib2 responses are not context managers on Python 2, so the
            # connection is closed explicitly, even if read() raises.
            response = urllib2.urlopen(recipe_link)
            try:
                contents = response.read()
            finally:
                response.close()
            with open(index_file, 'w') as f:
                f.write(contents)
        else:
            with open(index_file, 'r') as f:
                contents = f.read()

        soup = BeautifulSoup(contents, 'html.parser')
        recipe_links = [a['href'] for a in soup.find_all("a", class_="recipe-link")]

        for relative_link in recipe_links:
            recipe = Recipe(dataDir, relative_link, url)
            # Recipes that fail to parse are skipped rather than aborting the run.
            if not recipe.parse():
                continue

            recipes.append(recipe)
            if len(recipes) % 100 == 0:
                print(str(len(recipes)) + ' recipes done')

    return recipes

# Parse index pages 1 through 24 of the rank-sorted listing (the upper bound
# is exclusive), fetching only the index pages that are not cached yet.
gRecipes = download(url="https://www.brewtoad.com/",
                    recipe_range=(1, 25),
                    download=False)

print(''.join(['Done parsing recipes, there were ', str(len(gRecipes)), ' recipes']))

100 recipes done
200 recipes done
Page 10
300 recipes done
400 recipes done
Page 20
500 recipes done
Done parsing recipes, there were 531 recipes


In [6]:
# Sanity check: describe the first parsed recipe and report the sizes of the
# class-level collections Recipe.styles, Fermentables.fermentables, Hops.hops
# and Yeasts.yeasts.
first_recipe = gRecipes[0]
first_counts = (first_recipe.name, first_recipe.fermentables.num,
                first_recipe.hops.num, first_recipe.yeast.num)
print('%s has %d fermentables, %d hops, and %d yeasts' % first_counts)

vocabulary_sizes = tuple(len(registry) for registry in (
    Recipe.styles, Fermentables.fermentables, Hops.hops, Yeasts.yeasts))
print('There are %d total styles, %d fermentables, %d hops, and %d yeast' % vocabulary_sizes)
print(first_recipe.name + ' is a ' + first_recipe.style)

Firework Cream Ale has 4 fermentables, 2 hops, and 1 yeasts
There are 67 total styles, 309 fermentables, 118 hops, and 7 yeast
Firework Cream Ale is a Cream Ale


In [7]:
# Tally how many parsed recipes fall under each style.
recipe_counts = {}
for recipe in gRecipes:
    recipe_counts[recipe.style] = recipe_counts.get(recipe.style, 0) + 1

# Most frequent styles first.  sorted() is stable, so styles with equal
# counts keep the dictionary's iteration order.
rc_list = sorted(recipe_counts.items(), key=lambda pair: pair[1], reverse=True)
print(rc_list)

[('American IPA', 71), ('American Pale Ale', 46), ('Specialty Beer', 32), ('Extra Special/Strong Bitter (English Pale Ale)', 22), ('Saison', 17), ('Oatmeal Stout', 15), ('Weizen/Weissbier', 15), ('Belgian Specialty Ale', 13), ('Witbier', 13), ('Christmas/Winter Specialty Spiced Beer', 12), ('Sweet Stout', 12), ('Spice, Herb, or Vegetable Beer', 12), ('Russian Imperial Stout', 11), ('Imperial IPA', 11), ('American Amber Ale', 10), ('Belgian Dubbel', 10), ('Fruit Beer', 10), ('Robust Porter', 10), ('Irish Red Ale', 10), (u'K\xf6lsch', 9), ('English IPA', 9), ('American Brown Ale', 8), ('American Stout', 8), ('German Pilsner (Pils)', 8), ('Strong Scotch Ale', 7), ('Belgian Blond Ale', 7), ('Blonde Ale', 7), ('Northern English Brown Ale', 6), ('American Wheat or Rye Beer', 6), ('Belgian Golden Strong Ale', 6), ('Bohemian Pilsener', 6), ('Brown Porter', 5), ('Dry Stout', 5), ('English Barleywine', 5), (u'Oktoberfest/M\xe4rzen', 5), ('Scottish Export 80/-', 4), ('Belgian Pale Ale', 4), ('Bel

In [8]:
# Fermentables.fermentables and Hops.hops appear to map an ingredient name to
# its index; invert both so an index can be turned back into a readable name.
idx_to_fermentable = dict((idx, ferm) for ferm, idx in Fermentables.fermentables.items())
print(idx_to_fermentable[2])

idx_to_hop = dict((idx, hop) for hop, idx in Hops.hops.items())
print(idx_to_hop[2])

Crystal 15
Citra (US)


In [9]:
# Build the numpy arrays used for classification: one style label and one
# feature row (Recipe.to_data()) per recipe.

subset = gRecipes
labels = numpy.array([recipe.style for recipe in subset])
data = numpy.array([recipe.to_data() for recipe in subset])

print(labels[2])
print(len(subset))

# Survey the fermentable "type" values that occur, printing the name of every
# fermentable whose type is missing so the gaps can be inspected.
# NOTE(review): ferm[0] / ferm[2] look like the name and type elements of a
# parsed fermentable row -- confirm against beer.beer.
type_set = set()
for recipe in subset:
    for ferm in recipe.fermentables.data:
        # Identity test: None is a singleton, so `is` is the correct check.
        if ferm[2].text is None:
            print(ferm[0].text)
    ferm_types = [ferm[2].text for ferm in recipe.fermentables.data]
    # Grow the set in place instead of building a new set for every recipe.
    type_set.update(ferm_types)
print(type_set)

American IPA
531
Crisp Pale Ale Malt
toasted oats
Bairds Chocolate Malt
set([None, 'Caramel/Crystal Malt', 'Kilned Malt', 'Sugar', 'Adjunct', 'Roasted Malt', 'Dry Extract', 'Grain', 'Liquid Extract', 'Base Malt'])


In [None]:
from sklearn import linear_model
from sklearn import naive_bayes
from sklearn import tree
from sklearn import svm
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
import time

In [None]:
def normalize_recipe_data(r_data):
    """Normalize ``r_data`` column by column via ``preprocessing.normalize``.

    The data is transposed before ``preprocessing.normalize`` is called and
    transposed back afterwards, so whatever ``normalize`` does to each row of
    its input is applied to each column of ``r_data`` instead.

    Args:
        r_data: 2-D array-like of recipe features (presumably one row per
            recipe, as built from ``Recipe.to_data()`` -- confirm).

    Returns:
        The normalized data, oriented the same way as ``r_data``.
    """
    by_feature = preprocessing.normalize(numpy.transpose(r_data))
    return numpy.transpose(by_feature)

In [None]:
# Candidate classifiers, keyed by display name.  model_names also fixes the
# order in which the models are fitted and reported, so the printed output
# does not depend on dict iteration order.
model_names = ['Logistic Regression', 'Naive Bayes', 'SVM SVC',
              'Decision Tree']

models = dict(zip(model_names, [
            linear_model.LogisticRegression(),
            naive_bayes.GaussianNB(),
            svm.SVC(),
            tree.DecisionTreeClassifier()
        ]))

# Number of correct test predictions per model, summed over all trials.
model_accuracy = dict.fromkeys(model_names, 0)

totalTestSamples = 0

# Seed for the train/test partition so a re-run evaluates the same split.
SPLIT_SEED = 0
# Number of independent train/test trials to accumulate accuracy over.
NUM_TRIALS = 1
# NOTE(review): the ingredient columns appear to start after 5 leading
# columns of each Recipe.to_data() row, fermentables first and hops right
# after them -- confirm against beer.beer.
FERMENTABLE_OFFSET = 5
HOP_OFFSET = FERMENTABLE_OFFSET + len(Fermentables.fermentables)

for i in range(0, NUM_TRIALS):
    # Offsetting the seed by the trial number keeps every trial's split
    # distinct while remaining repeatable.
    X_train, X_test, Y_train, Y_test = train_test_split(
        data, labels, test_size=0.2, random_state=SPLIT_SEED + i)

    # Predictions on X_test for this trial, keyed by model name.
    model_results = {}

    if i == 0:
        # Show the ingredients of the first test recipe as a worked example.
        print('Recipe\'s Malts:')
        recipe_1 = X_test[0]
        for idx in range(0, len(Fermentables.fermentables)):
            amount = recipe_1[FERMENTABLE_OFFSET + idx]
            if float(amount) > 0.0:
                print(idx_to_fermentable[idx] + ': ' + str(amount))
        print('\nRecipe\'s Hops:')
        for idx in range(0, len(Hops.hops)):
            amount = recipe_1[HOP_OFFSET + idx]
            if float(amount) > 0.0:
                print(idx_to_hop[idx] + ': ' + str(amount))
        print('')

    for name in model_names:
        model = models[name]

        start = time.clock()
        model.fit(X_train, Y_train)
        stop = time.clock()
        fit_time = stop - start

        start = time.clock()
        model_results[name] = model.predict(X_test)
        stop = time.clock()
        predict_time = stop - start
        print('%s took %f to fit, %f to predict with test size %d' % (
            name, fit_time, predict_time, len(X_test)))

    if i == 0:
        # Compare each model's prediction for the example recipe shown above.
        print('Expected Style: %s' % Y_test[0])
        for name in model_names:
            print('%s: %s' % (name, model_results[name][0]))
        print('')

    numTestSamples = len(Y_test)
    totalTestSamples += numTestSamples
    for name in model_names:
        predictions = model_results[name]
        model_accuracy[name] += sum(
            1 for expected, predicted in zip(Y_test, predictions)
            if expected == predicted)

print('There are %d total test samples' % totalTestSamples)
for name in model_names:
    print('%s: %.3f' % (name, model_accuracy[name] / float(totalTestSamples)))