Building a Recommendation System in Python
============================
> In this tutorial we'll show you how to build a recommendation system using pandas, scikit-learn, and numpy. We've provided a dataset of beer reviews which we'll use for building our product recommender, but this use case could be easily substituted with a different product.

In [2]:
import os

import numpy as np
import pandas as pd
import pylab as pl

<h2><a href="https://s3.amazonaws.com/demo-datasets/beer_reviews.tar.gz">Download the data</a></h2>
<p>Grab the dataset from our data demos bucket on S3, then decompress it. It will create a directory called ~/Downloads/beer_reviews.</p>

In [3]:
cd /Users/mpgartland1/Documents/Documents/Courses/Predictive Models/week 6/beer_reviews

/Users/mpgartland1/Documents/Documents/Courses/Predictive Models/week 6/beer_reviews


In [4]:
# Load the reviews CSV from the current working directory (set by the `cd` cell above).
# On Windows, or if you unpacked the archive elsewhere, pass the full path to the file instead.
df = pd.read_csv("beer_reviews.csv")
# Preview the first 15 rows to see which columns are available.
df.head(15)

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883
5,1075,Caldera Brewing Company,1325524659,3.0,3.5,3.5,oline73,Herbed / Spiced Beer,3.0,3.5,Caldera Ginger Beer,4.7,52159
6,1075,Caldera Brewing Company,1318991115,3.5,3.5,3.5,Reidrover,Herbed / Spiced Beer,4.0,4.0,Caldera Ginger Beer,4.7,52159
7,1075,Caldera Brewing Company,1306276018,3.0,2.5,3.5,alpinebryant,Herbed / Spiced Beer,2.0,3.5,Caldera Ginger Beer,4.7,52159
8,1075,Caldera Brewing Company,1290454503,4.0,3.0,3.5,LordAdmNelson,Herbed / Spiced Beer,3.5,4.0,Caldera Ginger Beer,4.7,52159
9,1075,Caldera Brewing Company,1285632924,4.5,3.5,5.0,augustgarage,Herbed / Spiced Beer,4.0,4.0,Caldera Ginger Beer,4.7,52159


## Finding People Who Have Reviewed Both Beers

In [5]:
beer_1, beer_2 = "Dale's Pale Ale", "Fat Tire Amber Ale"

# Distinct reviewer names for each of the two beers.
beer_1_reviewers = df[df.beer_name==beer_1].review_profilename.unique()
beer_2_reviewers = df[df.beer_name==beer_2].review_profilename.unique()
# Only reviewers who rated both beers can be compared pairwise.
common_reviewers = set(beer_1_reviewers).intersection(beer_2_reviewers)
# A parenthesised single-argument print behaves the same on Python 2 and Python 3.
print("Users in the sameset: %d" % len(common_reviewers))
# Peek at ten of them; a set has no defined order, so the sample may differ between runs.
list(common_reviewers)[:10]

Users in the sameset: 499


['womencantsail',
 'Marty30',
 'Winter',
 'Lothore',
 'bump8628',
 'gford217',
 'lackenhauser',
 'wspscott',
 'mjurney',
 'LiquidBread219']

## Extracting Reviews

In [6]:
def get_beer_reviews(beer, common_users, source=None):
    """Return one review of `beer` per user in `common_users`, ordered by reviewer name.

    Parameters
    ----------
    beer : str
        Value to match against the `beer_name` column.
    common_users : iterable of str
        Reviewer names (`review_profilename`) to keep.
    source : pandas.DataFrame, optional
        Review table to search. Defaults to the notebook-level `df`, which is
        what the original two-argument form always used.

    Returns
    -------
    pandas.DataFrame
        Matching rows sorted by `review_profilename`, with at most one row
        per reviewer.
    """
    if source is None:
        source = df
    mask = source.review_profilename.isin(common_users) & (source.beer_name == beer)
    # `DataFrame.sort` has been removed from pandas; `sort_values` replaces it.
    # A stable sort keeps each reviewer's rows in their original order, so the
    # de-duplication below always retains the same (first-listed) review.
    reviews = source[mask].sort_values('review_profilename', kind='mergesort')
    return reviews.drop_duplicates(subset='review_profilename', keep='first')
# Pull each beer's reviews, restricted to the reviewers the two beers have in common.
beer_1_reviews = get_beer_reviews(beer_1, common_reviewers)
beer_2_reviews = get_beer_reviews(beer_2, common_reviewers)

# Show only the identifying columns and the four rating columns.
cols = ['beer_name', 'review_profilename', 'review_overall', 'review_aroma', 'review_palate', 'review_taste']
beer_2_reviews[cols].head()

Unnamed: 0,beer_name,review_profilename,review_overall,review_aroma,review_palate,review_taste
202456,Fat Tire Amber Ale,ATPete,4.5,4.0,4.0,4.5
201458,Fat Tire Amber Ale,AdamBear,3.5,2.5,4.5,3.5
201886,Fat Tire Amber Ale,AlCaponeJunior,2.0,3.0,3.5,3.0
202481,Fat Tire Amber Ale,AltBock,4.0,3.0,3.0,3.0
201803,Fat Tire Amber Ale,Andreji,4.0,4.5,4.0,4.0


## Calculating Distance

In [16]:
# choose your own way to calculate distance
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.metrics.pairwise import *
from scipy.stats.stats import pearsonr


# Rating columns compared between two beers; returned distances follow this order.
ALL_FEATURES = ['review_overall', 'review_aroma', 'review_palate', 'review_taste']
def calculate_similarity(beer1, beer2):
    """Return the per-feature Manhattan (L1) distance between two beers.

    Only reviewers who rated both beers are considered. For each column in
    ALL_FEATURES the absolute rating differences of those reviewers are summed.

    Parameters
    ----------
    beer1, beer2 : str
        Beer names as they appear in the `beer_name` column of `df`.

    Returns
    -------
    list of float
        One distance per entry of ALL_FEATURES, in that order. Despite the
        function name, a larger value means the beers are *less* alike, and
        because it is a raw sum it grows with the number of shared reviewers.
    """
    # find common reviewers
    beer_1_reviewers = df[df.beer_name==beer1].review_profilename.unique()
    beer_2_reviewers = df[df.beer_name==beer2].review_profilename.unique()
    common_reviewers = set(beer_1_reviewers).intersection(beer_2_reviewers)

    # get reviews: one row per common reviewer, both sorted by reviewer name,
    # so row i of each frame belongs to the same person
    beer_1_reviews = get_beer_reviews(beer1, common_reviewers)
    beer_2_reviews = get_beer_reviews(beer2, common_reviewers)

    dists = []
    for f in ALL_FEATURES:
        # Current scikit-learn rejects 1-D input, so each rating column is
        # passed explicitly as a single sample (one row, one value per reviewer).
        # `.values` drops the pandas index so ratings are paired by position.
        ratings_1 = beer_1_reviews[f].values.reshape(1, -1)
        ratings_2 = beer_2_reviews[f].values.reshape(1, -1)
        # To experiment with another metric, swap in e.g. euclidean_distances here.
        dists.append(float(manhattan_distances(ratings_1, ratings_2)[0][0]))

    return dists

# Distances for the two demo beers: one value per rating column, in ALL_FEATURES order.
calculate_similarity(beer_1, beer_2)

[292.0, 307.5, 272.5, 308.5]

## Calculate the Similarity for a Set of Beers

In [18]:
# calculate only a subset for the demo: every beer below is compared with every
# other one, so the number of comparisons grows quadratically with this list
beers = ["Dale's Pale Ale", "Sierra Nevada Pale Ale", "Michelob Ultra",
        "Natural Light", "Bud Light", "Fat Tire Amber Ale", "Coors Light",
         "Blue Moon Belgian White", "60 Minute IPA", "Guinness Draught", "Old Rasputin Russian Imperial Stout",
         "90 Minute IPA","Sierra Nevada Celebration Ale","Two Hearted Ale","Arrogant Bastard Ale","Pliny The Elder",
         "Sierra Nevada Bigfoot Barleywine Style Ale","La Fin Du Monde","Trappistes Rochefort 10","Ayinger Celebrator Doppelbock",
         "St. Bernardus Abt 12","Imperial Stout", "Samuel Adams Boston Lager","Duvel","Dead Guy Ale","Orval Trappist Ale",
         "Weihenstephaner Hefeweissbier", "Budweiser","Samuel Smith's Oatmeal Stout","Samuel Adams Octoberfest"
         ]

# calculate everything for real production (uncomment to use every beer in the dataset)
# beers = df.beer_name.unique()

# One row per ordered pair: [beer1, beer2, <one distance per rating feature>].
simple_distances = []
# The distance does not depend on argument order (same shared reviewers, absolute
# differences), so each unordered pair is computed once and reused for its mirror row.
pair_cache = {}
for beer1 in beers:
    # %-formatting keeps the progress line identical on Python 2 and Python 3.
    print("starting %s" % beer1)
    for beer2 in beers:
        if beer1 == beer2:
            continue
        pair = frozenset((beer1, beer2))
        if pair not in pair_cache:
            pair_cache[pair] = calculate_similarity(beer1, beer2)
        simple_distances.append([beer1, beer2] + pair_cache[pair])

starting Dale's Pale Ale
starting Sierra Nevada Pale Ale
starting Michelob Ultra
starting Natural Light
starting Bud Light
starting Fat Tire Amber Ale
starting Coors Light
starting Blue Moon Belgian White
starting 60 Minute IPA
starting Guinness Draught
starting Old Rasputin Russian Imperial Stout
starting 90 Minute IPA
starting Sierra Nevada Celebration Ale
starting Two Hearted Ale
starting Arrogant Bastard Ale
starting Pliny The Elder
starting Sierra Nevada Bigfoot Barleywine Style Ale
starting La Fin Du Monde
starting Trappistes Rochefort 10
starting Ayinger Celebrator Doppelbock
starting St. Bernardus Abt 12
starting Imperial Stout
starting Samuel Adams Boston Lager
starting Duvel
starting Dead Guy Ale
starting Orval Trappist Ale
starting Weihenstephaner Hefeweissbier
starting Budweiser
starting Samuel Smith's Oatmeal Stout
starting Samuel Adams Octoberfest


## Inspect the Results

In [1]:
# Column names: the two beer names followed by one distance per rating feature.
cols = ["beer1", "beer2", "overall_dist", "aroma_dist", "palate_dist", "taste_dist"]
# Rebinds `simple_distances` from the list of rows built above to a DataFrame;
# the cells below access it as a DataFrame (e.g. `.beer1`, `.beer2`).
# Requires `pd` from the import cell, so run that cell first on a fresh kernel.
simple_distances = pd.DataFrame(simple_distances, columns=cols)
simple_distances.tail()

NameError: name 'pd' is not defined

## Allow the User to Customize the Weights

In [21]:
def calc_distance(dists, beer1, beer2, weights):
    """Return the weighted sum of the per-feature distances for one beer pair.

    Parameters
    ----------
    dists : pandas.DataFrame
        Table with columns `beer1`, `beer2`, `overall_dist`, `aroma_dist`,
        `palate_dist` and `taste_dist`.
    beer1, beer2 : str
        The ordered pair to look up (`beer1` column first, `beer2` second).
    weights : sequence of 4 numbers
        Multipliers for overall, aroma, palate and taste, in that order.

    Returns
    -------
    float
        sum(weight * distance) over the four features of the first matching row.

    Raises
    ------
    IndexError
        If `dists` has no row for the requested pair.
    """
    feature_cols = ['overall_dist', 'aroma_dist', 'palate_dist', 'taste_dist']
    mask = (dists.beer1==beer1) & (dists.beer2==beer2)
    matches = dists.loc[mask, feature_cols]
    if matches.empty:
        # Same exception type as the bare `[0]` lookup used to raise, but the
        # message now names the pair that could not be found.
        raise IndexError("no distances recorded for beers %r and %r" % (beer1, beer2))
    # Multiplying the row by the weight list pairs values up by position.
    weighted = matches.iloc[0] * weights
    return float(weighted.sum())

# Weights apply to overall, aroma, palate and taste in that order:
# here the overall score counts double.
weights = [2, 1, 1, 1]
# Parenthesised single-argument print behaves the same on Python 2 and Python 3.
print(calc_distance(simple_distances, "Fat Tire Amber Ale", "Dale's Pale Ale", weights))
print(calc_distance(simple_distances, "Fat Tire Amber Ale", "Michelob Ultra", weights))

1472.5
2146.5


## Find Similar Beers for Fat Tire Amber Ale

In [22]:
# Rank every other beer in the demo list by its weighted distance from `my_beer`.
my_beer = "Fat Tire Amber Ale"
results = [
    (my_beer, other, calc_distance(simple_distances, my_beer, other, weights))
    for other in beers
    if other != my_beer
]
# Smallest distance first; show the four closest matches.
closest_first = sorted(results, key=lambda entry: entry[2])
closest_first[0:4]

[('Fat Tire Amber Ale', "Dale's Pale Ale", 1472.5),
 ('Fat Tire Amber Ale', 'Natural Light', 1516.5),
 ('Fat Tire Amber Ale', 'Blue Moon Belgian White', 1629.0),
 ('Fat Tire Amber Ale', 'Samuel Adams Octoberfest', 1631.5)]

## Wrap it in Yhat

In [23]:
from yhat import Yhat, BaseModel

class BeerRec(BaseModel):
    """Yhat model that ranks the other beers by weighted distance from a requested beer.

    The methods read `self.beers` and `self.simple_distances`.
    NOTE(review): presumably BaseModel stores the keyword arguments it is
    constructed with as attributes - confirm against the yhat client.
    """

    def transform(self, raw_data):
        """Extract the beer name and normalised weights from a request payload.

        `raw_data` must contain the key 'beer'; 'weights' is optional and
        defaults to equal weighting of the four features.

        Returns a `(beer, weights)` tuple whose weights sum to 1.0.
        Raises KeyError if 'beer' is missing and ZeroDivisionError if the
        weights sum to zero.
        """
        beer = raw_data['beer']
        weights = raw_data.get("weights", [1, 1, 1, 1])
        # normalize the weights so they sum to 1.0 (total computed once, not per element)
        total = sum(weights)
        if weights and total == 0:
            # Same exception type the bare division raised, with an actionable message.
            raise ZeroDivisionError("weights must not sum to zero")
        weights = [float(w) / total for w in weights]
        return (beer, weights)

    def predict(self, data):
        """Return `(beer, other_beer, distance)` tuples sorted by ascending distance.

        `data` is the `(beer, weights)` tuple produced by `transform`.
        """
        beer, weights = data
        results = [
            (beer, beer_cmp, calc_distance(self.simple_distances, beer, beer_cmp, weights))
            for beer_cmp in self.beers
            if beer != beer_cmp
        ]
        return sorted(results, key=lambda x: x[2])

## Deploy to Yhat

In [24]:
# Credentials are read from the environment so they are never saved with the notebook.
# Set YHAT_USERNAME and YHAT_APIKEY before running this cell. The key that was
# previously written here in plain text should be treated as exposed and rotated.
yh = Yhat(os.environ["YHAT_USERNAME"], os.environ["YHAT_APIKEY"], "http://cloud.yhathq.com/")
# NOTE(review): `br` is built here but the deploy call below passes the class, not this instance - confirm which is intended.
br = BeerRec(simple_distances=simple_distances, beers=beers,
             udfs=[calc_distance])
yh.deploy("PydataBeerRec3", BeerRec, globals())

Are you sure you want to deploy? (y/N): y


{u'lang': u'python',
 u'message': u'Your model has been uploaded and is currently being built. You can check the status by logging into your Yhat account and viewing: /model/PydataBeerRec3/',
 u'model_endpoint': u'/myles.gartland@rockhurst.edu/models/PydataBeerRec3/',
 u'modelname': u'PydataBeerRec3',
 u'status': u'success',
 u'timestamp': u'20141126032237',
 u'version': 5}

## Test it Out

In [23]:
# Ask the deployed model for beers similar to Coors Light, with all four features weighted equally.
# NOTE(review): the deploy cell publishes "PydataBeerRec3", but this call targets "PydataBeerRec"
# with 1 as its second argument (presumably the model version) - confirm which model is intended.
yh.predict("PydataBeerRec", 1, {"beer": "Coors Light", "weights": [1, 1, 1, 1]})

HTTPError: HTTP Error 400: Bad Request

In [25]:
# Same request with custom weights: overall counts double, aroma once, palate and taste are ignored.
# NOTE(review): targets "PydataBeerRec", not the "PydataBeerRec3" name used in the deploy cell - confirm.
yh.predict("PydataBeerRec", 1, {"beer": "Coors Light", "weights": [2, 1, 0, 0]})

[[u'Coors Light', u'Natural Light', 14.79369],
 [u'Coors Light', u'Michelob Ultra', 15.81099],
 [u'Coors Light', u'Bud Light', 21.75517],
 [u'Coors Light', u'Blue Moon Belgian White', 34.41245],
 [u'Coors Light', u'Fat Tire Amber Ale', 35.19777],
 [u'Coors Light', u"Dale's Pale Ale", 35.41338],
 [u'Coors Light', u'Guinness Draught', 40.43252],
 [u'Coors Light', u'60 Minute IPA', 45.5498],
 [u'Coors Light', u'Sierra Nevada Pale Ale', 49.73314]]